Code Example #1
def main():
    main_params = convert_squad.get_main_params()
    model_name = C.LM_QUESTION_ANSWERS_REVIEWS
    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name

    cat_files(params[C.CATEGORY], main_params.mode, main_params.max_review_len,
              main_params.max_num_spans, main_params.max_num_products,
              main_params.seed, main_params.num_processes)
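The excerpt above reads a handful of fields off the namespace returned by convert_squad.get_main_params(); the parser itself is not shown in these listings. Purely as an illustration, a stand-in parser exposing the fields the excerpts use (mode, max_review_len, max_num_spans, max_num_products, seed, num_processes, process_idx) might look like the sketch below; the flag names and defaults are assumptions, not the project's real CLI.

import argparse

def get_main_params():
    # Hypothetical stand-in for convert_squad.get_main_params(): only the
    # attribute names are taken from the excerpts above; the flags and
    # defaults are assumptions.
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--max_review_len', type=int, default=50)
    parser.add_argument('--max_num_spans', type=int, default=10)
    parser.add_argument('--max_num_products', type=int, default=1000)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--num_processes', type=int, default=1)
    parser.add_argument('--process_idx', type=int, default=0)
    return parser.parse_args()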
Code Example #2
def main():
    answer_span_lens = range(2, 10)

    main_params = get_main_params()
    seed = main_params.seed
    np.random.seed(seed)

    max_review_len = main_params.max_review_len
    max_num_spans = main_params.max_num_spans
    max_num_products = main_params.max_num_products

    model_name = C.LM_QUESTION_ANSWERS_REVIEWS
    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name

    # Make sure the temp directory exists before any per-process log files
    # are created in it.
    if not os.path.exists(TEMPFILEPATH):
        os.makedirs(TEMPFILEPATH)

    logfilename = '%s/%d.log' % (TEMPFILEPATH, main_params.process_idx)
    with open(logfilename, 'w') as fp:
        fp.write('')

    def log(line):
        with open(logfilename, 'a') as fp:
            fp.write(line + '\n')

    params[C.REVIEW_SELECT_MODE] = C.BM25
    dataset = AmazonDataset(params)
    path = {
        C.TRAIN_TYPE: dataset.train_path,
        C.DEV_TYPE: dataset.val_path,
        C.TEST_TYPE: dataset.test_path,
    }[main_params.mode]

    dataset.save_data(
        main_params.process_idx,
        main_params.num_processes,
        max_num_products,
        path,
        max_review_len,
        answer_span_lens,
        max_num_spans,
        log,
        process_filepath(
            params[C.CATEGORY],
            main_params.mode,
            max_review_len,
            max_num_spans,
            seed,
            main_params.process_idx,
        ),
    )
    with open('%s/all_processes.log' % TEMPFILEPATH, 'a') as fp:
        fp.write('Finished process: %d / %d\n' %
                 (main_params.process_idx, main_params.num_processes))
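Each run of the excerpt above writes one shard, identified by main_params.process_idx, plus a per-process log file under TEMPFILEPATH. How the shards are launched is not shown in these listings; a purely illustrative driver (the script name and flags are hypothetical, matching the stand-in parser sketched earlier) could look like this:

import subprocess

NUM_PROCESSES = 4  # illustrative value

# Launch one worker per shard and wait for all of them to finish.
procs = [
    subprocess.Popen([
        'python', 'create_spans.py',  # hypothetical script name
        '--process_idx', str(idx),
        '--num_processes', str(NUM_PROCESSES),
    ])
    for idx in range(NUM_PROCESSES)
]
for p in procs:
    p.wait()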
Code Example #3
def main():
    model_name = C.LM_QUESTION_ANSWERS_REVIEWS
    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name

    dataset = AmazonDataset(params)
    path = dataset.test_path

    assert os.path.exists(path)

    with open(path, 'rb') as f:
        dataFrame = pd.read_pickle(f)
        if DEBUG:
            dataFrame = dataFrame.iloc[:5]

    q_counts = []
    for (_, row) in dataFrame.iterrows():
        q_counts.append(len(row[C.QUESTIONS_LIST]))

    print(np.mean(q_counts), np.std(q_counts), len(q_counts))
Code Example #4
def main():
    seed = 1
    if MTURK:
        num_entries = 10000
    else:
        num_entries = 400
    max_review_len = 50
    typestr = 'mturk_' if MTURK else ''
    np.random.seed(seed)

    model_name = C.LM_QUESTION_ANSWERS_REVIEWS
    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name
    params[C.REVIEW_SELECT_MODE] = C.BM25

    dataset = AmazonDataset(params)
    path = dataset.test_path
    dataset.save_data(
        path,
        num_entries,
        max_review_len=max_review_len,
        filename='is_answerable_%s_%ssamples_%d_%d_%d.csv' % (
            params[C.CATEGORY], typestr, num_entries, max_review_len, seed),
    )
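All of the excerpts above repeat the same setup before touching any data: fetch the model's hyperparameters with config.get_model_params, write the model name back into the params dict, and build an AmazonDataset from it. A minimal sketch of that shared pattern, assuming C, config, and AmazonDataset are importable exactly as in the excerpts (the seed and the BM25 review-selection choice are only illustrative):

import numpy as np

def build_dataset(model_name=C.LM_QUESTION_ANSWERS_REVIEWS, seed=1):
    # Shared setup used by the excerpts above (illustrative sketch only).
    np.random.seed(seed)
    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name
    # BM25-based review selection, as in the second and fourth examples.
    params[C.REVIEW_SELECT_MODE] = C.BM25
    return params, AmazonDataset(params)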
Code Example #5
File: main.py  Project: hhhhzy/amazonqa
def main():
    _set_random_seeds(RANDOM_SEED)
    args = config.get_main_params()
    model_name, mode = args.model_name, args.mode
    save_dir = args.save_dir

    resume, epoch = args.resume, args.epoch

    if args.resume:
        assert mode == C.TRAIN_TYPE
        assert epoch >= 0

    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name

    # Instantiate a saver in save_dir; the logger is created inside the saver.
    # If save_dir is passed on the command line, params are loaded from
    # that directory instead of the ones built above.
    saver = Saver(save_dir, params)
    logger = saver.logger
    params = saver.params

    # If save_dir is passed, model_name is taken from the saved params.
    model_name = params[C.MODEL_NAME]
    logger.log('SaveDir: %s' % saver.save_dir)

    if mode == C.TRAIN_TYPE:
        logger.log('\nLoading dataset..')
        dataset = AmazonDataset(params, mode)
        logger.log('\n Model: %s, Mode = %s \n' % (model_name, mode))
        logger.log('\nLoading dataloader..')

        if CACHE_DATASET:
            with open(model_name + 'train.pickle', 'rb') as f:
                train_loader = pickle.load(f)
            with open(model_name + 'dev.pickle', 'rb') as f:
                dev_loader = pickle.load(f)
        else:
            train_loader = AmazonDataLoader(dataset.train, model_name,
                                            params[C.BATCH_SIZE])
            dev_loader = AmazonDataLoader(dataset.val, model_name,
                                          params[C.BATCH_SIZE])
            with open(model_name + 'train.pickle', 'wb') as f:
                pickle.dump(train_loader, f)
            with open(model_name + 'dev.pickle', 'wb') as f:
                pickle.dump(dev_loader, f)

        logger.log('\nInstantiating training..')
        trainer = Trainer(train_loader,
                          params,
                          dev_loader=dev_loader,
                          vocab=dataset.vocab,
                          saver=saver,
                          resume_training=resume,
                          resume_epoch=epoch if resume else None)
        trainer.train()

    elif mode in [C.DEV_TYPE, C.TEST_TYPE]:
        logger.log('\nBeginning evaluation ..\n')

        # Load saved params and vocabs
        #         output_file = args.output_file
        output_file = "./saved/output_test.json"
        logger.log('Loading vocab..')
        vocab = saver.load_vocab()
        model_name = params[C.MODEL_NAME]
        dataset = AmazonDataset(params, mode)
        #TODO: next line is a temporary change only.
        dataset_typed = dataset.test
        #dataset_typed = dataset.val if mode == C.DEV_TYPE else dataset.test
        loader = AmazonDataLoader(dataset_typed, model_name,
                                  params[C.BATCH_SIZE])

        # Load model
        logger.log('Loading saved model..')
        model = Seq2Seq(
            vocab.get_vocab_size(),
            hsizes(params, model_name),
            params,
        )
        saver.load_model(epoch, model)

        # Instantiate trainer with saved model
        logger.log('Instantiating trainer..')
        trainer = Trainer(None,
                          params,
                          dev_loader=loader,
                          saver=saver,
                          vocab=vocab)
        logger.log('Adding model to trainer..')
        trainer.model = model

        # Evaluation on test set
        logger.log('Total number of [%s] batches: %d' %
                   (mode.upper(), len(list(loader))))
        trainer.eval(loader, mode, output_filename=output_file)

        logger.log('\nCompleted Evaluation..\n')
    else:
        raise ValueError('Unimplemented mode: %s' % mode)
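The entry point above seeds the random number generators via _set_random_seeds(RANDOM_SEED) before parsing arguments; that helper is defined elsewhere in the file. A minimal sketch of what such a helper typically does, assuming the usual random/NumPy generators plus PyTorch (the torch calls are an assumption; the excerpt does not show them):

import random

import numpy as np
import torch

def _set_random_seeds(seed):
    # Illustrative sketch only; the project's real helper may differ.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)  # assumption: the Seq2Seq model is PyTorch-based
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)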
Code Example #6
File: test_data_preprocess.py  Project: yyht/amazonqa
    P.generate_split_data(category)


if __name__ == "__main__":
    # parse arguments
    parser = argparse.ArgumentParser(
        description="Test AmazonDataset and AmazonDataLoader")
    parser.add_argument("--model_name", type=str, default='LM_A')
    parser.add_argument("--category", type=str, default='Dummy')
    parser.add_argument("--max_question_len", type=int, default=100)
    parser.add_argument("--max_answer_len", type=int, default=200)
    parser.add_argument("--max_review_len", type=int, default=300)
    args, _ = parser.parse_known_args()

    model_name = args.model_name
    params = config.get_model_params(model_name)
    params[C.CATEGORY] = args.category

    #preprocess_data(params[C.CATEGORY])

    dataset = AmazonDataset(params)
    answersDict, questionsDict, questionAnswersDict, reviewsDict, data = dataset.test
    print(answersDict)
    print(questionsDict)
    print(questionAnswersDict)

    test_loader = AmazonDataLoader(dataset.test, model_name,
                                   params[C.BATCH_SIZE])
    #print_dataframe(params[C.CATEGORY], 'test')

    for batch_itr, inputs in enumerate(tqdm(test_loader)):
        # Loop body not included in this excerpt; see the sketch below.
        pass
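The excerpt stops at the batch loop. A hedged sketch of a simple smoke test over the loader, assuming nothing about the batch contents beyond what the loop header shows (the body below is illustrative, not the project's original code):

num_batches = 0
for batch_itr, inputs in enumerate(tqdm(test_loader)):
    # Inspect the first batch so batching problems surface immediately.
    if batch_itr == 0:
        print(type(inputs))
    num_batches += 1
print('Iterated over %d test batches' % num_batches)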