def main():
    main_params = convert_squad.get_main_params()

    model_name = C.LM_QUESTION_ANSWERS_REVIEWS
    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name

    cat_files(
        params[C.CATEGORY],
        main_params.mode,
        main_params.max_review_len,
        main_params.max_num_spans,
        main_params.max_num_products,
        main_params.seed,
        main_params.num_processes,
    )
def main():
    answer_span_lens = range(2, 10)
    main_params = get_main_params()
    seed = main_params.seed
    np.random.seed(seed)
    max_review_len = main_params.max_review_len
    max_num_spans = main_params.max_num_spans
    max_num_products = main_params.max_num_products

    model_name = C.LM_QUESTION_ANSWERS_REVIEWS
    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name

    # Create the temp directory before opening any log files inside it
    if not os.path.exists(TEMPFILEPATH):
        os.makedirs(TEMPFILEPATH)

    # Per-process log file, truncated at the start of the run
    logfilename = '%s/%d.log' % (TEMPFILEPATH, main_params.process_idx)
    with open(logfilename, 'w') as fp:
        fp.write('')

    def log(line):
        with open(logfilename, 'a') as fp:
            fp.write(line + '\n')

    params[C.REVIEW_SELECT_MODE] = C.BM25
    dataset = AmazonDataset(params)
    path = {
        C.TRAIN_TYPE: dataset.train_path,
        C.DEV_TYPE: dataset.val_path,
        C.TEST_TYPE: dataset.test_path,
    }[main_params.mode]

    dataset.save_data(
        main_params.process_idx,
        main_params.num_processes,
        max_num_products,
        path,
        max_review_len,
        answer_span_lens,
        max_num_spans,
        log,
        process_filepath(
            params[C.CATEGORY],
            main_params.mode,
            max_review_len,
            max_num_spans,
            seed,
            main_params.process_idx,
        ),
    )

    with open('%s/all_processes.log' % TEMPFILEPATH, 'a') as fp:
        fp.write('Finished process: %d / %d\n'
                 % (main_params.process_idx, main_params.num_processes))
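# Illustrative launcher (not part of the original code): save_data above shards
# its work by (process_idx, num_processes), and each shard appends to
# all_processes.log when it finishes, so data generation is typically run as
# several parallel invocations of the script. The script name
# 'generate_data.py' and the flag names below are assumptions inferred from
# the main_params attributes used above.
import subprocess

NUM_PROCESSES = 8
procs = [
    subprocess.Popen([
        'python', 'generate_data.py',
        '--mode', 'train',
        '--process_idx', str(idx),
        '--num_processes', str(NUM_PROCESSES),
    ])
    for idx in range(NUM_PROCESSES)
]
for proc in procs:
    proc.wait()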
def main():
    model_name = C.LM_QUESTION_ANSWERS_REVIEWS
    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name

    dataset = AmazonDataset(params)
    path = dataset.test_path
    assert os.path.exists(path)

    with open(path, 'rb') as f:
        dataFrame = pd.read_pickle(f)
    if DEBUG:
        dataFrame = dataFrame.iloc[:5]

    q_counts = []
    for _, row in dataFrame.iterrows():
        q_counts.append(len(row[C.QUESTIONS_LIST]))
    print(np.mean(q_counts), np.std(q_counts), len(q_counts))
def main():
    seed = 1
    if MTURK:
        num_entries = 10000
    else:
        num_entries = 400
    max_review_len = 50
    typestr = 'mturk_' if MTURK else ''
    np.random.seed(seed)

    model_name = C.LM_QUESTION_ANSWERS_REVIEWS
    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name
    params[C.REVIEW_SELECT_MODE] = C.BM25

    dataset = AmazonDataset(params)
    path = dataset.test_path
    dataset.save_data(
        path,
        num_entries,
        max_review_len=max_review_len,
        filename='is_answerable_%s_%ssamples_%d_%d_%d.csv' % (
            params[C.CATEGORY], typestr, num_entries, max_review_len, seed
        ),
    )
def main():
    _set_random_seeds(RANDOM_SEED)
    args = config.get_main_params()
    model_name, mode = args.model_name, args.mode
    save_dir = args.save_dir
    resume, epoch = args.resume, args.epoch

    if args.resume:
        assert mode == C.TRAIN_TYPE
        assert epoch >= 0

    params = config.get_model_params(model_name)
    params[C.MODEL_NAME] = model_name

    # Instantiate saver and a logger in save_dir.
    # If save_dir is passed in from the command line,
    # params are loaded from save_dir.
    # The logger is instantiated inside the saver.
    saver = Saver(save_dir, params)
    logger = saver.logger
    params = saver.params

    # If save_dir is passed, model_name is taken from the saved params.
    model_name = params[C.MODEL_NAME]
    logger.log('SaveDir: %s' % saver.save_dir)

    if mode == C.TRAIN_TYPE:
        logger.log('\nLoading dataset..')
        dataset = AmazonDataset(params, mode)
        logger.log('\n Model: %s, Mode = %s \n' % (model_name, mode))

        logger.log('\nLoading dataloader..')
        if CACHE_DATASET:
            train_loader = pickle.load(open(model_name + 'train.pickle', 'rb'))
            dev_loader = pickle.load(open(model_name + 'dev.pickle', 'rb'))
        else:
            train_loader = AmazonDataLoader(dataset.train, model_name, params[C.BATCH_SIZE])
            dev_loader = AmazonDataLoader(dataset.val, model_name, params[C.BATCH_SIZE])
            pickle.dump(train_loader, open(model_name + 'train.pickle', 'wb'))
            pickle.dump(dev_loader, open(model_name + 'dev.pickle', 'wb'))

        logger.log('\nInstantiating training..')
        trainer = Trainer(
            train_loader,
            params,
            dev_loader=dev_loader,
            vocab=dataset.vocab,
            saver=saver,
            resume_training=resume,
            resume_epoch=epoch if resume else None,
        )
        trainer.train()

    elif mode in [C.DEV_TYPE, C.TEST_TYPE]:
        logger.log('\nBeginning evaluation ..\n')

        # Load saved params and vocab
        # output_file = args.output_file
        output_file = './saved/output_test.json'
        logger.log('Loading vocab..')
        vocab = saver.load_vocab()
        model_name = params[C.MODEL_NAME]

        dataset = AmazonDataset(params, mode)
        # TODO: the next line is a temporary change only.
        dataset_typed = dataset.test
        # dataset_typed = dataset.val if mode == C.DEV_TYPE else dataset.test
        loader = AmazonDataLoader(dataset_typed, model_name, params[C.BATCH_SIZE])

        # Load model
        logger.log('Loading saved model..')
        model = Seq2Seq(
            vocab.get_vocab_size(),
            hsizes(params, model_name),
            params,
        )
        saver.load_model(epoch, model)

        # Instantiate trainer with the saved model
        logger.log('Instantiating trainer..')
        trainer = Trainer(None, params, dev_loader=loader, saver=saver, vocab=vocab)
        logger.log('Adding model to trainer..')
        trainer.model = model

        # Evaluation on the dev/test set
        logger.log('Total number of [%s] batches: %d' % (mode.upper(), len(list(loader))))
        trainer.eval(loader, mode, output_filename=output_file)
        logger.log('\nCompleted Evaluation..\n')
    else:
        raise ValueError('Unimplemented mode: %s' % mode)
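# _set_random_seeds is called above but not shown here; this is a minimal
# sketch of such a helper, assuming it seeds the Python, NumPy, and PyTorch
# RNGs (the contents of the original helper may differ).
import random

import numpy as np
import torch


def _set_random_seeds(seed):
    # Seed every RNG source used during training for reproducibility.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)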
P.generate_split_data(category)


if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser(
        description="Test AmazonDataset and AmazonDataLoader")
    parser.add_argument("--model_name", type=str, default='LM_A')
    parser.add_argument("--category", type=str, default='Dummy')
    parser.add_argument("--max_question_len", type=int, default=100)
    parser.add_argument("--max_answer_len", type=int, default=200)
    parser.add_argument("--max_review_len", type=int, default=300)
    args, _ = parser.parse_known_args()

    model_name = args.model_name
    params = config.get_model_params(model_name)
    params[C.CATEGORY] = args.category

    # preprocess_data(params[C.CATEGORY])
    dataset = AmazonDataset(params)
    answersDict, questionsDict, questionAnswersDict, reviewsDict, data = dataset.test
    print(answersDict)
    print(questionsDict)
    print(questionAnswersDict)

    test_loader = AmazonDataLoader(dataset.test, model_name, params[C.BATCH_SIZE])
    # print_dataframe(params[C.CATEGORY], 'test')

    for batch_itr, inputs in enumerate(tqdm(test_loader)):
        # Iterate through the batches to exercise the loader.
        pass
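# Example invocation of the loader test above, using the flags it declares
# (the script filename 'test_loader.py' is an assumption):
#
#     python test_loader.py --model_name LM_A --category Dummy --max_review_len 300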