def load_data_and_vocab(opt, load_train=True):
    # load vocab
    word2idx, idx2word, vocab = load_vocab(opt)

    # construct data loaders
    logging.info("Loading train and validate data from '%s'" % opt.data)
    if load_train:
        # load the one2many train and validation datasets
        # note: torch.load takes no file-mode argument; the stray 'wb' has been dropped
        if not opt.custom_data_filename_suffix:
            train_one2many = torch.load(opt.data + '/train.one2many.pt')
        else:
            train_one2many = torch.load(opt.data + '/train.one2many.%s.pt' % opt.data_filename_suffix)
        train_one2many_dataset = KeyphraseDataset(train_one2many, word2idx=word2idx, idx2word=idx2word,
                                                  delimiter_type=opt.delimiter_type, load_train=load_train,
                                                  remove_src_eos=opt.remove_src_eos)
        train_loader = DataLoader(dataset=train_one2many_dataset,
                                  collate_fn=train_one2many_dataset.collate_fn_one2many,
                                  num_workers=opt.batch_workers, batch_size=opt.batch_size,
                                  pin_memory=False, shuffle=True)
        logging.info('#(train data size): #(batch)=%d' % len(train_loader))

        if not opt.custom_data_filename_suffix:
            valid_one2many = torch.load(opt.data + '/valid.one2many.pt')
        else:
            valid_one2many = torch.load(opt.data + '/valid.one2many.%s.pt' % opt.data_filename_suffix)
        valid_one2many_dataset = KeyphraseDataset(valid_one2many, word2idx=word2idx, idx2word=idx2word,
                                                  delimiter_type=opt.delimiter_type, load_train=load_train,
                                                  remove_src_eos=opt.remove_src_eos)
        valid_loader = DataLoader(dataset=valid_one2many_dataset,
                                  collate_fn=valid_one2many_dataset.collate_fn_one2many,
                                  num_workers=opt.batch_workers, batch_size=opt.batch_size,
                                  pin_memory=False, shuffle=False)
        logging.info('#(valid data size): #(batch)=%d' % len(valid_loader))
        return train_loader, valid_loader, word2idx, idx2word, vocab
    else:
        # load the one2many test dataset
        if not opt.custom_data_filename_suffix:
            test_one2many = torch.load(opt.data + '/test.one2many.pt')
        else:
            test_one2many = torch.load(opt.data + '/test.one2many.%s.pt' % opt.data_filename_suffix)
        test_one2many_dataset = KeyphraseDataset(test_one2many, word2idx=word2idx, idx2word=idx2word,
                                                 delimiter_type=opt.delimiter_type, load_train=load_train,
                                                 remove_src_eos=opt.remove_src_eos)
        test_loader = DataLoader(dataset=test_one2many_dataset,
                                 collate_fn=test_one2many_dataset.collate_fn_one2many,
                                 num_workers=opt.batch_workers, batch_size=opt.batch_size,
                                 pin_memory=True, shuffle=False)
        logging.info('#(test data size): #(batch)=%d' % len(test_loader))
        return test_loader, word2idx, idx2word, vocab
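
# A minimal usage sketch (an assumption, not part of the original code): building
# the option namespace that load_data_and_vocab() reads and fetching the loaders.
# The attribute names mirror those accessed above; the values are illustrative,
# and load_vocab(opt) may read further attributes (e.g. a vocab path) not shown.
def _example_load_data_and_vocab():  # hypothetical helper for illustration
    from argparse import Namespace
    opt = Namespace(
        data='data/kp20k',                  # directory with the *.one2many.pt files (assumed layout)
        custom_data_filename_suffix=False,  # use the default file names
        data_filename_suffix='',
        delimiter_type=0,
        remove_src_eos=False,
        batch_workers=4,
        batch_size=32,
    )
    train_loader, valid_loader, word2idx, idx2word, vocab = load_data_and_vocab(opt, load_train=True)
    return train_loader, valid_loader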
def process(self, input_str, top_n=8):
    one2one, one2many = self.preprocess_input(input_str)
    pin_memory = torch.cuda.is_available()
    testset_name = 'kp20k'
    logger.info("Loading test dataset %s" % testset_name)
    # wrap the preprocessed one2many examples in a dataset and a beam-search loader
    test_one2many_dataset = KeyphraseDataset(one2many,
                                             word2id=self.model_opts.word2id,
                                             id2word=self.model_opts.id2word,
                                             type='one2many', include_original=True)
    test_one2many_loader = KeyphraseDataLoader(
        dataset=test_one2many_dataset,
        collate_fn=test_one2many_dataset.collate_fn_one2many,
        num_workers=self.model_opts.batch_workers,
        max_batch_example=self.model_opts.beam_search_batch_example,
        max_batch_pair=self.model_opts.beam_search_batch_size,
        pin_memory=pin_memory,
        shuffle=False)
    logger.info('Evaluating %s' % testset_name)
    output = predict_beam_search(
        self.generator, test_one2many_loader, self.model_opts,
        title='test_%s' % testset_name, predict_save_path=None)
    return output[:top_n]
def load_vocab_and_testsets(opt):
    logger.info("Loading vocab from disk: %s" % opt.vocab)
    word2id, id2word, vocab = torch.load(opt.vocab)
    opt.word2id = word2id
    opt.id2word = id2word
    opt.vocab = vocab
    if not opt.decode_old:
        opt.vocab_size = len(word2id)
    logger.info('#(vocab)=%d' % len(word2id))
    logger.info('#(vocab used)=%d' % opt.vocab_size)

    pin_memory = torch.cuda.is_available() and opt.useGpu
    test_one2many_loaders = []

    for testset_name in opt.test_dataset_names:
        logger.info("Loading test dataset %s" % testset_name)
        testset_path = os.path.join(opt.test_dataset_root_path, testset_name,
                                    testset_name + '.test.one2many.pt')
        test_one2many = torch.load(testset_path)
        test_one2many_dataset = KeyphraseDataset(test_one2many, word2id=word2id, id2word=id2word,
                                                 type='one2many', include_original=True)
        test_one2many_loader = KeyphraseDataLoader(
            dataset=test_one2many_dataset,
            collate_fn=(test_one2many_dataset.collate_fn_one2many
                        if opt.useCLF else
                        test_one2many_dataset.collate_fn_one2many_noBeginEnd),
            num_workers=opt.batch_workers,
            max_batch_example=opt.beam_search_batch_example,
            max_batch_pair=opt.beam_search_batch_size,
            pin_memory=pin_memory,
            shuffle=False)
        test_one2many_loaders.append(test_one2many_loader)
        logger.info('#(test data size): #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d'
                    % (len(test_one2many_loader.dataset), test_one2many_loader.one2one_number(),
                       len(test_one2many_loader)))
        logger.info('*' * 50)

    return test_one2many_loaders, word2id, id2word, vocab
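
# A minimal sketch (assumed option values, not from the original code) of driving
# load_vocab_and_testsets(): one loader per named test set, returned in the same
# order as opt.test_dataset_names.
def _example_load_testsets():  # hypothetical helper for illustration
    from argparse import Namespace
    opt = Namespace(
        vocab='data/kp20k/vocab.pt',         # assumed path to the saved (word2id, id2word, vocab) tuple
        decode_old=False,
        useGpu=True,
        useCLF=True,                         # select the collate_fn that keeps begin/end tokens
        test_dataset_names=['inspec', 'kp20k'],
        test_dataset_root_path='data/testsets',
        batch_workers=4,
        beam_search_batch_example=1024,
        beam_search_batch_size=32,
    )
    loaders, word2id, id2word, vocab = load_vocab_and_testsets(opt)
    for name, loader in zip(opt.test_dataset_names, loaders):
        print(name, len(loader))  # number of batches in each test set
    return loaders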
def build_test_dataset(opt):
    # load vocab
    word2idx, idx2word, vocab = load_vocab(opt)
    # read the tokenized source text file and convert it to a 2d list of words
    tokenized_src = read_tokenized_src_file(opt.src_file, remove_eos=opt.remove_title_eos,
                                            title_guided=False)
    test_one2many = build_interactive_predict_dataset(tokenized_src, word2idx, opt)
    # build the data loader
    test_one2many_dataset = KeyphraseDataset(test_one2many, word2idx=word2idx, idx2word=idx2word,
                                             delimiter_type=opt.delimiter_type, load_train=False,
                                             remove_src_eos=opt.remove_src_eos)
    test_loader = DataLoader(dataset=test_one2many_dataset,
                             collate_fn=test_one2many_dataset.collate_fn_one2many,
                             num_workers=opt.batch_workers, batch_size=opt.batch_size,
                             pin_memory=True, shuffle=False)
    return test_loader
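
# A short sketch (hypothetical values) of feeding build_test_dataset() a raw,
# pre-tokenized source file and iterating the resulting loader. 'src_file' is
# assumed to hold one tokenized document per line, and load_vocab(opt) may read
# additional attributes (e.g. a vocab path) that are not shown here.
def _example_build_test_dataset():  # hypothetical helper for illustration
    from argparse import Namespace
    opt = Namespace(
        src_file='data/interactive/src.txt',  # assumed: one tokenized source text per line
        remove_title_eos=True,
        delimiter_type=0,
        remove_src_eos=False,
        batch_workers=2,
        batch_size=16,
    )
    test_loader = build_test_dataset(opt)
    for batch in test_loader:
        pass  # each batch is assembled by collate_fn_one2many
    return test_loader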
def main(opt):
    # load vocab
    word2idx, idx2word, vocab = load_vocab(opt)

    # read the tokenized source text file and convert it to a 2d list of words
    src_file = opt.src_file
    if opt.title_guided:
        tokenized_src, tokenized_title = read_tokenized_src_file(
            src_file, remove_eos=opt.remove_title_eos, title_guided=True)
    else:
        tokenized_src = read_tokenized_src_file(
            src_file, remove_eos=opt.remove_title_eos, title_guided=False)
        tokenized_title = None

    # convert the 2d list of words to a list of dicts with keys 'src', 'src_oov', 'trg',
    # 'trg_copy', 'src_str', 'trg_str', 'oov_dict', 'oov_list'; since targets are not
    # needed during testing, 'trg' and 'trg_copy' are dummy values
    test_one2many = build_interactive_predict_dataset(tokenized_src, word2idx, idx2word, opt,
                                                      tokenized_title)

    # build the data loader
    test_one2many_dataset = KeyphraseDataset(test_one2many, word2idx=word2idx, idx2word=idx2word,
                                             type='one2many', delimiter_type=opt.delimiter_type,
                                             load_train=False, remove_src_eos=opt.remove_src_eos,
                                             title_guided=opt.title_guided)
    test_loader = DataLoader(
        dataset=test_one2many_dataset,
        collate_fn=test_one2many_dataset.collate_fn_one2many,
        num_workers=opt.batch_workers, batch_size=opt.batch_size,
        pin_memory=True, shuffle=False)

    # init the pretrained model
    model = predict.init_pretrained_model(opt)

    # print out the prediction path
    print("Prediction path: %s" % opt.pred_path)

    # predict the keyphrases of the src file and write them to opt.pred_path/predictions.txt
    predict.predict(test_loader, model, opt)
def load_data_vocab(opt, load_train=True):
    logging.info("Loading vocab from disk: %s" % opt.vocab)
    # torch.load takes no file-mode argument; the stray 'wb' has been dropped here and below
    word2id, id2word, vocab = torch.load(opt.vocab)

    logging.info("Loading train and validate data from '%s'" % opt.data)
    logging.info('====================== Dataset =========================')

    # one2many data loader
    if load_train:
        train_one2many = torch.load(opt.data + '.train.one2many.pt')
        train_one2many_dataset = KeyphraseDataset(train_one2many, word2id=word2id, id2word=id2word,
                                                  type='one2many')
        train_one2many_loader = KeyphraseDataLoader(dataset=train_one2many_dataset,
                                                    collate_fn=train_one2many_dataset.collate_fn_one2many,
                                                    num_workers=opt.batch_workers,
                                                    max_batch_example=1024,
                                                    max_batch_pair=opt.batch_size,
                                                    pin_memory=True, shuffle=True)
        logging.info('#(train data size): #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d, '
                     '#(average examples/batch)=%.3f'
                     % (len(train_one2many_loader.dataset), train_one2many_loader.one2one_number(),
                        len(train_one2many_loader),
                        train_one2many_loader.one2one_number() / len(train_one2many_loader)))
    else:
        train_one2many_loader = None

    valid_one2many = torch.load(opt.data + '.valid.one2many.pt')
    test_one2many = torch.load(opt.data + '.test.one2many.pt')

    # !important: beam search is slow, so reduce the validation and test sets to 2000 examples
    valid_one2many = valid_one2many[:2000]
    test_one2many = test_one2many[:2000]

    valid_one2many_dataset = KeyphraseDataset(valid_one2many, word2id=word2id, id2word=id2word,
                                              type='one2many', include_original=True)
    test_one2many_dataset = KeyphraseDataset(test_one2many, word2id=word2id, id2word=id2word,
                                             type='one2many', include_original=True)
    valid_one2many_loader = KeyphraseDataLoader(dataset=valid_one2many_dataset,
                                                collate_fn=valid_one2many_dataset.collate_fn_one2many,
                                                num_workers=opt.batch_workers,
                                                max_batch_example=opt.beam_search_batch_example,
                                                max_batch_pair=opt.beam_search_batch_size,
                                                pin_memory=True, shuffle=False)
    test_one2many_loader = KeyphraseDataLoader(dataset=test_one2many_dataset,
                                               collate_fn=test_one2many_dataset.collate_fn_one2many,
                                               num_workers=opt.batch_workers,
                                               max_batch_example=opt.beam_search_batch_example,
                                               max_batch_pair=opt.beam_search_batch_size,
                                               pin_memory=True, shuffle=False)

    opt.word2id = word2id
    opt.id2word = id2word
    opt.vocab = vocab

    logging.info('#(valid data size): #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d'
                 % (len(valid_one2many_loader.dataset), valid_one2many_loader.one2one_number(),
                    len(valid_one2many_loader)))
    logging.info('#(test data size): #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d'
                 % (len(test_one2many_loader.dataset), test_one2many_loader.one2one_number(),
                    len(test_one2many_loader)))
    logging.info('#(vocab)=%d' % len(vocab))
    logging.info('#(vocab used)=%d' % opt.vocab_size)

    return train_one2many_loader, valid_one2many_loader, test_one2many_loader, word2id, id2word, vocab
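
# A minimal sketch (assumptions marked) of calling load_data_vocab() for training.
# Note it returns train, valid, and test loaders in one call, with the valid/test
# sets truncated to 2000 examples to keep beam search tractable.
def _example_load_data_vocab():  # hypothetical helper for illustration
    from argparse import Namespace
    opt = Namespace(
        vocab='data/kp20k/vocab.pt',   # assumed path to the saved vocab tuple
        data='data/kp20k/kp20k',       # assumed prefix of the .train/.valid/.test one2many files
        batch_workers=4,
        batch_size=32,
        beam_search_batch_example=1024,
        beam_search_batch_size=32,
        vocab_size=50000,              # only used for the '#(vocab used)' log line
    )
    train_loader, valid_loader, test_loader, word2id, id2word, vocab = \
        load_data_vocab(opt, load_train=True)
    return train_loader, valid_loader, test_loader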
def load_data_and_vocab(opt, load_train=True):
    # load vocab
    word2idx, idx2word, vocab = load_vocab(opt)

    # construct data loaders; the undefined names data_file, data_filename_suffix, and
    # batch_size in the original have been replaced by the corresponding opt attributes
    logging.info("Loading train and validate data from '%s'" % opt.data)
    if load_train:
        # load training dataset
        if not opt.one2many:
            # load one2one dataset
            if not opt.custom_data_filename_suffix:
                train_one2one = torch.load(opt.data + '/train.one2one.pt')
            else:
                train_one2one = torch.load(opt.data + '/train.one2one.%s.pt' % opt.data_filename_suffix)
            train_one2one_dataset = KeyphraseDataset(train_one2one, word2idx=word2idx, idx2word=idx2word,
                                                     type='one2one', load_train=load_train,
                                                     remove_src_eos=opt.remove_src_eos,
                                                     title_guided=opt.title_guided)
            train_loader = DataLoader(dataset=train_one2one_dataset,
                                      collate_fn=train_one2one_dataset.collate_fn_one2one,
                                      num_workers=opt.batch_workers, batch_size=opt.batch_size,
                                      pin_memory=True, shuffle=True)
            logging.info('#(train data size): #(batch)=%d' % len(train_loader))

            if not opt.custom_data_filename_suffix:
                valid_one2one = torch.load(opt.data + '/valid.one2one.pt')
            else:
                valid_one2one = torch.load(opt.data + '/valid.one2one.%s.pt' % opt.data_filename_suffix)
            valid_one2one_dataset = KeyphraseDataset(valid_one2one, word2idx=word2idx, idx2word=idx2word,
                                                     type='one2one', load_train=load_train,
                                                     remove_src_eos=opt.remove_src_eos,
                                                     title_guided=opt.title_guided)
            valid_loader = DataLoader(dataset=valid_one2one_dataset,
                                      collate_fn=valid_one2one_dataset.collate_fn_one2one,
                                      num_workers=opt.batch_workers, batch_size=opt.batch_size,
                                      pin_memory=True, shuffle=False)
            logging.info('#(valid data size): #(batch)=%d' % len(valid_loader))
        else:
            # load one2many dataset
            if not opt.custom_data_filename_suffix:
                train_one2many = torch.load(opt.data + '/train.one2many.pt')
            else:
                train_one2many = torch.load(opt.data + '/train.one2many.%s.pt' % opt.data_filename_suffix)
            train_one2many_dataset = KeyphraseDataset(train_one2many, word2idx=word2idx, idx2word=idx2word,
                                                      type='one2many', delimiter_type=opt.delimiter_type,
                                                      load_train=load_train,
                                                      remove_src_eos=opt.remove_src_eos,
                                                      title_guided=opt.title_guided)
            train_loader = DataLoader(dataset=train_one2many_dataset,
                                      collate_fn=train_one2many_dataset.collate_fn_one2many,
                                      num_workers=opt.batch_workers, batch_size=opt.batch_size,
                                      pin_memory=True, shuffle=True)
            logging.info('#(train data size): #(batch)=%d' % len(train_loader))

            if not opt.custom_data_filename_suffix:
                valid_one2many = torch.load(opt.data + '/valid.one2many.pt')
            else:
                valid_one2many = torch.load(opt.data + '/valid.one2many.%s.pt' % opt.data_filename_suffix)
            valid_one2many_dataset = KeyphraseDataset(valid_one2many, word2idx=word2idx, idx2word=idx2word,
                                                      type='one2many', delimiter_type=opt.delimiter_type,
                                                      load_train=load_train,
                                                      remove_src_eos=opt.remove_src_eos,
                                                      title_guided=opt.title_guided)
            valid_loader = DataLoader(dataset=valid_one2many_dataset,
                                      collate_fn=valid_one2many_dataset.collate_fn_one2many,
                                      num_workers=opt.batch_workers, batch_size=opt.batch_size,
                                      pin_memory=True, shuffle=False)
            logging.info('#(valid data size): #(batch)=%d' % len(valid_loader))
        return train_loader, valid_loader, word2idx, idx2word, vocab
    else:
        if not opt.custom_data_filename_suffix:
            test_one2many = torch.load(opt.data + '/test.one2many.pt')
        else:
            test_one2many = torch.load(opt.data + '/test.one2many.%s.pt' % opt.data_filename_suffix)
        test_one2many_dataset = KeyphraseDataset(test_one2many, word2idx=word2idx, idx2word=idx2word,
                                                 type='one2many', delimiter_type=opt.delimiter_type,
                                                 load_train=load_train, remove_src_eos=opt.remove_src_eos,
                                                 title_guided=opt.title_guided)
        test_loader = DataLoader(dataset=test_one2many_dataset,
                                 collate_fn=test_one2many_dataset.collate_fn_one2many,
                                 num_workers=opt.batch_workers, batch_size=opt.batch_size,
                                 pin_memory=True, shuffle=False)
        logging.info('#(test data size): #(batch)=%d' % len(test_loader))
        return test_loader, word2idx, idx2word, vocab
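
# A brief sketch (illustrative values only) of the one2one vs. one2many switch in the
# variant above: opt.one2many selects which dataset files are read and which collate
# function the loaders use.
def _example_one2many_switch():  # hypothetical helper for illustration
    from argparse import Namespace
    base = dict(data='data/kp20k', custom_data_filename_suffix=False, data_filename_suffix='',
                delimiter_type=0, remove_src_eos=False, title_guided=False,
                batch_workers=4, batch_size=32)
    # one2one: each (source, keyphrase) pair is a separate training example
    opt_one2one = Namespace(one2many=False, **base)
    # one2many: all keyphrases of a source are concatenated into one target sequence
    opt_one2many = Namespace(one2many=True, **base)
    loaders = [load_data_and_vocab(o, load_train=True) for o in (opt_one2one, opt_one2many)]
    return loaders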
def load_vocab_and_datasets_for_testing(dataset_names, type, opt):
    '''
    Load additional datasets from disk.
    For now seven datasets are included: 'inspec', 'nus', 'semeval', 'krapivin', 'kp20k',
    'duc', 'stackexchange'.
    Only 'kp20k' and 'stackexchange' provide train/valid/test splits. The others have only
    train/test, and their train split is mostly used for validation.
    :param dataset_names: list of dataset names to load
    :param type: 'test' or 'valid'
    :param opt: option namespace holding paths and loader settings
    :return: (one2many_loaders, word2id, id2word, vocab)
    '''
    assert type == 'test' or type == 'valid'

    logger.info("Loading vocab from disk: %s" % opt.vocab_path)
    word2id, id2word, vocab = torch.load(opt.vocab_path)
    logger.info('#(vocab)=%d' % len(vocab))

    pin_memory = torch.cuda.is_available()
    one2many_loaders = []

    for dataset_name in dataset_names:
        logger.info("Loading %s dataset %s" % (type, dataset_name))
        if type == 'test':
            dataset_path = os.path.join(opt.test_dataset_root_path, dataset_name,
                                        dataset_name + '.test.one2many.pt')
        elif type == 'valid' and dataset_name in ['kp20k', 'stackexchange', 'twacg']:
            dataset_path = os.path.join(opt.test_dataset_root_path, dataset_name,
                                        dataset_name + '.valid.one2many.pt')
        elif type == 'valid' and dataset_name in ['inspec', 'nus', 'semeval', 'krapivin', 'duc']:
            # these datasets have no valid split, so fall back to their train split
            dataset_path = os.path.join(opt.test_dataset_root_path, dataset_name,
                                        dataset_name + '.train.one2many.pt')
        else:
            raise Exception('Unsupported dataset: %s, type=%s' % (dataset_name, type))

        one2many_dataset = KeyphraseDataset(dataset_path, word2id=word2id, id2word=id2word,
                                            type='one2many', include_original=True, lazy_load=True)
        one2many_loader = KeyphraseDataLoader(
            dataset=one2many_dataset,
            collate_fn=one2many_dataset.collate_fn_one2many,
            num_workers=opt.batch_workers,
            max_batch_example=opt.beam_search_batch_example,
            max_batch_pair=opt.beam_search_batch_size,
            pin_memory=pin_memory,
            shuffle=False)
        one2many_loaders.append(one2many_loader)
        logger.info('#(%s data size): #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d'
                    % (type, len(one2many_loader.dataset), one2many_loader.one2one_number(),
                       len(one2many_loader)))
        logger.info('*' * 50)

    return one2many_loaders, word2id, id2word, vocab
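
# A minimal sketch (assumed values) of using load_vocab_and_datasets_for_testing()
# for validation: datasets without a valid split transparently fall back to their
# train split, as implemented above.
def _example_load_datasets_for_testing():  # hypothetical helper for illustration
    from argparse import Namespace
    opt = Namespace(
        vocab_path='data/kp20k/vocab.pt',       # assumed vocab location
        test_dataset_root_path='data/testsets',
        batch_workers=4,
        beam_search_batch_example=1024,
        beam_search_batch_size=32,
    )
    loaders, word2id, id2word, vocab = load_vocab_and_datasets_for_testing(
        dataset_names=['kp20k', 'inspec'], type='valid', opt=opt)
    return loaders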