Example no. 1
def load_data_and_vocab(opt, load_train=True):
    # load vocab
    word2idx, idx2word, vocab = load_vocab(opt)

    # construct data loader
    logging.info("Loading train and validate data from '%s'" % opt.data)

    if load_train:  # load training dataset
        # load one2many train-dataset & valid-dataset
        if not opt.custom_data_filename_suffix:
            train_one2many = torch.load(opt.data + '/train.one2many.pt')
        else:
            train_one2many = torch.load(opt.data + '/train.one2many.%s.pt' % opt.data_filename_suffix)
        train_one2many_dataset = KeyphraseDataset(train_one2many, word2idx=word2idx, idx2word=idx2word,
                                                  delimiter_type=opt.delimiter_type, load_train=load_train,
                                                  remove_src_eos=opt.remove_src_eos)
        train_loader = DataLoader(dataset=train_one2many_dataset,
                                  collate_fn=train_one2many_dataset.collate_fn_one2many,
                                  num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=False,
                                  shuffle=True)
        logging.info('train data size: #(batch)=%d' % len(train_loader))

        if not opt.custom_data_filename_suffix:
            valid_one2many = torch.load(opt.data + '/valid.one2many.pt')
        else:
            valid_one2many = torch.load(opt.data + '/valid.one2many.%s.pt' % opt.data_filename_suffix)

        valid_one2many_dataset = KeyphraseDataset(valid_one2many, word2idx=word2idx, idx2word=idx2word,
                                                  delimiter_type=opt.delimiter_type, load_train=load_train,
                                                  remove_src_eos=opt.remove_src_eos)
        valid_loader = DataLoader(dataset=valid_one2many_dataset,
                                  collate_fn=valid_one2many_dataset.collate_fn_one2many,
                                  num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=False,
                                  shuffle=False)
        logging.info('valid data size: #(batch)=%d' % len(valid_loader))
        return train_loader, valid_loader, word2idx, idx2word, vocab
    else:
        if not opt.custom_data_filename_suffix:
            test_one2many = torch.load(opt.data + '/test.one2many.pt')
        else:
            test_one2many = torch.load(opt.data + '/test.one2many.%s.pt' % opt.data_filename_suffix)
        test_one2many_dataset = KeyphraseDataset(test_one2many, word2idx=word2idx, idx2word=idx2word,
                                                 delimiter_type=opt.delimiter_type,
                                                 load_train=load_train, remove_src_eos=opt.remove_src_eos)
        test_loader = DataLoader(dataset=test_one2many_dataset,
                                 collate_fn=test_one2many_dataset.collate_fn_one2many,
                                 num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True,
                                 shuffle=False)
        logging.info('test data size: #(batch)=%d' % len(test_loader))

        return test_loader, word2idx, idx2word, vocab
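A minimal sketch of driving load_data_and_vocab above. The opt fields mirror the attributes the function reads; the concrete values and the assumption that the function is in scope are mine, not from the snippet.

# Usage sketch; assumes load_data_and_vocab from Example no. 1 is in scope.
from argparse import Namespace

opt = Namespace(
    data='data/kp20k',                  # directory containing the preprocessed .pt files
    custom_data_filename_suffix=False,  # True would load train.one2many.<suffix>.pt instead
    data_filename_suffix='',
    delimiter_type=0,                   # passed through to KeyphraseDataset (assumed default)
    remove_src_eos=False,
    batch_size=12,                      # assumed values
    batch_workers=4,
)

# load_train=True  -> (train_loader, valid_loader, word2idx, idx2word, vocab)
# load_train=False -> (test_loader, word2idx, idx2word, vocab)
train_loader, valid_loader, word2idx, idx2word, vocab = load_data_and_vocab(opt, load_train=True)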
Example no. 2
    def process(self, input_str, top_n=8):
        one2one, one2many = self.preprocess_input(input_str)
        # test_data_loaders, word2id, id2word, vocab = load_vocab_and_testsets(self.opt,one2one,one2many)
        pin_memory = torch.cuda.is_available()
        testset_name = 'kp20k'
        logger.info("Loading test dataset %s" % testset_name)
        # testset_path = os.path.join(opt.test_dataset_root_path, testset_name, testset_name + '.test.one2many.pt')
        # test_one2many = torch.load(testset_path, 'wb')
        test_one2many_dataset = KeyphraseDataset(
            one2many,
            word2id=self.model_opts.word2id,
            id2word=self.model_opts.id2word,
            type='one2many',
            include_original=True)
        test_one2many_loader = KeyphraseDataLoader(
            dataset=test_one2many_dataset,
            collate_fn=test_one2many_dataset.collate_fn_one2many,
            num_workers=self.model_opts.batch_workers,
            max_batch_example=self.model_opts.beam_search_batch_example,
            max_batch_pair=self.model_opts.beam_search_batch_size,
            pin_memory=pin_memory,
            shuffle=False)
        # test_one2many_loaders = [test_one2many_loader]
        # for testset_name, test_data_loader in zip(['kp20k'], test_one2many_loaders):
        # test_data_loader = test_one2many_loader
        logger.info('Evaluating %s' % testset_name)
        output = predict_beam_search(
            self.generator,
            test_one2many_loader,
            self.model_opts,
            title='test_%s' % testset_name,
            predict_save_path=None
        )  #opt.pred_path + '/%s_test_result/' % (testset_name))

        return output[:top_n]
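A short usage sketch for the method above; the owning class is not shown in the snippet, so the KeyphraseExtractor name and its constructor argument are hypothetical.

# Usage sketch: the class that owns process() is not shown above, so the
# name KeyphraseExtractor and its constructor are hypothetical.
extractor = KeyphraseExtractor(model_opts)
keyphrases = extractor.process('deep keyphrase generation with a catseq model', top_n=8)
print(keyphrases)  # the top-8 beam-search predictions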
Example no. 3
def load_vocab_and_testsets(opt):
    logger.info("Loading vocab from disk: %s" % (opt.vocab))
    word2id, id2word, vocab = torch.load(opt.vocab)
    opt.word2id = word2id
    opt.id2word = id2word
    opt.vocab = vocab
    if not opt.decode_old:
        opt.vocab_size = len(word2id)
    logger.info('#(vocab)=%d' % len(vocab))
    logger.info('#(vocab used)=%d' % len(word2id))

    pin_memory = torch.cuda.is_available() and opt.useGpu
    test_one2many_loaders = []

    for testset_name in opt.test_dataset_names:
        logger.info("Loading test dataset %s" % testset_name)

        print("test_dataset_names")
        print(opt.test_dataset_names)
        print("testset_name")
        print(testset_name)
        print()

        testset_path = os.path.join(opt.test_dataset_root_path, testset_name,
                                    testset_name + '.test.one2many.pt')
        test_one2many = torch.load(testset_path)
        test_one2many_dataset = KeyphraseDataset(test_one2many,
                                                 word2id=word2id,
                                                 id2word=id2word,
                                                 type='one2many',
                                                 include_original=True)
        test_one2many_loader = KeyphraseDataLoader(
            dataset=test_one2many_dataset,
            collate_fn=test_one2many_dataset.collate_fn_one2many if opt.useCLF
            else test_one2many_dataset.collate_fn_one2many_noBeginEnd,
            num_workers=opt.batch_workers,
            max_batch_example=opt.beam_search_batch_example,
            max_batch_pair=opt.beam_search_batch_size,
            pin_memory=pin_memory,
            shuffle=False)

        test_one2many_loaders.append(test_one2many_loader)
        logger.info(
            'test data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d'
            %
            (len(test_one2many_loader.dataset),
             test_one2many_loader.one2one_number(), len(test_one2many_loader)))
        logger.info('*' * 50)

    return test_one2many_loaders, word2id, id2word, vocab
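The loaders come back in the same order as opt.test_dataset_names, so evaluation is a simple zip. A sketch modeled on the commented-out loop in Example no. 2; generator and predict_beam_search are assumed to be in scope.

# Sketch: evaluate each returned test loader under its dataset name.
# predict_beam_search and the predict_save_path pattern are borrowed from
# Example no. 2; treat the call as illustrative, not the exact entry point.
test_loaders, word2id, id2word, vocab = load_vocab_and_testsets(opt)
for testset_name, test_loader in zip(opt.test_dataset_names, test_loaders):
    logger.info('Evaluating %s' % testset_name)
    predict_beam_search(generator, test_loader, opt,
                        title='test_%s' % testset_name,
                        predict_save_path=opt.pred_path + '/%s_test_result/' % testset_name)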
Example no. 4
def build_test_dataset(opt):
    # load vocab
    word2idx, idx2word, vocab = load_vocab(opt)
    # load data
    # read the tokenized text file and convert it to a 2d list of words
    tokenized_src = read_tokenized_src_file(opt.src_file, remove_eos=opt.remove_title_eos, title_guided=False)

    test_one2many = build_interactive_predict_dataset(tokenized_src, word2idx, opt)
    # build the data loader
    test_one2many_dataset = KeyphraseDataset(test_one2many, word2idx=word2idx, idx2word=idx2word,
                                             delimiter_type=opt.delimiter_type, load_train=False,
                                             remove_src_eos=opt.remove_src_eos)
    test_loader = DataLoader(dataset=test_one2many_dataset,
                             collate_fn=test_one2many_dataset.collate_fn_one2many,
                             num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True,
                             shuffle=False)
    return test_loader
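A quick sanity-check sketch for the loader above. The structure of each batch is defined by collate_fn_one2many, which is not shown here, so only the batch count is inspected.

# Sketch: confirm the interactive test loader produces batches.
test_loader = build_test_dataset(opt)
print('#(batch)=%d' % len(test_loader))
for batch in test_loader:
    # each batch is whatever collate_fn_one2many produces (not shown above);
    # pass it to the model's prediction routine here
    pass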
Example no. 5
def main(opt):
    # load vocab
    word2idx, idx2word, vocab = load_vocab(opt)
    # load data
    # read the tokenized text file and convert it to a 2d list of words
    src_file = opt.src_file
    #trg_file = opt.trg_file
    #tokenized_train_pairs = read_src_and_trg_files(src_file, trg_file, is_train=False, remove_eos=opt.remove_title_eos)  # 2d list of word
    if opt.title_guided:
        tokenized_src, tokenized_title = read_tokenized_src_file(
            src_file, remove_eos=opt.remove_title_eos, title_guided=True)
    else:
        tokenized_src = read_tokenized_src_file(
            src_file, remove_eos=opt.remove_title_eos, title_guided=False)
        tokenized_title = None
    # convert the 2d list of words to a list of dictionaries with keys 'src', 'src_oov', 'trg', 'trg_copy', 'src_str', 'trg_str', 'oov_dict', 'oov_list'
    # since we don't need the targets during testing, 'trg' and 'trg_copy' are dummy placeholders
    #test_one2many = build_dataset(tokenized_train_pairs, word2idx, idx2word, opt, mode="one2many", include_original=True)
    test_one2many = build_interactive_predict_dataset(tokenized_src, word2idx,
                                                      idx2word, opt,
                                                      tokenized_title)
    # build the data loader
    test_one2many_dataset = KeyphraseDataset(test_one2many,
                                             word2idx=word2idx,
                                             idx2word=idx2word,
                                             type='one2many',
                                             delimiter_type=opt.delimiter_type,
                                             load_train=False,
                                             remove_src_eos=opt.remove_src_eos,
                                             title_guided=opt.title_guided)
    test_loader = DataLoader(
        dataset=test_one2many_dataset,
        collate_fn=test_one2many_dataset.collate_fn_one2many,
        num_workers=opt.batch_workers,
        batch_size=opt.batch_size,
        pin_memory=True,
        shuffle=False)
    # init the pretrained model
    model = predict.init_pretrained_model(opt)

    # Print out predict path
    print("Prediction path: %s" % opt.pred_path)

    # predict the keyphrases of the src file and output it to opt.pred_path/predictions.txt
    predict.predict(test_loader, model, opt)
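For reference, a hypothetical argparse setup collecting the option fields main reads above. The flag names mirror the attribute accesses in the snippet; the defaults and the --vocab flag (presumably consumed by load_vocab) are assumptions.

# Hypothetical CLI wiring for main(); flag names mirror opt attribute accesses.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--src_file', required=True, help='tokenized source file, one document per line')
parser.add_argument('--pred_path', required=True, help='output directory for predictions.txt')
parser.add_argument('--vocab', help='vocab path, presumably read by load_vocab(opt) (assumed flag)')
parser.add_argument('--title_guided', action='store_true')
parser.add_argument('--remove_title_eos', action='store_true')
parser.add_argument('--remove_src_eos', action='store_true')
parser.add_argument('--delimiter_type', type=int, default=0)
parser.add_argument('--batch_size', type=int, default=12)
parser.add_argument('--batch_workers', type=int, default=4)
opt = parser.parse_args()
main(opt)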
Example no. 6
def load_data_vocab(opt, load_train=True):

    logging.info("Loading vocab from disk: %s" % (opt.vocab))
    word2id, id2word, vocab = torch.load(opt.vocab)

    # one2one data loader
    logging.info("Loading train and validate data from '%s'" % opt.data)
    '''
    train_one2one  = torch.load(opt.data + '.train.one2one.pt', 'wb')
    valid_one2one  = torch.load(opt.data + '.valid.one2one.pt', 'wb')

    train_one2one_dataset = KeyphraseDataset(train_one2one, word2id=word2id)
    valid_one2one_dataset = KeyphraseDataset(valid_one2one, word2id=word2id)
    train_one2one_loader = DataLoader(dataset=train_one2one_dataset, collate_fn=train_one2one_dataset.collate_fn_one2one, num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True, shuffle=True)
    valid_one2one_loader = DataLoader(dataset=valid_one2one_dataset, collate_fn=valid_one2one_dataset.collate_fn_one2one, num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True, shuffle=False)
    '''

    logging.info('======================  Dataset  =========================')
    # one2many data loader
    if load_train:
        train_one2many = torch.load(opt.data + '.train.one2many.pt')
        train_one2many_dataset = KeyphraseDataset(train_one2many, word2id=word2id, id2word=id2word, type='one2many')
        train_one2many_loader = KeyphraseDataLoader(dataset=train_one2many_dataset, collate_fn=train_one2many_dataset.collate_fn_one2many, num_workers=opt.batch_workers, max_batch_example=1024, max_batch_pair=opt.batch_size, pin_memory=True, shuffle=True)
        logging.info('train data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d, #(average examples/batch)=%.3f' % (len(train_one2many_loader.dataset), train_one2many_loader.one2one_number(), len(train_one2many_loader), train_one2many_loader.one2one_number() / len(train_one2many_loader)))
    else:
        train_one2many_loader = None

    valid_one2many = torch.load(opt.data + '.valid.one2many.pt')
    test_one2many = torch.load(opt.data + '.test.one2many.pt')

    # !important: beam search is slow, so truncate the validation and test sets
    valid_one2many = valid_one2many[:2000]
    test_one2many = test_one2many[:2000]

    valid_one2many_dataset = KeyphraseDataset(valid_one2many, word2id=word2id, id2word=id2word, type='one2many', include_original=True)
    test_one2many_dataset = KeyphraseDataset(test_one2many, word2id=word2id, id2word=id2word, type='one2many', include_original=True)

    """
    # temporary code, exporting test data for Theano model
    for e_id, e in enumerate(test_one2many_dataset.examples):
        with open(os.path.join('data', 'new_kp20k_for_theano_model', 'text', '%d.txt' % e_id), 'w') as t_file:
            t_file.write(' '.join(e['src_str']))
        with open(os.path.join('data', 'new_kp20k_for_theano_model', 'keyphrase', '%d.txt' % e_id), 'w') as t_file:
            t_file.writelines([(' '.join(t))+'\n' for t in e['trg_str']])
    exit()
    """

    valid_one2many_loader = KeyphraseDataLoader(dataset=valid_one2many_dataset, collate_fn=valid_one2many_dataset.collate_fn_one2many, num_workers=opt.batch_workers, max_batch_example=opt.beam_search_batch_example, max_batch_pair=opt.beam_search_batch_size, pin_memory=True, shuffle=False)
    test_one2many_loader = KeyphraseDataLoader(dataset=test_one2many_dataset, collate_fn=test_one2many_dataset.collate_fn_one2many, num_workers=opt.batch_workers, max_batch_example=opt.beam_search_batch_example, max_batch_pair=opt.beam_search_batch_size, pin_memory=True, shuffle=False)

    opt.word2id = word2id
    opt.id2word = id2word
    opt.vocab = vocab

    logging.info('valid data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d' % (len(valid_one2many_loader.dataset), valid_one2many_loader.one2one_number(), len(valid_one2many_loader)))
    logging.info('test data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d' % (len(test_one2many_loader.dataset), test_one2many_loader.one2one_number(), len(test_one2many_loader)))

    logging.info('#(vocab)=%d' % len(vocab))
    logging.info('#(vocab used)=%d' % opt.vocab_size)

    return train_one2many_loader, valid_one2many_loader, test_one2many_loader, word2id, id2word, vocab
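Note that the first returned value is None when load_train=False, so evaluation-only callers should guard on it; a minimal sketch:

# Sketch: load_data_vocab returns train_one2many_loader=None when load_train=False.
train_loader, valid_loader, test_loader, word2id, id2word, vocab = \
    load_data_vocab(opt, load_train=False)
if train_loader is not None:
    pass  # training epochs would run here
# valid/test are truncated to 2000 examples above, so metrics are on a subset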
Example no. 7
def load_data_and_vocab(opt, load_train=True):
    # load vocab
    word2idx, idx2word, vocab = load_vocab(opt)

    # construct data loader
    logging.info("Loading train and validate data from '%s'" % opt.data)

    if load_train:  # load training dataset
        if not opt.one2many:  # load one2one dataset
            if not opt.custom_data_filename_suffix:
                train_one2one = torch.load(opt.data + '/train.one2one.pt')
            else:
                train_one2one = torch.load(opt.data + '/train.one2one.%s.pt' % opt.data_filename_suffix)
            train_one2one_dataset = KeyphraseDataset(train_one2one, word2idx=word2idx, idx2word=idx2word, type='one2one', load_train=load_train, remove_src_eos=opt.remove_src_eos, title_guided=opt.title_guided)
            train_loader = DataLoader(dataset=train_one2one_dataset,
                                      collate_fn=train_one2one_dataset.collate_fn_one2one,
                                      num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True,
                                      shuffle=True)
            logging.info('train data size: #(batch)=%d' % len(train_loader))

            if not opt.custom_data_filename_suffix:
                valid_one2one = torch.load(opt.data + '/valid.one2one.pt')
            else:
                valid_one2one = torch.load(opt.data + '/valid.one2one.%s.pt' % opt.data_filename_suffix)
            valid_one2one_dataset = KeyphraseDataset(valid_one2one, word2idx=word2idx, idx2word=idx2word,
                                                     type='one2one', load_train=load_train, remove_src_eos=opt.remove_src_eos, title_guided=opt.title_guided)
            valid_loader = DataLoader(dataset=valid_one2one_dataset,
                                      collate_fn=valid_one2one_dataset.collate_fn_one2one,
                                      num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True,
                                      shuffle=False)
            logging.info('valid data size: #(batch)=%d' % len(valid_loader))


        else:  # load one2many dataset
            if not opt.custom_data_filename_suffix:
                train_one2many = torch.load(opt.data + '/train.one2many.pt')
            else:
                train_one2many = torch.load(opt.data + '/train.one2many.%s.pt' % opt.data_filename_suffix)
            train_one2many_dataset = KeyphraseDataset(train_one2many, word2idx=word2idx, idx2word=idx2word, type='one2many', delimiter_type=opt.delimiter_type, load_train=load_train, remove_src_eos=opt.remove_src_eos, title_guided=opt.title_guided)
            train_loader = DataLoader(dataset=train_one2many_dataset,
                                      collate_fn=train_one2many_dataset.collate_fn_one2many,
                                      num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True,
                                      shuffle=True)
            logging.info('train data size: #(batch)=%d' % len(train_loader))

            if not opt.custom_data_filename_suffix:
                valid_one2many = torch.load(opt.data + '/valid.one2many.pt')
            else:
                valid_one2many = torch.load(opt.data + '/valid.one2many.%s.pt' % opt.data_filename_suffix)
            #valid_one2many = valid_one2many[:2000]
            valid_one2many_dataset = KeyphraseDataset(valid_one2many, word2idx=word2idx, idx2word=idx2word,
                                                      type='one2many', delimiter_type=opt.delimiter_type, load_train=load_train, remove_src_eos=opt.remove_src_eos, title_guided=opt.title_guided)
            valid_loader = DataLoader(dataset=valid_one2many_dataset,
                                      collate_fn=valid_one2many_dataset.collate_fn_one2many,
                                      num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True,
                                      shuffle=False)
            logging.info('valid data size: #(batch)=%d' % len(valid_loader))
        return train_loader, valid_loader, word2idx, idx2word, vocab
    else:
        if not opt.custom_data_filename_suffix:
            test_one2many = torch.load(opt.data + '/test.one2many.pt')
        else:
            test_one2many = torch.load(opt.data + '/test.one2many.%s.pt' % opt.data_filename_suffix)
        test_one2many_dataset = KeyphraseDataset(test_one2many, word2idx=word2idx, idx2word=idx2word,
                                                 type='one2many', delimiter_type=opt.delimiter_type, load_train=load_train, remove_src_eos=opt.remove_src_eos, title_guided=opt.title_guided)
        test_loader = DataLoader(dataset=test_one2many_dataset,
                                 collate_fn=test_one2many_dataset.collate_fn_one2many,
                                 num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True,
                                 shuffle=False)
        logging.info('test data size: #(batch)=%d' % len(test_loader))

        return test_loader, word2idx, idx2word, vocab
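The opt.one2many flag above chooses between the two dataset layouts; a short sketch of the switch (the flag values are assumptions):

# one2one: each (source, one keyphrase) pair is a separate example, collated
# by collate_fn_one2one; one2many: one source with all its keyphrases,
# collated by collate_fn_one2many (see the branches above).
opt.one2many = True
opt.delimiter_type = 0  # separator between concatenated keyphrases (assumed encoding)
train_loader, valid_loader, word2idx, idx2word, vocab = load_data_and_vocab(opt, load_train=True)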
Example no. 8
def load_vocab_and_datasets_for_testing(dataset_names, type, opt):
    '''
    Load additional datasets from disk.
    For now seven datasets are included: 'inspec', 'nus', 'semeval', 'krapivin', 'kp20k', 'duc', 'stackexchange'.
    Only 'kp20k' and 'stackexchange' provide train/valid/test splits;
    the others have only train/test, and their train split is mostly used for validation.
    :param dataset_names: names of the datasets to load
    :param type: which split to load, either 'test' or 'valid'
    :param opt: option namespace with dataset paths and loader settings
    :return: (one2many_loaders, word2id, id2word, vocab)
    '''
    assert type in ('test', 'valid')

    logger.info("Loading vocab from disk: %s" % (opt.vocab_path))
    word2id, id2word, vocab = torch.load(opt.vocab_path)
    logger.info('#(vocab)=%d' % len(vocab))

    pin_memory = torch.cuda.is_available()
    one2many_loaders = []

    for dataset_name in dataset_names:
        logger.info("Loading test dataset %s" % dataset_name)
        if type == 'test':
            dataset_path = os.path.join(opt.test_dataset_root_path,
                                        dataset_name,
                                        dataset_name + '.test.one2many.pt')
        elif type == 'valid' and dataset_name in [
                'kp20k', 'stackexchange', 'twacg'
        ]:
            dataset_path = os.path.join(opt.test_dataset_root_path,
                                        dataset_name,
                                        dataset_name + '.valid.one2many.pt')
        elif type == 'valid' and dataset_name in [
                'inspec', 'nus', 'semeval', 'krapivin', 'duc'
        ]:
            dataset_path = os.path.join(opt.test_dataset_root_path,
                                        dataset_name,
                                        dataset_name + '.train.one2many.pt')
        else:
            raise Exception('Unsupported dataset: %s, type=%s' %
                            (dataset_name, type))

        one2many_dataset = KeyphraseDataset(dataset_path,
                                            word2id=word2id,
                                            id2word=id2word,
                                            type='one2many',
                                            include_original=True,
                                            lazy_load=True)
        one2many_loader = KeyphraseDataLoader(
            dataset=one2many_dataset,
            collate_fn=one2many_dataset.collate_fn_one2many,
            num_workers=opt.batch_workers,
            max_batch_example=opt.beam_search_batch_example,
            max_batch_pair=opt.beam_search_batch_size,
            pin_memory=pin_memory,
            shuffle=False)

        one2many_loaders.append(one2many_loader)

        logger.info(
            '%s data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d'
            % (type, len(one2many_loader.dataset),
               one2many_loader.one2one_number(), len(one2many_loader)))
        logger.info('*' * 50)

    return one2many_loaders, word2id, id2word, vocab
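As the docstring notes, only 'kp20k' and 'stackexchange' ship a real validation split, so type='valid' silently falls back to the train split for the smaller corpora. A sketch exercising both paths:

# Sketch: for type='valid', kp20k loads kp20k.valid.one2many.pt, while inspec
# falls back to inspec.train.one2many.pt (see the branches above).
valid_loaders, word2id, id2word, vocab = load_vocab_and_datasets_for_testing(
    dataset_names=['kp20k', 'inspec'], type='valid', opt=opt)
test_loaders, _, _, _ = load_vocab_and_datasets_for_testing(
    dataset_names=['kp20k', 'inspec'], type='test', opt=opt)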