Example #1
def load_vocab_and_testsets(opt):
    logger.info("Loading vocab from disk: %s" % (opt.vocab))
    word2id, id2word, vocab = torch.load(opt.vocab)
    opt.word2id = word2id
    opt.id2word = id2word
    opt.vocab = vocab
    if not opt.decode_old:
        opt.vocab_size = len(word2id)
    logger.info('#(vocab)=%d' % len(vocab))
    logger.info('#(vocab used)=%d' % opt.vocab_size)

    pin_memory = torch.cuda.is_available() and opt.useGpu
    test_one2many_loaders = []

    for testset_name in opt.test_dataset_names:
        logger.info("Loading test dataset %s" % testset_name)

        print("test_dataset_names")
        print(opt.test_dataset_names)
        print("testset_name")
        print(testset_name)
        print()

        testset_path = os.path.join(opt.test_dataset_root_path, testset_name,
                                    testset_name + '.test.one2many.pt')
        test_one2many = torch.load(testset_path)
        test_one2many_dataset = KeyphraseDataset(test_one2many,
                                                 word2id=word2id,
                                                 id2word=id2word,
                                                 type='one2many',
                                                 include_original=True)
        test_one2many_loader = KeyphraseDataLoader(
            dataset=test_one2many_dataset,
            collate_fn=test_one2many_dataset.collate_fn_one2many if opt.useCLF
            else test_one2many_dataset.collate_fn_one2many_noBeginEnd,
            num_workers=opt.batch_workers,
            max_batch_example=opt.beam_search_batch_example,
            max_batch_pair=opt.beam_search_batch_size,
            pin_memory=pin_memory,
            shuffle=False)

        test_one2many_loaders.append(test_one2many_loader)
        logger.info(
            'test data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d'
            %
            (len(test_one2many_loader.dataset),
             test_one2many_loader.one2one_number(), len(test_one2many_loader)))
        logger.info('*' * 50)

    return test_one2many_loaders, word2id, id2word, vocab
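
A minimal invocation sketch for load_vocab_and_testsets. All paths and values below are hypothetical; the attribute names simply mirror the options the function reads.

from argparse import Namespace

# Hypothetical options for illustration only; adjust paths and sizes to your data.
opt = Namespace(
    vocab='data/kp20k/kp20k.vocab.pt',           # serialized (word2id, id2word, vocab) tuple
    decode_old=False,                            # when False, vocab_size is refreshed from word2id
    useGpu=True,                                 # pin memory only if CUDA is actually used
    useCLF=True,                                 # selects the collate_fn with begin/end tokens
    test_dataset_names=['inspec', 'kp20k'],
    test_dataset_root_path='data/test_datasets',
    batch_workers=4,
    beam_search_batch_example=16,
    beam_search_batch_size=8,
)

loaders, word2id, id2word, vocab = load_vocab_and_testsets(opt)
for name, loader in zip(opt.test_dataset_names, loaders):
    print('%s: %d pairs, %d batches' % (name, len(loader.dataset), len(loader)))
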
Example #2
    def process(self, input_str, top_n=8):
        one2one, one2many = self.preprocess_input(input_str)
        # test_data_loaders, word2id, id2word, vocab = load_vocab_and_testsets(self.opt,one2one,one2many)
        pin_memory = torch.cuda.is_available()
        testset_name = 'kp20k'
        logger.info("Loading test dataset %s" % testset_name)
        # testset_path = os.path.join(opt.test_dataset_root_path, testset_name, testset_name + '.test.one2many.pt')
        # test_one2many = torch.load(testset_path, 'wb')
        test_one2many_dataset = KeyphraseDataset(
            one2many,
            word2id=self.model_opts.word2id,
            id2word=self.model_opts.id2word,
            type='one2many',
            include_original=True)
        test_one2many_loader = KeyphraseDataLoader(
            dataset=test_one2many_dataset,
            collate_fn=test_one2many_dataset.collate_fn_one2many,
            num_workers=self.model_opts.batch_workers,
            max_batch_example=self.model_opts.beam_search_batch_example,
            max_batch_pair=self.model_opts.beam_search_batch_size,
            pin_memory=pin_memory,
            shuffle=False)
        # test_one2many_loaders = [test_one2many_loader]
        # for testset_name, test_data_loader in zip(['kp20k'], test_one2many_loaders):
        # test_data_loader = test_one2many_loader
        logger.info('Evaluating %s' % testset_name)
        output = predict_beam_search(
            self.generator,
            test_one2many_loader,
            self.model_opts,
            title='test_%s' % testset_name,
            predict_save_path=None
        )  # alternatively: opt.pred_path + '/%s_test_result/' % testset_name

        return output[:top_n]
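
The class that owns process() is not shown on this page. A plausible call site, assuming a hypothetical instance named extractor that already carries opt, model_opts, generator, and preprocess_input:

# `extractor` is a hypothetical instance of the (unshown) class owning process(),
# with its vocab, options and beam-search generator already loaded.
doc = ('Keyphrase generation with sequence-to-sequence models. '
       'We study encoder-decoder architectures with copy attention.')
for rank, phrase in enumerate(extractor.process(doc, top_n=8), start=1):
    print(rank, phrase)
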
Example #3
def load_data_vocab(opt, load_train=True):

    logging.info("Loading vocab from disk: %s" % (opt.vocab))
    word2id, id2word, vocab = torch.load(opt.vocab)

    # one2one data loader
    logging.info("Loading train and validate data from '%s'" % opt.data)
    '''
    train_one2one  = torch.load(opt.data + '.train.one2one.pt')
    valid_one2one  = torch.load(opt.data + '.valid.one2one.pt')

    train_one2one_dataset = KeyphraseDataset(train_one2one, word2id=word2id)
    valid_one2one_dataset = KeyphraseDataset(valid_one2one, word2id=word2id)
    train_one2one_loader = DataLoader(dataset=train_one2one_dataset, collate_fn=train_one2one_dataset.collate_fn_one2one, num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True, shuffle=True)
    valid_one2one_loader = DataLoader(dataset=valid_one2one_dataset, collate_fn=valid_one2one_dataset.collate_fn_one2one, num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True, shuffle=False)
    '''

    logging.info('======================  Dataset  =========================')
    # one2many data loader
    if load_train:
        train_one2many = torch.load(opt.data + '.train.one2many.pt')
        train_one2many_dataset = KeyphraseDataset(train_one2many, word2id=word2id, id2word=id2word, type='one2many')
        train_one2many_loader  = KeyphraseDataLoader(dataset=train_one2many_dataset, collate_fn=train_one2many_dataset.collate_fn_one2many, num_workers=opt.batch_workers, max_batch_pair=opt.batch_size, pin_memory=True, shuffle=True)
        logging.info('train data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d' % (len(train_one2many_loader.dataset), train_one2many_loader.one2one_number(), len(train_one2many_loader)))
    else:
        train_one2many_loader = None

    valid_one2many = torch.load(opt.data + '.valid.one2many.pt')
    test_one2many  = torch.load(opt.data + '.test.one2many.pt')

    # !important: beam search is slow, so cap the validation and test datasets at 2000 examples
    valid_one2many = valid_one2many[:2000]
    test_one2many  = test_one2many[:2000]

    valid_one2many_dataset = KeyphraseDataset(valid_one2many, word2id=word2id, id2word=id2word, type='one2many', include_original=True)
    test_one2many_dataset  = KeyphraseDataset(test_one2many, word2id=word2id, id2word=id2word, type='one2many', include_original=True)

    """
    # temporary code, exporting test data for Theano model
    for e_id, e in enumerate(test_one2many_dataset.examples):
        with open(os.path.join('data', 'new_kp20k_for_theano_model', 'text', '%d.txt' % e_id), 'w') as t_file:
            t_file.write(' '.join(e['src_str']))
        with open(os.path.join('data', 'new_kp20k_for_theano_model', 'keyphrase', '%d.txt' % e_id), 'w') as t_file:
            t_file.writelines([(' '.join(t))+'\n' for t in e['trg_str']])
    exit()
    """

    valid_one2many_loader  = KeyphraseDataLoader(dataset=valid_one2many_dataset, collate_fn=valid_one2many_dataset.collate_fn_one2many, num_workers=opt.batch_workers, max_batch_pair=opt.beam_search_batch_size, pin_memory=True, shuffle=False)
    test_one2many_loader   = KeyphraseDataLoader(dataset=test_one2many_dataset, collate_fn=test_one2many_dataset.collate_fn_one2many, num_workers=opt.batch_workers, max_batch_pair=opt.beam_search_batch_size, pin_memory=True, shuffle=False)

    opt.word2id = word2id
    opt.id2word = id2word
    opt.vocab   = vocab

    logging.info('valid data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d' % (len(valid_one2many_loader.dataset), valid_one2many_loader.one2one_number(), len(valid_one2many_loader)))
    logging.info('test data size:  #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d' % (len(test_one2many_loader.dataset), test_one2many_loader.one2one_number(), len(test_one2many_loader)))

    logging.info('#(vocab)=%d' % len(vocab))
    logging.info('#(vocab used)=%d' % opt.vocab_size)

    return train_one2many_loader, valid_one2many_loader, test_one2many_loader, word2id, id2word, vocab
Example #4
def load_data_vocab(opt, load_train=True):

    logging.info("Loading vocab from disk: %s" % (opt.vocab))
    word2id, id2word, vocab = torch.load(opt.vocab)

    # one2one data loader
    logging.info("Loading train and validate data from '%s'" % opt.data)
    '''
    train_one2one  = torch.load(opt.data + '.train.one2one.pt')
    valid_one2one  = torch.load(opt.data + '.valid.one2one.pt')

    train_one2one_dataset = KeyphraseDataset(train_one2one, word2id=word2id)
    valid_one2one_dataset = KeyphraseDataset(valid_one2one, word2id=word2id)
    train_one2one_loader = DataLoader(dataset=train_one2one_dataset, collate_fn=train_one2one_dataset.collate_fn_one2one, num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True, shuffle=True)
    valid_one2one_loader = DataLoader(dataset=valid_one2one_dataset, collate_fn=valid_one2one_dataset.collate_fn_one2one, num_workers=opt.batch_workers, batch_size=opt.batch_size, pin_memory=True, shuffle=False)
    '''

    logging.info('======================  Dataset  =========================')
    # one2many data loader
    if load_train:
        train_one2many = torch.load(opt.data + '.train.one2many.pt')
        train_one2many_dataset = KeyphraseDataset(train_one2many, word2id=word2id, id2word=id2word, type='one2many')
        train_one2many_loader = KeyphraseDataLoader(dataset=train_one2many_dataset, collate_fn=train_one2many_dataset.collate_fn_one2many, num_workers=opt.batch_workers, max_batch_example=1024, max_batch_pair=opt.batch_size, pin_memory=True, shuffle=True)
        logging.info('train data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d, #(average examples/batch)=%.3f' % (len(train_one2many_loader.dataset), train_one2many_loader.one2one_number(), len(train_one2many_loader), train_one2many_loader.one2one_number() / len(train_one2many_loader)))
    else:
        train_one2many_loader = None

    valid_one2many = torch.load(opt.data + '.valid.one2many.pt')
    test_one2many = torch.load(opt.data + '.test.one2many.pt')

    # !important: beam search is slow, so cap the validation and test datasets at 2000 examples
    valid_one2many = valid_one2many[:2000]
    test_one2many = test_one2many[:2000]

    valid_one2many_dataset = KeyphraseDataset(valid_one2many, word2id=word2id, id2word=id2word, type='one2many', include_original=True)
    test_one2many_dataset = KeyphraseDataset(test_one2many, word2id=word2id, id2word=id2word, type='one2many', include_original=True)

    """
    # temporary code, exporting test data for Theano model
    for e_id, e in enumerate(test_one2many_dataset.examples):
        with open(os.path.join('data', 'new_kp20k_for_theano_model', 'text', '%d.txt' % e_id), 'w') as t_file:
            t_file.write(' '.join(e['src_str']))
        with open(os.path.join('data', 'new_kp20k_for_theano_model', 'keyphrase', '%d.txt' % e_id), 'w') as t_file:
            t_file.writelines([(' '.join(t))+'\n' for t in e['trg_str']])
    exit()
    """

    valid_one2many_loader = KeyphraseDataLoader(dataset=valid_one2many_dataset, collate_fn=valid_one2many_dataset.collate_fn_one2many, num_workers=opt.batch_workers, max_batch_example=opt.beam_search_batch_example, max_batch_pair=opt.beam_search_batch_size, pin_memory=True, shuffle=False)
    test_one2many_loader = KeyphraseDataLoader(dataset=test_one2many_dataset, collate_fn=test_one2many_dataset.collate_fn_one2many, num_workers=opt.batch_workers, max_batch_example=opt.beam_search_batch_example, max_batch_pair=opt.beam_search_batch_size, pin_memory=True, shuffle=False)

    opt.word2id = word2id
    opt.id2word = id2word
    opt.vocab = vocab

    logging.info('valid data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d' % (len(valid_one2many_loader.dataset), valid_one2many_loader.one2one_number(), len(valid_one2many_loader)))
    logging.info('test data size:  #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d' % (len(test_one2many_loader.dataset), test_one2many_loader.one2one_number(), len(test_one2many_loader)))

    logging.info('#(vocab)=%d' % len(vocab))
    logging.info('#(vocab used)=%d' % opt.vocab_size)

    return train_one2many_loader, valid_one2many_loader, test_one2many_loader, word2id, id2word, vocab
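
Both load_data_vocab variants above share one signature, so a training script can call either the same way. A sketch with hypothetical option values (only the second variant reads beam_search_batch_example):

from argparse import Namespace

# Hypothetical options; names mirror the attributes the two variants read.
opt = Namespace(
    vocab='data/kp20k/kp20k.vocab.pt',
    data='data/kp20k/kp20k',           # prefix for the *.train/valid/test.one2many.pt files
    batch_workers=4,
    batch_size=64,
    beam_search_batch_example=16,      # read only by the second variant
    beam_search_batch_size=8,
    vocab_size=50000,                  # used in the '#(vocab used)' log line
)

train_loader, valid_loader, test_loader, word2id, id2word, vocab = \
    load_data_vocab(opt, load_train=True)

for batch in train_loader:
    pass  # forward/backward pass would go here
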
Example #5
def load_vocab_and_datasets_for_testing(dataset_names, type, opt):
    '''
    Load additional datasets from disk for evaluation.
    For now seven datasets are included: 'inspec', 'nus', 'semeval', 'krapivin', 'kp20k', 'duc', 'stackexchange'.
    Only 'kp20k' and 'stackexchange' provide train/valid/test splits;
    the others have only train/test, and their train split is mostly used for validation.
    :param dataset_names: names of the datasets to load
    :param type: either 'test' or 'valid'
    :param opt: options carrying vocab_path, test_dataset_root_path and batching settings
    :return: (one2many_loaders, word2id, id2word, vocab)
    '''
    assert type in ('test', 'valid')

    logger.info("Loading vocab from disk: %s" % (opt.vocab_path))
    word2id, id2word, vocab = torch.load(opt.vocab_path)
    logger.info('#(vocab)=%d' % len(vocab))

    pin_memory = torch.cuda.is_available()
    one2many_loaders = []

    for dataset_name in dataset_names:
        logger.info("Loading test dataset %s" % dataset_name)
        if type == 'test':
            dataset_path = os.path.join(opt.test_dataset_root_path,
                                        dataset_name,
                                        dataset_name + '.test.one2many.pt')
        elif type == 'valid' and dataset_name in [
                'kp20k', 'stackexchange', 'twacg'
        ]:
            dataset_path = os.path.join(opt.test_dataset_root_path,
                                        dataset_name,
                                        dataset_name + '.valid.one2many.pt')
        elif type == 'valid' and dataset_name in [
                'inspec', 'nus', 'semeval', 'krapivin', 'duc'
        ]:
            dataset_path = os.path.join(opt.test_dataset_root_path,
                                        dataset_name,
                                        dataset_name + '.train.one2many.pt')
        else:
            raise Exception('Unsupported dataset: %s, type=%s' %
                            (dataset_name, type))

        one2many_dataset = KeyphraseDataset(dataset_path,
                                            word2id=word2id,
                                            id2word=id2word,
                                            type='one2many',
                                            include_original=True,
                                            lazy_load=True)
        one2many_loader = KeyphraseDataLoader(
            dataset=one2many_dataset,
            collate_fn=one2many_dataset.collate_fn_one2many,
            num_workers=opt.batch_workers,
            max_batch_example=opt.beam_search_batch_example,
            max_batch_pair=opt.beam_search_batch_size,
            pin_memory=pin_memory,
            shuffle=False)

        one2many_loaders.append(one2many_loader)

        logger.info(
            '%s data size: #(one2many pair)=%d, #(one2one pair)=%d, #(batch)=%d'
            % (type, len(one2many_loader.dataset),
               one2many_loader.one2one_number(), len(one2many_loader)))
        logger.info('*' * 50)

    return one2many_loaders, word2id, id2word, vocab
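
A sketch of building validation loaders with this function; for 'inspec', 'nus', 'semeval', 'krapivin' and 'duc' it transparently falls back to their train split. All values are hypothetical:

from argparse import Namespace

# Hypothetical options for illustration.
opt = Namespace(
    vocab_path='data/kp20k/kp20k.vocab.pt',
    test_dataset_root_path='data/test_datasets',
    batch_workers=4,
    beam_search_batch_example=16,
    beam_search_batch_size=8,
)

loaders, word2id, id2word, vocab = load_vocab_and_datasets_for_testing(
    dataset_names=['inspec', 'kp20k'], type='valid', opt=opt)
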