Example #1
import os

import keyphrase.config
# deserialize_from_file and load_additional_testing_data are assumed to come
# from the surrounding seq2seq-keyphrase project.


def export_krapivin_maui():
    config = keyphrase.config.setup_keyphrase_all()  # load settings (paths, output dirs, etc.)

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'],
                                             idx2word, word2idx, config)

    # Krapivin: the first 400 documents are held out as the Maui test set,
    # the remainder is used as Maui training data.
    dataset = test_sets['krapivin']

    train_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/krapivin/train/'
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    train_texts = dataset['source_str'][400:]
    train_targets = dataset['target_str'][400:]
    for i, (train_text,
            train_target) in enumerate(zip(train_texts, train_targets)):
        print('train ' + str(i))
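        # <i>.txt holds the whitespace-joined document text,
        # <i>.key one "<keyphrase>\t1" line per gold keyphrase.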
        with open(train_dir + str(i) + '.txt', 'w') as f:
            f.write(' '.join(train_text))
        with open(train_dir + str(i) + '.key', 'w') as f:
            f.write('\n'.join([' '.join(t) + '\t1' for t in train_target]))

    test_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/krapivin/test/'
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
    test_texts = dataset['source_str'][:400]
    test_targets = dataset['target_str'][:400]
    for i, (test_text, test_target) in enumerate(zip(test_texts,
                                                     test_targets)):
        print('test ' + str(i))
        with open(test_dir + str(i) + '.txt', 'w') as f:
            f.write(' '.join(test_text))
        with open(test_dir + str(i) + '.key', 'w') as f:
            f.write('\n'.join([' '.join(t) + '\t1' for t in test_target]))
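# A minimal sketch (not part of the original script) showing how one exported
# document could be read back; load_maui_document is a hypothetical helper and
# only assumes the <i>.txt / <i>.key layout written by export_krapivin_maui().
def load_maui_document(directory, doc_id):
    with open(directory + str(doc_id) + '.txt') as f:
        text = f.read()
    with open(directory + str(doc_id) + '.key') as f:
        # each line is "<keyphrase>\t1"; keep only the phrase part
        keyphrases = [line.split('\t')[0] for line in f.read().splitlines() if line]
    return text, keyphrases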
Example #2
import os

import keyphrase.config
# deserialize_from_file and load_additional_testing_data are assumed to come
# from the surrounding seq2seq-keyphrase project.


def export_UTD():
    config = keyphrase.config.setup_keyphrase_all()  # load settings (paths, output dirs, etc.)

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'],
                                             idx2word, word2idx, config)

    for dataset_name, dataset in test_sets.items():
        print('Exporting %s' % str(dataset_name))

        # For Krapivin, export only the first 400 documents (the zip below
        # stops at the shorter of the two lists, so target_str need not be cut).
        if dataset_name == 'krapivin':
            dataset['tagged_source'] = dataset['tagged_source'][:400]

        for i, d in enumerate(
                zip(dataset['tagged_source'], dataset['target_str'])):
            source_postag, target = d
            print('[%d/%d]' % (i, len(dataset['tagged_source'])))

            # join word/POS pairs into "word_TAG" tokens, e.g. "neural_JJ networks_NNS"
            output_text = ' '.join(
                [sp[0] + '_' + sp[1] for sp in source_postag])

            output_dir = config['baseline_data_path'] + dataset_name + '/text/'
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            with open(output_dir + '/' + str(i) + '.txt', 'w') as f:
                f.write(output_text)

            output_text = '\n'.join([' '.join(t) for t in target])
            tag_output_dir = config[
                'baseline_data_path'] + dataset_name + '/keyphrase/'
            if not os.path.exists(tag_output_dir):
                os.makedirs(tag_output_dir)
            with open(tag_output_dir + '/' + str(i) + '.txt', 'w') as f:
                f.write(output_text)
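# A minimal sketch (not part of the original script) of reading one exported
# text file back into (word, POS-tag) pairs; it only assumes the
# "word_TAG word_TAG ..." layout written by export_UTD(), splitting on the
# last underscore so tokens that themselves contain '_' are still handled.
def read_tagged_file(path):
    with open(path) as f:
        tokens = f.read().split()
    return [tuple(tok.rsplit('_', 1)) for tok in tokens]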
Example #3
__author__ = "Rui Meng"
__email__ = "*****@*****.**"

# This fragment assumes the original script's imports (numpy as np, the
# project's config, dataset and evaluation helpers, a logger, and the
# RandomStreams class used below).

if __name__ == '__main__':
    config = setup_keyphrase_all()  # load settings (paths, dataset locations, etc.)

    loader = testing_data_loader('irbooks',
                                 kwargs=dict(basedir=config['path']))
    docs = loader.get_docs(return_dict=True)

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    test_sets = load_additional_testing_data(config['testing_datasets'],
                                             idx2word,
                                             word2idx,
                                             config,
                                             postagging=False,
                                             process_type=2)

    test_set, test_s_list, test_t_list, test_s_o_list, test_t_o_list, input_encodings, predictions, scores, output_encodings, idx2word \
        = deserialize_from_file(config['predict_path'] + 'predict.{0}.{1}.pkl'.format(config['predict_type'], 'irbooks'))

    do_stem = False

    # Evaluation
    # The argument list after test_s_o_list is assumed from the variables
    # deserialized above; adjust it to evaluate_multiple's actual signature.
    outs, overall_score = keyphrase_utils.evaluate_multiple(
        config,
        test_set,
        test_s_list,
        test_t_list,
        test_s_o_list,
        test_t_o_list,
        predictions,
        scores,
        idx2word,
        do_stem)

    # Seed numpy and the Theano random streams from the configured seed.
    n_rng = np.random.RandomState(config['seed'])
    np.random.seed(config['seed'])
    rng = RandomStreams(n_rng.randint(2**30))

    logger.info('*' * 20 + '  config information  ' + '*' * 20)
    # print config information
    for k, v in config.items():
        logger.info("\t\t\t\t%s : %s" % (k, v))
    logger.info('*' * 50)

    # the data is too large to dump into one file, so it has to be loaded from the raw dataset directly
    # train_set, test_set, idx2word, word2idx = keyphrase_dataset.load_data_and_dict(config['training_dataset'], config['testing_dataset'])

    train_set, validation_set, test_sets, idx2word, word2idx = deserialize_from_file(
        config['dataset'])
    test_sets = keyphrase_test_dataset.load_additional_testing_data(
        ['inspec'], idx2word, word2idx, config, postagging=False)

    logger.info('#(training paper)=%d' % len(train_set['source']))
    logger.info('#(training keyphrase)=%d' %
                sum([len(t) for t in train_set['target']]))
    logger.info(
        '#(testing paper)=%d' %
        sum([len(test_set['target']) for test_set in test_sets.values()]))

    logger.info('Load data done.')

    if config['voc_size'] == -1:  # -1 means no vocabulary cut-off: use the full vocabulary
        config['enc_voc_size'] = max(list(zip(*word2idx.items()))[1]) + 1
        config['dec_voc_size'] = config['enc_voc_size']
    else:
        config['enc_voc_size'] = config['voc_size']