Example #1
0
    # the format of each line is 'label \t context \t response'
    # NOTE(review): this fragment starts mid-function — `basedir`, `save_space`
    # and `need_preprocess` are bound outside this view; confirm in the caller.
    prepare = Preparation()
    # run with three files (train.txt.mz, valid.txt.mz, test.txt.mz) to generate unique ids
    # for q/d in train/valid/test data. Since we will merge these three corpus files into a single file later
    corpus, rels_train, rels_valid, rels_test = run_with_train_valid_test_corpus_given_qid_did_gen_unique_id_for_genres(
        basedir + 'train.mz', basedir + 'valid.mz', basedir + 'test.mz')

    # Persist one relation file per split; optionally drop queries whose doc
    # id lists contain duplicates (only when not in space-saving mode).
    for data_part in list(['train', 'valid', 'test']):
        if data_part == 'train':
            rels = rels_train
        elif data_part == 'valid':
            rels = rels_valid
        else:
            rels = rels_test
        print 'total relations in ', data_part, len(rels)
        prepare.save_relation(basedir + 'relation_' + data_part + '.txt', rels)
        # save_space is compared as the string '0' — presumably a CLI flag;
        # TODO confirm against the argument parsing outside this fragment.
        if save_space == '0':
            print 'filter queries with duplicated doc ids...'
            prepare.check_filter_query_with_dup_doc(basedir + 'relation_' +
                                                    data_part + '.txt')
    print 'total corpus ', len(corpus)
    if save_space == '0':
        prepare.save_corpus(basedir + 'corpus.txt', corpus)
    print 'preparation finished ...'

    # Optional second stage: tokenize the merged corpus, keeping only words
    # that occur at least twice, and dump the resulting word dictionary.
    if need_preprocess == '1':
        print 'begin preprocess...'
        # Prerpocess corpus file
        preprocessor = Preprocess(word_filter_config={'min_freq': 2})
        dids, docs = preprocessor.run(basedir + 'corpus.txt')
        preprocessor.save_word_dict(basedir + 'word_dict.txt')
Example #2
0
    # MSMARCO preparation: build a shared corpus plus per-split relation
    # files, then preprocess the corpus into token-id sequences.
    # NOTE(review): `prepare` is bound outside this fragment — presumably a
    # Preparation() instance; confirm against the enclosing scope.
    srcdir = './'
    dstdir = './'

    # Raw inputs: train / dev / test, one example per line.
    infiles = [
        srcdir + 'MSMARCO-small-mz-train.txt', srcdir + 'MSMARCO-mz-dev.txt',
        srcdir + 'MSMARCO-mz-test.txt'
    ]
    corpus, rel_train, rel_valid, rel_test = prepare.run_with_train_valid_test_corpus(
        infiles[0], infiles[1], infiles[2])
    print('total corpus : %d ...' % (len(corpus)))
    print('total relation-train : %d ...' % (len(rel_train)))
    print('total relation-valid : %d ...' % (len(rel_valid)))
    print('total relation-test: %d ...' % (len(rel_test)))
    prepare.save_corpus(dstdir + 'corpus.txt', corpus)

    prepare.save_relation(dstdir + 'relation_train.txt', rel_train)
    prepare.save_relation(dstdir + 'relation_valid.txt', rel_valid)
    prepare.save_relation(dstdir + 'relation_test.txt', rel_test)
    print('Preparation finished ...')

    # Tokenize without stemming; drop words seen fewer than 2 times.
    preprocessor = Preprocess(word_stem_config={'enable': False},
                              word_filter_config={'min_freq': 2})
    dids, docs = preprocessor.run(dstdir + 'corpus.txt')
    preprocessor.save_word_dict(dstdir + 'word_dict.txt', True)
    preprocessor.save_words_stats(dstdir + 'word_stats.txt', True)

    # Write 'doc_id doc_len id id ...' per line. Using a context manager
    # guarantees the handle is closed even if a write raises (the original
    # open()/close() pair leaked the file descriptor on error).
    with open(dstdir + 'corpus_preprocessed.txt', 'w') as fout:
        for inum, did in enumerate(dids):
            fout.write('%s %s %s\n' %
                       (did, len(docs[inum]), ' '.join(map(str, docs[inum]))))
Example #3
0

if __name__ == '__main__':
    # End-to-end WikiQA preparation: build one shared corpus plus a relation
    # file per split, then preprocess the corpus into token-id sequences.
    prepare = Preparation()
    srcdir = './'
    dstdir = './'

    # WikiQA splits already converted to the 'label \t text1 \t text2' format.
    infiles = [ srcdir + 'WikiQA-mz-train.txt', srcdir + 'WikiQA-mz-dev.txt', srcdir + 'WikiQA-mz-test.txt']
    corpus, rel_train, rel_valid, rel_test = prepare.run_with_train_valid_test_corpus(infiles[0], infiles[1], infiles[2])
    print('total corpus : %d ...' % (len(corpus)))
    print('total relation-train : %d ...' % (len(rel_train)))
    print('total relation-valid : %d ...' % (len(rel_valid)))
    print('total relation-test: %d ...' % (len(rel_test)))
    prepare.save_corpus(dstdir + 'corpus.txt', corpus)

    prepare.save_relation(dstdir + 'relation_train.txt', rel_train)
    prepare.save_relation(dstdir + 'relation_valid.txt', rel_valid)
    prepare.save_relation(dstdir + 'relation_test.txt', rel_test)
    print('Preparation finished ...')

    # Tokenize without stemming; drop words seen fewer than 2 times.
    preprocessor = Preprocess(word_stem_config={'enable': False}, word_filter_config={'min_freq': 2})
    dids, docs = preprocessor.run(dstdir + 'corpus.txt')
    preprocessor.save_word_dict(dstdir + 'word_dict.txt', True)
    preprocessor.save_words_stats(dstdir + 'word_stats.txt', True)

    # Write 'doc_id doc_len id id ...' per line. A `with` block closes the
    # file even if a write raises (the original open()/close() pair did not).
    with open(dstdir + 'corpus_preprocessed.txt', 'w') as fout:
        for inum, did in enumerate(dids):
            fout.write('%s %s %s\n' % (did, len(docs[inum]), ' '.join(map(str, docs[inum]))))
    print('Preprocess finished ...')
        intents_files=intents_files,
        web_files=web_files)
    # NOTE(review): the two lines above are the tail of a call whose head is
    # outside this view; `rels_train/valid/test`, `rels_intents`, `prepare`,
    # `cur_data_dir` and `add_intents` are all bound elsewhere — confirm.

    # Save a relation file per split; `index` selects the matching entry in
    # rels_intents when intent relations are also being written.
    for data_part in list(['train', 'valid', 'test']):
        if data_part == 'train':
            rels = rels_train
            index = 0
        elif data_part == 'valid':
            rels = rels_valid
            index = 1
        else:
            rels = rels_test
            index = 2

        print 'total relations in ', data_part, len(rels)
        prepare.save_relation(cur_data_dir + 'relation_' + data_part + '.txt',
                              rels)
        print 'filter queries with duplicated doc ids...'
        prepare.check_filter_query_with_dup_doc(cur_data_dir + 'relation_' +
                                                data_part + '.txt')

        # Optionally repeat the save/filter pass for the intent relations
        # of the same split.
        rels_intent = None
        if add_intents:
            rels_intent = rels_intents[index]

            print '[Intents] total relations in ', data_part, len(rels_intent)
            prepare.save_relation_intents(
                cur_data_dir + 'relation_' + data_part + '_intents.txt',
                rels_intent)
            print '[Intents] filter queries with duplicated doc ids...'
            # NOTE(review): the argument list of this call is truncated in
            # this fragment — the remainder lies outside this view.
            prepare.check_filter_query_with_dup_doc(cur_data_dir +
                                                    'relation_' + data_part +
        # NOTE(review): fragment starts mid if/elif — the branches selecting
        # train/valid/test filenames for other dataset names are outside view.
        valid_file = 'intent_data_dev.tsv'
        test_file = 'intent_data_test.tsv'
    else:
        raise ValueError('invalid data name!')

    # DMN-style preparation: build the corpus and per-split relation lists
    # (the fifth return value is discarded here).
    corpus, rels_train, rels_valid, rels_test, _ = prepare.run_with_train_valid_test_corpus_dmn(
        basedir + train_file, basedir + valid_file, basedir + test_file)
    for data_part in list(['train', 'valid', 'test']):
        if data_part == 'train':
            rels = rels_train
        elif data_part == 'valid':
            rels = rels_valid
        else:
            rels = rels_test
        print 'total relations in ', data_part, len(rels)
        prepare.save_relation(
            cur_data_dir + 'relation_' + data_part + '_only_intents.txt', rels)
        print 'filter queries with duplicated doc ids...'
        prepare.check_filter_query_with_dup_doc(cur_data_dir + 'relation_' +
                                                data_part +
                                                '_only_intents.txt')
    print 'total corpus ', len(corpus)
    # Corpus entries are tab-delimited for the DMN (multi-turn) format.
    prepare.save_corpus_dmn(cur_data_dir + 'corpus.txt', corpus, '\t')
    print 'preparation finished ...'

    print 'begin preprocess...'
    # Prerpocess corpus file
    # Keep only words occurring at least 5 times.
    preprocessor = Preprocess(word_filter_config={'min_freq': 5})
    dids, docs = preprocessor.run_2d(
        cur_data_dir +
        'corpus.txt')  # docs is [corpus_size, utterance_num, max_text1_len]
    preprocessor.save_word_dict(cur_data_dir + 'word_dict.txt')