# Paths of the intents files, listed in train / valid / test order.
# NOTE(review): this first line appears to have lost its leading indentation;
# the continuation lines suggest it was nested one level deeper than the
# statements that follow -- confirm against the full file.
intents_files = [
            basedir + train_intents_file, basedir + valid_intents_file,
            basedir + test_intents_file
        ]

    # Web files are optional: when add_web is set, pass the train / valid /
    # test web paths; otherwise hand an empty list to the preparation call.
    web_files = [
        basedir + train_web,
        basedir + valid_web,
        basedir + test_web,
    ] if add_web else []

    # The preparation call is given the three split files plus the intents
    # and web file lists; its result is unpacked into the corpus and the
    # relation collections named below.
    (corpus, rels_train, rels_valid, rels_test, rels_intents,
     all_rels_web) = prepare.run_with_train_valid_test_corpus_dmn(
         basedir + train_file,
         basedir + valid_file,
         basedir + test_file,
         intents_files=intents_files,
         web_files=web_files)

    # Walk the three splits, selecting the matching relation list and a
    # positional index (0 = train, 1 = valid, 2 = test) for each one.
    # NOTE(review): `index` is not used in the lines visible here; presumably
    # it selects the per-split entry of intents_files / web_files further
    # down the loop body -- confirm against the full file.
    for data_part in list(['train', 'valid', 'test']):
        if data_part == 'train':
            rels = rels_train
            index = 0
        elif data_part == 'valid':
            rels = rels_valid
            index = 1
        else:
            rels = rels_test
            index = 2

        # Report how many relations this split holds.
        print 'total relations in ', data_part, len(rels)
# Example no. 2
# 0
    elif data_name == 'mantis_10':
        # 'mantis_10' reads the "_easy" variants of the train/dev/test TSVs.
        train_file = 'data_train_easy.tsv'
        valid_file = 'data_dev_easy.tsv'
        test_file = 'data_test_easy.tsv'
    elif data_name == 'mantis_50':
        # 'mantis_50' reads the plain data_{train,dev,test}.tsv files.
        train_file = 'data_train.tsv'
        valid_file = 'data_dev.tsv'
        test_file = 'data_test.tsv'
    elif data_name == 'L4':
        train_file = 'train.tsv'
        valid_file = 'dev.tsv'
        test_file = 'test.tsv'
    else:
        # Any data_name not matched by the chain above is rejected.
        raise ValueError('invalid data name!')

    corpus, rels_train, rels_valid, rels_test = prepare.run_with_train_valid_test_corpus_dmn(
        basedir + train_file, basedir + valid_file, basedir + test_file)
    for data_part in list(['train', 'valid', 'test']):
        if data_part == 'train':
            rels = rels_train
        elif data_part == 'valid':
            rels = rels_valid
        else:
            rels = rels_test
        print 'total relations in ', data_part, len(rels)
        prepare.save_relation(cur_data_dir + 'relation_' + data_part + '.txt',
                              rels)
        print 'filter queries with duplicated doc ids...'
        prepare.check_filter_query_with_dup_doc(cur_data_dir + 'relation_' +
                                                data_part + '.txt')
    print 'total corpus ', len(corpus)
    prepare.save_corpus_dmn(cur_data_dir + 'corpus.txt', corpus, '\t')