#assert len(doc[r[0]]) == int(r[1]) return doc
# NOTE(review): the comment line above appears to be the tail of a definition
# that begins outside this view; it is kept verbatim rather than guessed at.


def filter_triletter(tri_stats, min_filter_num=5, max_filter_num=10000):
    """Index the entries of ``tri_stats`` whose count lies within a range.

    Entries are visited in descending order of count (ties keep the order in
    which ``tri_stats.items()`` yields them, because ``sorted`` is stable).
    Each entry whose count satisfies
    ``min_filter_num <= count <= max_filter_num`` receives the next free
    index, starting at 0.

    Args:
        tri_stats: Mapping whose ``items()`` yields ``(key, count)`` pairs.
            Presumably keys are letter-trigrams -- confirm against callers.
        min_filter_num: Smallest count (inclusive) that is kept.
        max_filter_num: Largest count (inclusive) that is kept.

    Returns:
        A dict mapping each kept key to its index.
    """
    ranked = sorted(tri_stats.items(), key=lambda d: d[1], reverse=True)
    tri_dict = {}
    for key, count in ranked:
        if min_filter_num <= count <= max_filter_num:
            # setdefault keeps the first index if a key were ever repeated.
            tri_dict.setdefault(key, len(tri_dict))
    return tri_dict


if __name__ == '__main__':
    # Script entry point: build the corpus and relation files from the Quora
    # duplicate-questions dump located in ``srcdir``.
    prepare = Preparation()
    srcdir = './'
    dstdir = './'
    infile = srcdir + 'quora_duplicate_questions.tsv'
    #infile = srcdir + 'train.csv'
    corpus, rels = prepare.run_with_one_corpus_for_quora(infile)
    print('total corpus : %d ...' % (len(corpus)))
    print('total relations : %d ...' % (len(rels)))
    prepare.save_corpus(dstdir + 'corpus.txt', corpus)
    # The three ratios are presumably the train/valid/test shares, matching
    # the order of the unpacked results -- confirm in split_train_valid_test.
    rel_train, rel_valid, rel_test = prepare.split_train_valid_test(rels, [0.8, 0.1, 0.1])
    prepare.save_relation(dstdir + 'relation_train.txt', rel_train)
    prepare.save_relation(dstdir + 'relation_valid.txt', rel_valid)
    prepare.save_relation(dstdir + 'relation_test.txt', rel_test)
    print('Preparation finished ...')
#assert len(doc[r[0]]) == int(r[1]) return doc
# NOTE(review): the comment line above appears to be the tail of a definition
# that begins outside this view; it is kept verbatim rather than guessed at.


def filter_triletter(tri_stats, min_filter_num=5, max_filter_num=10000):
    """Build an index over the in-range entries of ``tri_stats``.

    The ``(key, count)`` pairs from ``tri_stats.items()`` are ordered by
    count, largest first. Pairs whose count falls outside the inclusive range
    ``[min_filter_num, max_filter_num]`` are skipped; every remaining key is
    numbered consecutively from 0 in that order.

    Args:
        tri_stats: Mapping whose ``items()`` yields ``(key, count)`` pairs.
        min_filter_num: Inclusive lower bound on the count.
        max_filter_num: Inclusive upper bound on the count.

    Returns:
        A dict mapping each kept key to its index.
    """
    tri_dict = {}
    by_count = sorted(tri_stats.items(), key=lambda pair: pair[1], reverse=True)
    for entry in by_count:
        count = entry[1]
        if not (min_filter_num <= count <= max_filter_num):
            continue
        token = entry[0]
        if token in tri_dict:
            continue
        tri_dict[token] = len(tri_dict)
    return tri_dict


if __name__ == '__main__':
    # Script entry point: build the corpus and relation files from the three
    # WikiQA split files located in ``srcdir``.
    prepare = Preparation()
    srcdir = './'
    dstdir = './'
    infiles = [
        srcdir + 'WikiQA-mz-train.txt',
        srcdir + 'WikiQA-mz-dev.txt',
        srcdir + 'WikiQA-mz-test.txt',
    ]
    # The list holds exactly three paths, passed positionally in list order.
    corpus, rel_train, rel_valid, rel_test = prepare.run_with_train_valid_test_corpus(*infiles)
    print('total corpus : %d ...' % (len(corpus)))
    print('total relation-train : %d ...' % (len(rel_train)))
    print('total relation-valid : %d ...' % (len(rel_valid)))
    print('total relation-test: %d ...' % (len(rel_test)))
    prepare.save_corpus(dstdir + 'corpus.txt', corpus)
    # Write the relation files in train, valid, test order.
    for out_name, relations in (
        ('relation_train.txt', rel_train),
        ('relation_valid.txt', rel_valid),
        ('relation_test.txt', rel_test),
    ):
        prepare.save_relation(dstdir + out_name, relations)
    print('Preparation finished ...')
#assert len(doc[r[0]]) == int(r[1]) return doc
# NOTE(review): the comment line above appears to be the tail of a definition
# that begins outside this view; it is kept verbatim rather than guessed at.


def filter_triletter(tri_stats, min_filter_num=5, max_filter_num=10000):
    """Number the keys of ``tri_stats`` whose count is within the given bounds.

    Keys are considered from the highest count to the lowest. A key is kept
    when ``min_filter_num <= count <= max_filter_num`` and it has not already
    been numbered; kept keys receive the indices 0, 1, 2, ... in that order.

    Args:
        tri_stats: Mapping whose ``items()`` yields ``(key, count)`` pairs.
        min_filter_num: Inclusive lower bound on the count.
        max_filter_num: Inclusive upper bound on the count.

    Returns:
        A dict mapping each kept key to its index.
    """
    tri_dict = {}
    descending = sorted(tri_stats.items(), key=lambda d: d[1], reverse=True)
    for token, freq in descending:
        in_range = min_filter_num <= freq <= max_filter_num
        if in_range and token not in tri_dict:
            tri_dict[token] = len(tri_dict)
    return tri_dict


if __name__ == '__main__':
    # Script entry point: build the corpus from the three WikiQA split files.
    # NOTE(review): this block may continue past the end of this view; the
    # visible statements are kept in their original order and form.
    prepare = Preparation()
    srcdir = './'
    dstdir = './'
    infiles = [
        srcdir + 'WikiQA-mz-train.txt',
        srcdir + 'WikiQA-mz-dev.txt',
        srcdir + 'WikiQA-mz-test.txt'
    ]
    corpus, rel_train, rel_valid, rel_test = prepare.run_with_train_valid_test_corpus(
        infiles[0], infiles[1], infiles[2])
    print('total corpus : %d ...' % (len(corpus)))
    print('total relation-train : %d ...' % (len(rel_train)))
    print('total relation-valid : %d ...' % (len(rel_valid)))
    print('total relation-test: %d ...' % (len(rel_test)))
    prepare.save_corpus(dstdir + 'corpus.txt', corpus)