from benchmark_reader import Benchmark
from benchmark_reader import select_files

# where to find the corpus
path_to_corpus = '../benchmark/original/test'

# initialise Benchmark object
b = Benchmark()

# collect xml files
files = select_files(path_to_corpus)

# load files to Benchmark
b.fill_benchmark(files)

# collect the subject and object of the first triple of every entry
entities = set()
for entry in b.entries:
    entity = entry.modifiedtripleset.triples[0].s
    entities.add(entity)
    entity = entry.modifiedtripleset.triples[0].o
    entities.add(entity)

# print(entities)
for entity in entities:
    print(entity.replace('_', ' '))
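One caveat about the loop above: it only inspects the first triple of each entry, so entities that appear only in the later triples of a multi-triple entry are never collected. A minimal variant that walks every triple is sketched below; it assumes the same benchmark_reader API used above (entry.modifiedtripleset.triples with .s and .o attributes), and sorted() is there only for stable output.

entities = set()
for entry in b.entries:
    # walk every triple, not just triples[0]
    for triple in entry.modifiedtripleset.triples:
        entities.add(triple.s)
        entities.add(triple.o)

for entity in sorted(entities):
    print(entity.replace('_', ' '))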
# assumed imports for this script (elided in the excerpt)
import json
import os
from nltk import word_tokenize
from benchmark_reader import Benchmark, select_files

# fragment: tail of the per-entry loop inside convert_dataset
# (indentation inferred from the continue/close structure)
        entities[' '.join(word_tokenize(r.replace('_', ' ')))] = r
        all_e.append(h)
        all_e.append(r)

        tgt = process_tgt_test(entities, entry.lexs)
        if tgt == 0:
            continue

        src = process_src(cur_triples, majority.most_common(1)[0][0])
        wf_src.write(src + '\n')
        wf_tgt.write(json.dumps([tgt, all_e]) + '\n')

    wf_tgt.close()
    wf_src.close()


outdir = 'data/webnlg'

# train split
b = Benchmark()
files = select_files('webnlg_challenge_2017/train')
b.fill_benchmark(files)
pair_train_src = os.path.join(outdir, "pair_src.train")
pair_train_tgt = os.path.join(outdir, "pair_tgt.train")
convert_dataset(pair_train_src, pair_train_tgt, b)

# dev split
b = Benchmark()
files = select_files('webnlg_challenge_2017/dev')
b.fill_benchmark(files)
pair_valid_src = os.path.join(outdir, "pair_src.valid")
pair_valid_tgt = os.path.join(outdir, "pair_tgt.valid")
convert_dataset(pair_valid_src, pair_valid_tgt, b)

# presumably the test split follows the same pattern
b = Benchmark()
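For orientation, here is a hypothetical skeleton of convert_dataset inferred from the fragment above, not the original implementation: one source and one target file are opened, one line is written per usable entry, and both files are closed before returning. The helpers process_src and process_tgt_test and the per-entry bookkeeping (entities, all_e, cur_triples, majority) come straight from the fragment; how they are built is elided in the excerpt, so the loop body is abbreviated here.

def convert_dataset(src_file, tgt_file, b):
    wf_src = open(src_file, 'w', encoding='utf-8')
    wf_tgt = open(tgt_file, 'w', encoding='utf-8')
    for entry in b.entries:
        # (elided) build entities, all_e, cur_triples and majority
        # from entry.modifiedtripleset.triples, as in the fragment
        ...
        tgt = process_tgt_test(entities, entry.lexs)
        if tgt == 0:
            # no usable target lexicalisation for this entry
            continue
        src = process_src(cur_triples, majority.most_common(1)[0][0])
        wf_src.write(src + '\n')
        wf_tgt.write(json.dumps([tgt, all_e]) + '\n')
    wf_tgt.close()
    wf_src.close()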