Code Example #1
from benchmark_reader import Benchmark
from benchmark_reader import select_files


# where to find the corpus
path_to_corpus = '../benchmark/original/test'

# initialise Benchmark object
b = Benchmark()

# collect xml files
files = select_files(path_to_corpus)

# load files to Benchmark
b.fill_benchmark(files)

entities = set()

for entry in b.entries:
    entity = entry.modifiedtripleset.triples[0].s
    entities.add(entity)
    entity = entry.modifiedtripleset.triples[0].o
    entities.add(entity)

# print each entity, restoring spaces from underscores
for entity in entities:
    print(entity.replace('_', ' '))
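Note that the loop above reads only the first triple of each entry, so entities that occur solely in an entry's later triples are missed. If every entity is wanted, a small variant over the same benchmark_reader objects (a sketch; it assumes each triple exposes .s and .o exactly as in the snippet above) collects them all:

entities = set()

for entry in b.entries:
    # walk every triple in the modified triple set, not just the first
    for triple in entry.modifiedtripleset.triples:
        entities.add(triple.s)
        entities.add(triple.o)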


Code Example #2
            entities[' '.join(word_tokenize(r.replace('_', ' ')))] = r
            all_e.append(h)
            all_e.append(r)
        tgt = process_tgt_test(entities, entry.lexs)
        if tgt == 0:
            continue
        src = process_src(cur_triples, majority.most_common(1)[0][0])
        wf_src.write(src + '\n')
        wf_tgt.write(json.dumps([tgt, all_e]) + '\n')
    wf_tgt.close()
    wf_src.close()
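The snippet above is truncated at the top: it is the tail of a convert_dataset(src_path, tgt_path, b) function, whose signature is implied by the calls below. Here is a minimal sketch of the missing scaffolding, inferred only from the names the visible lines use; the per-triple processing itself is unknown and left elided, word_tokenize is assumed to be NLTK's, and the comments on majority and entities are guesses. The imports are the ones the visible code actually needs (json.dumps, os.path.join, most_common, word_tokenize, Benchmark, select_files):

import json
import os
from collections import Counter

from nltk.tokenize import word_tokenize

from benchmark_reader import Benchmark, select_files


def convert_dataset(src_path, tgt_path, b):
    wf_src = open(src_path, 'w')
    wf_tgt = open(tgt_path, 'w')
    for entry in b.entries:
        cur_triples = entry.modifiedtripleset.triples
        majority = Counter()  # guess: tallies a label per triple, e.g. the subject
        entities = {}         # tokenised surface form -> raw underscore form
        all_e = []
        for triple in cur_triples:
            ...  # per-triple processing (truncated), ending in the lines above

Judging from the two write calls, each line of the src file holds one linearised input, and each line of the tgt file holds a JSON array [tgt, all_e], i.e. the processed target text plus the entry's entity list.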


outdir = 'data/webnlg'
os.makedirs(outdir, exist_ok=True)  # make sure the output directory exists
# load the WebNLG 2017 training split
b = Benchmark()
files = select_files('webnlg_challenge_2017/train')
b.fill_benchmark(files)

pair_train_src = os.path.join(outdir, "pair_src.train")
pair_train_tgt = os.path.join(outdir, "pair_tgt.train")
convert_dataset(pair_train_src, pair_train_tgt, b)

# same conversion for the dev split
b = Benchmark()
files = select_files('webnlg_challenge_2017/dev')
b.fill_benchmark(files)

pair_valid_src = os.path.join(outdir, "pair_src.valid")
pair_valid_tgt = os.path.join(outdir, "pair_tgt.valid")
convert_dataset(pair_valid_src, pair_valid_tgt, b)

b = Benchmark()