def input_files(path, filepath=None, relex=False):
    """
    Read the corpus, write train and dev files.
    :param path: directory with the WebNLG benchmark
    :param filepath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :return:
    """
    parts = ['train', 'dev', 'test']
    options = ['all-delex', 'all-notdelex']  # generate files with/without delexicalisation
    for part in parts:
        for option in options:
            files = select_files(path + part, size=(1, 8))
            b = Benchmark()
            b.fill_benchmark(files)
            if option == 'all-delex':
                rplc_list = create_source_target(b, option, part, delex=True)
                print('Total of {} files processed in {} with {} mode'.format(len(files), part, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b, option, part, delex=False)
                print('Total of {} files processed in {} with {} mode'.format(len(files), part, option))
            if relex and part == 'dev' and option == 'all-delex':
                relexicalise(filepath, rplc_list)
    print('Files necessary for training/evaluating are written on disc.')
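
# Usage sketch for the function above (a hedged example, not part of the original script):
# the corpus directory matches the 'webnlg_challenge_2017/' layout used elsewhere in this
# code, and the prediction file name is an assumption to adjust to your own setup.
input_files('webnlg_challenge_2017/')  # generate the train/dev/test source-target files only
input_files('webnlg_challenge_2017/',  # after decoding the dev set: relexicalise the predictions
            filepath='dev_predictions_delex.txt', relex=True)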
def input_files(path_org, path_pre, filepath, part):
    """Read one partition of the corpus, write its delexicalised files and relexicalise the predictions."""
    files = select_files(path_org + part, size=(1, 8))
    b = Benchmark()
    b.fill_benchmark(files)
    rplc_list = create_source_target(b, 'all-delex', part, delex=True)
    relexicalise(filepath, rplc_list, path_pre, part)
    print('Files necessary for training/evaluating are written on disc.')
def read_rdf_file(file, mode='train'):
    nodes = []               # [batch, node_num,]
    in_neigh_indices = []    # [batch, node_num, neighbor_num,]
    in_neigh_edges = []
    out_neigh_indices = []   # [batch, node_num, neighbor_num,]
    out_neigh_edges = []
    sentences = []           # [batch, sent_length,]
    ids = []
    max_in_neigh = 0
    max_out_neigh = 0
    max_node = 0
    max_sent = 0
    rplc_list_dev_delex = None
    options = ['all-notdelex']  # generate files with/without delexicalisation
    for option in options:
        b = Benchmark()
        b.fill_benchmark(file)
        if mode == 'test':
            all_instances, max_node, max_in_neigh, max_out_neigh, max_sent = create_test_data(
                b, option, delex=False)
        elif option == 'all-delex':
            all_instances, max_node, max_in_neigh, max_out_neigh, max_sent = create_source_target(
                b, option, file.split('.')[0], delex=True)
        else:
            all_instances, max_node, max_in_neigh, max_out_neigh, max_sent = create_source_target(
                b, option, file.split('.')[0], delex=False)
    return all_instances, max_node, max_in_neigh, max_out_neigh, max_sent
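
# Hypothetical call to read_rdf_file; 'dev.xml' is a placeholder file name, not taken from
# the original script. The unpacked values follow the return statement above: the instances
# plus the maxima later needed for padding graph batches.
all_instances, max_node, max_in_neigh, max_out_neigh, max_sent = read_rdf_file('dev.xml', mode='train')
print('{} instances, max {} nodes, max sentence length {}'.format(len(all_instances), max_node, max_sent))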
def input_ONefile(filepath, name, predpath=None, relex=False):
    """
    Read the corpus, write unseen test files.
    :param filepath: path to one XML file with the unseen test data
    :param name: prefix of the output folder
    :param predpath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :return:
    """
    part = 'unseen'
    options = ['all-delex', 'all-notdelex']  # generate files with/without delexicalisation
    folder = name + 'Unseen_Baseline'
    if not os.path.exists(folder):
        os.makedirs(folder)
    for option in options:
        b = Benchmark()
        # use a modified Benchmark method to parse a single xml file
        b.fill_benchmark_file(filepath)
        if option == 'all-delex':
            rplc_list = create_source_target(b, option, folder, part, delex=True)
            print('Total of 1 file processed in {} with {} mode'.format(part, option))
        elif option == 'all-notdelex':
            rplc_list = create_source_target(b, option, folder, part, delex=False)
            print('Total of 1 file processed in {} with {} mode'.format(part, option))
        if relex and part == 'unseen' and option == 'all-delex':
            # predpath = folder + '/' + 'baseline_pred_unseen.txt'
            # print('Path to the file is', predpath)
            relexicalise(predpath, folder, rplc_list)
    if ONLINE:
        with open('delex_dict_dbpd.json', 'w') as f:
            json.dump(delex_dict_dbpd, f)
    print('Files necessary for evaluating are written on disc.')
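
# Usage sketch for input_ONefile (hedged example): parse a single unseen-test XML file and
# relexicalise the baseline predictions. The XML file name appears elsewhere in this code;
# the prediction path and the 'webnlg' folder prefix are assumptions.
input_ONefile('testdata_unseen_with_lex.xml', 'webnlg',
              predpath='webnlgUnseen_Baseline/baseline_pred_unseen.txt', relex=True)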
def read_participant(output_file, teamname):
    # read participant's outputs
    output = []
    with open(output_file, 'r') as f:
        output += [unidecode(line.strip()) for line in f]
    b = Benchmark()
    b.fill_benchmark([(path, goldfile)])

    # per size
    for size in range(1, 8):
        # print(size)
        # print('# of instances', b.entry_count(size=str(size)))
        entry_ids = []
        # look up id of a line in the gold benchmark, extract its size
        for entry in b.entries:
            if int(entry.size) == size:
                entry_ids += [int(entry.id[2:])]  # entry id -- 'Id1'
        output_reduced = [output[i - 1] for i in sorted(entry_ids)]
        write_to_file(output_reduced, str(size) + 'size', teamname)

    # per category
    for category in categories:
        # print(category)
        # print('# of instances', b.entry_count(cat=category))
        entry_ids = []
        # look up id of a line in the gold benchmark, extract its category
        for entry in b.entries:
            if entry.category == category:
                entry_ids += [int(entry.id[2:])]  # entry id -- 'Id1'
        output_reduced = [output[i - 1] for i in sorted(entry_ids)]
        write_to_file(output_reduced, category, teamname)

    # old categories
    entry_ids = []
    for category in old_categories:
        # print('# of instances', b.entry_count(cat=category))
        # look up id of a line in the gold benchmark, extract its category
        for entry in b.entries:
            if entry.category == category:
                entry_ids += [int(entry.id[2:])]  # entry id -- 'Id1'
    output_reduced = [output[i - 1] for i in sorted(entry_ids)]
    write_to_file(output_reduced, 'old-cat', teamname)

    # new categories
    entry_ids = []
    for category in new_categories:
        # print('# of instances', b.entry_count(cat=category))
        # look up id of a line in the gold benchmark, extract its category
        for entry in b.entries:
            if entry.category == category:
                entry_ids += [int(entry.id[2:])]  # entry id -- 'Id1'
    output_reduced = [output[i - 1] for i in sorted(entry_ids)]
    write_to_file(output_reduced, 'new-cat', teamname)

    # create all-category files
    write_to_file(output, 'all-cat', teamname)
    print('Files creating finished for: ', teamname)
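
# Hypothetical usage: split one team's output file into per-size, per-category, old-cat,
# new-cat and all-cat files. Both arguments are placeholders, not real submission data.
read_participant('submissions/teamX_all-cat.txt', 'teamX')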
def input_files(path, filepath=None, relex=False):
    """
    Read the corpus, write dev files per triple size.
    :param path: directory with the WebNLG benchmark
    :param filepath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :return:
    """
    part = 'dev'
    options = ['all-delex', 'all-notdelex']  # generate files with/without delexicalisation
    # files is a list of lists of tuples, one inner list per triple-size folder
    for option in options:
        files = select_files(path, size=(1, 8))  # take the general list of files
        # print(files)
        i = 0
        for file1 in files:  # take each list of tuples
            i += 1
            b = Benchmark()
            # print(file1)
            b.fill_benchmark(file1)  # read all xml files of, e.g., triple size 1
            print('Triple size: ' + str(i))
            # create a new folder for each triple size
            folder = 'dev_triple' + str(i)
            if not os.path.exists(folder):
                os.makedirs(folder)
            if option == 'all-delex':
                rplc_list = create_source_target(b, i, option, folder, part, delex=True)
                print('Total of {} files processed in {} with {} mode'.format(len(file1), part, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b, i, option, folder, part, delex=False)
                print('Total of {} files processed in {} with {} mode'.format(len(file1), part, option))
            # take the prediction file for the current triple size
            if relex and part == 'dev' and option == 'all-delex':
                sizeT = i
                filepath = folder + '/' + 'baseline_pred_triple' + str(sizeT) + '.txt'
                print('Path to the file is', filepath)
                relexicalise(filepath, folder, sizeT, rplc_list)
    if ONLINE:
        with open('delex_dict_dbpd.json', 'w') as f:
            json.dump(delex_dict_dbpd, f)
    print('Files necessary for evaluating are written on disc.')
def input_files(path, filepath=None, relex=False):
    """
    Read the corpus, write dev files for chosen DBpedia categories.
    :param path: directory with the WebNLG benchmark
    :param filepath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :return:
    """
    part = 'dev'
    options = ['all-delex', 'all-notdelex']  # generate files with/without delexicalisation
    categories = ['Astronaut', 'University', 'Food', 'SportsTeam', 'City']
    for category in categories:
        # create a new folder for the current category
        folder = 'dev_' + category
        if not os.path.exists(folder):
            os.makedirs(folder)
        for option in options:
            files = select_files(path, category, size=(1, 8))
            b = Benchmark()
            b.fill_benchmark(files)
            if option == 'all-delex':
                rplc_list = create_source_target(b, option, folder, part, delex=True)
                print('Total of {} files processed in {} with {} mode'.format(len(files), category, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b, option, folder, part, delex=False)
                print('Total of {} files processed in {} with {} mode'.format(len(files), category, option))
            # take the prediction file for the current category
            if relex and part == 'dev' and option == 'all-delex':
                # filepath: path to the seq2seq prediction file
                filepath = folder + '/baseline_pred.txt'
                print('Path to the file is', filepath)
                relexicalise(filepath, folder, rplc_list)
    if ONLINE:
        with open('delex_dict_dbpd.json', 'w') as f:
            json.dump(delex_dict_dbpd, f)
    print('Files necessary for evaluating are written on disc.')
            h = h.replace('"', '').replace('_', ' ')
            t = t.replace('"', '').replace('_', ' ')
            r = r.replace('"', '').replace('_', ' ')
            cur_triples.append((h, r, t))
        tgt = process_tgt_test(entry.lexs)
        src = process_src(cur_triples)
        wf_src.write(src + '\n')
        wf_tgt.write(json.dumps(tgt) + '\n')
    wf_tgt.close()
    wf_src.close()
    return eids


outdir = 'data/webnlg_tag'
b = Benchmark()
files = [('webnlg_challenge_2017/test', 'testdata_unseen_with_lex.xml')]
b.fill_benchmark(files)
pair_valid_src = os.path.join(outdir, "unseen.source")
pair_valid_tgt = os.path.join(outdir, "unseen.target")
eids = convert_dataset_test(pair_valid_src, pair_valid_tgt, b)

a = Benchmark()
files = [('webnlg_challenge_2017/test', 'testdata_with_lex.xml')]
a.fill_benchmark(files)
pair_valid_src = os.path.join(outdir, "seen.source")
pair_valid_tgt = os.path.join(outdir, "seen.target")
eids = convert_dataset_test_1(pair_valid_src, pair_valid_tgt, a, eids)

# files = [('webnlg_challenge_2017/test', 'testdata_with_lex.xml')]
# pair_valid_src = os.path.join(outdir, "test.source")
def generate_files():
    # generate files per category
    for cat in categories:
        b = Benchmark()
        b.fill_benchmark([(path, goldfile)])
        # print(cat + ': ' + str(b.entry_count(cat=cat)))
        b_reduced = b.filter([], [cat])
        # print('reduced', b_reduced.entry_count(cat=cat))
        # metric files generation; we use three references
        bleu_ref_files_gen(b_reduced, cat)
        # meteor_ref_files_gen(b_reduced, cat)
        meteor_3ref_files_gen(b_reduced, cat)
        # ter_ref_files_gen(b_reduced, cat)
        ter_ref_files_gen(b_reduced, cat, True)
        # ter_3ref_space_files_gen(b_reduced, cat)

    # generate files per size
    for size in range(1, 8):
        b = Benchmark()
        b.fill_benchmark([(path, goldfile)])
        # print(str(size) + ': ' + str(b.entry_count(size=str(size))))
        b_reduced = b.filter([size], [])
        # print('reduced', b_reduced.entry_count(size=str(size)))
        bleu_ref_files_gen(b_reduced, str(size) + 'size')
        # meteor_ref_files_gen(b_reduced, str(size) + 'size')
        meteor_3ref_files_gen(b_reduced, str(size) + 'size')
        # ter_ref_files_gen(b_reduced, str(size) + 'size')
        ter_ref_files_gen(b_reduced, str(size) + 'size', True)
        # ter_3ref_space_files_gen(b_reduced, str(size) + 'size')

    # generate files per type: old, new, all categories
    b = Benchmark()
    b.fill_benchmark([(path, goldfile)])
    print('Gold count', b.entry_count())
    # metric files generation for all cats
    bleu_ref_files_gen(b, 'all-cat')
    ter_3ref_space_files_gen(b, 'all-cat')  # need this format for significance testing
    # meteor_ref_files_gen(b, 'all-cat')
    meteor_3ref_files_gen(b, 'all-cat')
    # ter_ref_files_gen(b, 'all-cat')
    ter_ref_files_gen(b, 'all-cat', True)

    b_reduced = b.filter([], new_categories)
    print('reduced (new)', b_reduced.entry_count())
    # metric files generation for new cats
    bleu_ref_files_gen(b_reduced, 'new-cat')
    ter_3ref_space_files_gen(b_reduced, 'new-cat')  # need this format for significance testing
    # meteor_ref_files_gen(b_reduced, 'new-cat')
    meteor_3ref_files_gen(b_reduced, 'new-cat')
    # ter_ref_files_gen(b_reduced, 'new-cat')
    ter_ref_files_gen(b_reduced, 'new-cat', True)

    bk = Benchmark()
    bk.fill_benchmark([(path, goldfile)])
    bk_reduced = bk.filter([], old_categories)
    print('reduced (old)', bk_reduced.entry_count())
    # metric files generation for old cats
    bleu_ref_files_gen(bk_reduced, 'old-cat')
    ter_3ref_space_files_gen(bk_reduced, 'old-cat')  # need this format for significance testing
    # meteor_ref_files_gen(bk_reduced, 'old-cat')
    meteor_3ref_files_gen(bk_reduced, 'old-cat')
    # ter_ref_files_gen(bk_reduced, 'old-cat')
    ter_ref_files_gen(bk_reduced, 'old-cat', True)
def input_files(path, filepath=None, relex=False, parts=['train', 'dev'], doCategory=[],
                negraph=True, lowercased=True, fileid=None):
    """
    Read the corpus, write train and dev files.
    :param path: directory with the WebNLG benchmark
    :param filepath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :param parts: partitions to process
    :param negraph: whether to add edges for multi-word entities
    :param lowercased: whether to lowercase the notdelex version of the files
    :return:
    """
    rplc_list_dev_delex = None
    options = ['all-delex', 'all-notdelex']  # generate files with/without delexicalisation
    for part in parts:
        for option in options:
            if part.startswith('test'):
                files = select_files(path + part, size=0)
            else:
                files = select_files(path + part, size=(1, 8))
            if doCategory == UNSEEN_CATEGORIES:
                files = [files[1]]
            b = Benchmark()
            b.fill_benchmark(files)
            if option == 'all-delex':
                rplc_list = create_source_target(b, option, part, delex=True, relex=relex,
                                                 doCategory=doCategory, negraph=negraph,
                                                 lowercased=False)
                print('Total of {} files processed in {} with {} mode'.format(len(files), part, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b, option, part, delex=False, relex=relex,
                                                 doCategory=doCategory, negraph=negraph,
                                                 lowercased=lowercased)
                print('Total of {} files processed in {} with {} mode'.format(len(files), part, option))
            if (part == 'dev' or part.startswith('test')) and option == 'all-delex':
                rplc_list_dev_delex = rplc_list
    if relex and rplc_list_dev_delex:
        relexicalise(filepath, rplc_list_dev_delex, fileid, part, lowercased=lowercased)
    print('Files necessary for training/evaluating are written on disc.')
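
# Usage sketch for the extended variant above (hedged example). UNSEEN_CATEGORIES is the
# module-level constant referenced in the function; the corpus path, prediction file name
# and fileid are assumptions to adjust to your own setup.
input_files('webnlg_challenge_2017/', parts=['train', 'dev'])
input_files('webnlg_challenge_2017/', filepath='test_predictions_delex.txt', relex=True,
            parts=['test'], doCategory=UNSEEN_CATEGORIES, fileid='unseen')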
            entities[' '.join(word_tokenize(h.replace('_', ' ')))] = h
            entities[' '.join(word_tokenize(r.replace('_', ' ')))] = r
            all_e.append(h)
            all_e.append(r)
        tgt = process_tgt_test(entities, entry.lexs)
        if tgt == 0:
            continue
        src = process_src(cur_triples, majority.most_common(1)[0][0])
        wf_src.write(src + '\n')
        wf_tgt.write(json.dumps([tgt, all_e]) + '\n')
    wf_tgt.close()
    wf_src.close()


outdir = 'data/webnlg'
b = Benchmark()
files = select_files('webnlg_challenge_2017/train')
b.fill_benchmark(files)
pair_train_src = os.path.join(outdir, "pair_src.train")
pair_train_tgt = os.path.join(outdir, "pair_tgt.train")
convert_dataset(pair_train_src, pair_train_tgt, b)

b = Benchmark()
files = select_files('webnlg_challenge_2017/dev')
b.fill_benchmark(files)
pair_valid_src = os.path.join(outdir, "pair_src.valid")
pair_valid_tgt = os.path.join(outdir, "pair_tgt.valid")
convert_dataset(pair_valid_src, pair_valid_tgt, b)
def input_files(path, filepath=None, relex=False, parts=['train', 'dev'], doCategory=[],
                options=None, negraph=True, lowercased=True, fileid=None):
    """
    Read the corpus, write train and dev files.
    :param path: directory with the WebNLG benchmark
    :param filepath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :param parts: partitions to process
    :param options: which variants to generate ('all-delex', 'all-notdelex'); default is both
    :param negraph: whether to add edges for multi-word entities
    :param lowercased: whether to lowercase the notdelex version of the files
    :return:
    """
    rplc_list_relex = None
    if options is None:
        options = ['all-delex', 'all-notdelex']  # generate files with/without delexicalisation
    else:
        assert isinstance(options, list)
        assert len(set(options) - {'all-notdelex', 'all-delex'}) == 0
    for part in parts:
        for option in options:
            files = select_files(path + part, size=(1, 8))
            b = Benchmark()
            b.fill_benchmark(files)
            if option == 'all-delex':
                rplc_list = create_source_target(b, option, part, delex=True, relex=relex,
                                                 doCategory=doCategory, negraph=negraph,
                                                 lowercased=lowercased, is_train=(part == "train"))
                print('Total of {} instances processed in {} with {} mode'.format(len(rplc_list), part, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b, option, part, delex=False, relex=relex,
                                                 doCategory=doCategory, negraph=negraph,
                                                 lowercased=lowercased, is_train=(part == "train"))
                print('Total of {} instances processed in {} with {} mode'.format(len(rplc_list), part, option))
            if option == 'all-delex':
                rplc_list_relex = rplc_list
    if relex and rplc_list_relex:
        relexicalise(filepath, rplc_list_relex, fileid, part, lowercased=lowercased, doCategory=doCategory)
    print('Files necessary for training/evaluating are written on disc.')