Example #1
def input_files(path, filepath=None, relex=False):
    """
	Read the corpus, write train and dev files.
	:param path: directory with the WebNLG benchmark
	:param filepath: path to the prediction file with sentences (for relexicalisation)
	:param relex: boolean; do relexicalisation or not
	:return:
	"""
    parts = ['train', 'dev', 'test']
    options = ['all-delex',
               'all-notdelex']  # generate files with/without delexicalisation
    for part in parts:
        for option in options:
            files = select_files(path + part, size=(1, 8))
            b = Benchmark()
            b.fill_benchmark(files)
            if option == 'all-delex':
                rplc_list = create_source_target(b, option, part, delex=True)
                print('Total of {} files processed in {} with {} mode'.format(
                    len(files), part, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b, option, part, delex=False)
                print('Total of {} files processed in {} with {} mode'.format(
                    len(files), part, option))
            if relex and part == 'dev' and option == 'all-delex':
                relexicalise(filepath, rplc_list)
    print('Files necessary for training/evaluating are written on disc.')
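A minimal usage sketch for this variant (the benchmark directory and prediction-file name are assumptions, not part of the original snippet). Note that select_files(path + part, ...) concatenates strings, so path needs a trailing slash:

# Hypothetical invocation: write delex/notdelex files for each partition,
# then relexicalise the dev predictions.
input_files('webnlg/', filepath='dev_predictions.txt', relex=True)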
Example #2
def input_files(path_org, path_pre, filepath, part):
    # Read one benchmark partition, build delexicalised source/target files,
    # and relexicalise the predictions in filepath.
    files = select_files(path_org + part, size=(1, 8))
    b = Benchmark()
    b.fill_benchmark(files)
    rplc_list = create_source_target(b, 'all-delex', part, delex=True)
    relexicalise(filepath, rplc_list, path_pre, part)
    print('Files necessary for training/evaluating are written on disc.')
Example #3
def read_rdf_file(file, mode='train'):
    nodes = []  # [batch, node_num,]
    in_neigh_indices = []  # [batch, node_num, neighbor_num,]
    in_neigh_edges = []
    out_neigh_indices = []  # [batch, node_num, neighbor_num,]
    out_neigh_edges = []
    sentences = []  # [batch, sent_length,]
    ids = []
    max_in_neigh = 0
    max_out_neigh = 0
    max_node = 0
    max_sent = 0
    options = ['all-notdelex']  # only the non-delexicalised version is generated here
    for option in options:
        b = Benchmark()
        b.fill_benchmark(file)
        if mode == 'test':
            all_instances, max_node, max_in_neigh, max_out_neigh, max_sent = create_test_data(
                b, option, delex=False)
        elif option == 'all-delex':
            all_instances, max_node, max_in_neigh, max_out_neigh, max_sent = create_source_target(
                b, option, file.split('.')[0], delex=True)
        else:
            all_instances, max_node, max_in_neigh, max_out_neigh, max_sent = create_source_target(
                b, option, file.split('.')[0], delex=False)

    return all_instances, max_node, max_in_neigh, max_out_neigh, max_sent
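One caveat: the non-test branches call file.split('.')[0], which treats file as a single filename string, while fill_benchmark in the other examples receives a list of (directory, filename) tuples, so the expected type of file is ambiguous. A hypothetical call for the test branch, which avoids the split:

# Hypothetical call: test_file is whatever fill_benchmark accepts in this
# codebase; the returned statistics bound the graph and sentence sizes.
instances, max_node, max_in, max_out, max_sent = read_rdf_file(
    test_file, mode='test')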
Example #4
def input_ONefile(filepath, name, predpath=None, relex=False):
    """
    Read the corpus, write unseen test files.
    :param filepath: path to the XML file with the WebNLG benchmark data
    :param name: prefix for the output folder
    :param predpath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :return:
    """

    part = 'unseen'
    options = ['all-delex', 'all-notdelex']  # generate files with/without delexicalisation
    folder = name + 'Unseen_Baseline'
    if not os.path.exists(folder):
        os.makedirs(folder)
    for option in options:
        b = Benchmark()
        # use the modified benchmark method to parse a single xml file
        b.fill_benchmark_file(filepath)
        if option == 'all-delex':
            rplc_list = create_source_target(b, option, folder, part, delex=True)
            print('File {} processed in {} with {} mode'.format(filepath, part, option))
        elif option == 'all-notdelex':
            rplc_list = create_source_target(b, option, folder, part, delex=False)
            print('File {} processed in {} with {} mode'.format(filepath, part, option))
        if relex and part == 'unseen' and option == 'all-delex':
            # predpath = folder + '/' + 'baseline_pred_unseen.txt'
            # print('Path to the file is', predpath)
            relexicalise(predpath, folder, rplc_list)
    
    if ONLINE:
        with open('delex_dict_dbpd.json', 'w') as f:
            json.dump(delex_dict_dbpd, f)
    print('Files necessary for evaluating are written on disc.')
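A hypothetical invocation (the XML path and team name are illustrative; ONLINE and delex_dict_dbpd are module-level globals in the surrounding script):

# Hypothetical call: parse one unseen-category XML file and write the
# delex/notdelex files into 'MyTeamUnseen_Baseline/'.
input_ONefile('challenge_data/testdata_unseen.xml', 'MyTeam',
              predpath='MyTeamUnseen_Baseline/baseline_pred_unseen.txt',
              relex=True)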
Example #5
def read_participant(output_file, teamname):
    # read participant's outputs
    output = []
    with open(output_file, 'r') as f:
        output += [unidecode(line.strip()) for line in f]

    b = Benchmark()
    b.fill_benchmark([(path, goldfile)])

    # per size
    for size in range(1, 8):
        # print(size)
        # print('# of instances', b.entry_count(size=str(size)))
        entry_ids = []
        # look up id of a line in the gold benchmark, extract its size
        for entry in b.entries:
            if int(entry.size) == size:
                entry_ids += [int(entry.id[2:])]  # entry id -- 'Id1'
        output_reduced = [output[i - 1] for i in sorted(entry_ids)]
        write_to_file(output_reduced, str(size) + 'size', teamname)

    # per category
    for category in categories:
        # print(category)
        # print('# of instances', b.entry_count(cat=category))
        entry_ids = []
        # look up id of a line in the gold benchmark, extract its category
        for entry in b.entries:
            if entry.category == category:
                entry_ids += [int(entry.id[2:])]  # entry id -- 'Id1'
        output_reduced = [output[i - 1] for i in sorted(entry_ids)]
        write_to_file(output_reduced, category, teamname)

    # old categories
    entry_ids = []
    for category in old_categories:
        # print('# of instances', b.entry_count(cat=category))
        # look up id of a line in the gold benchmark, extract its category
        for entry in b.entries:
            if entry.category == category:
                entry_ids += [int(entry.id[2:])]  # entry id -- 'Id1'
    output_reduced = [output[i - 1] for i in sorted(entry_ids)]
    write_to_file(output_reduced, 'old-cat', teamname)

    # new categories
    entry_ids = []
    for category in new_categories:
        # print('# of instances', b.entry_count(cat=category))
        # look up id of a line in the gold benchmark, extract its category
        for entry in b.entries:
            if entry.category == category:
                entry_ids += [int(entry.id[2:])]  # entry id -- 'Id1'
    output_reduced = [output[i - 1] for i in sorted(entry_ids)]
    write_to_file(output_reduced, 'new-cat', teamname)

    # create all-category files
    write_to_file(output, 'all-cat', teamname)
    print('File creation finished for:', teamname)
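A hypothetical call (the function also relies on the module-level path, goldfile, categories, old_categories and new_categories, plus unidecode and write_to_file):

# Hypothetical call: split one team's output into per-size and
# per-category files aligned with the gold benchmark.
read_participant('submissions/teamX.txt', 'teamX')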
Example #6
def input_files(path, filepath=None, relex=False):
    """
    Read the corpus, write train and dev files.
    :param path: directory with the WebNLG benchmark
    :param filepath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :return:
    """
    part = 'dev'
    options = ['all-delex',
               'all-notdelex']  # generate files with/without delexicalisation
    # files is a list of lists of tuples, one list per triple-size folder
    for option in options:
        files = select_files(path, size=(1, 8))  # take the full list of files
        # print(files)
        i = 0
        for file1 in files:  # take each list of tuples
            i += 1
            b = Benchmark()
            # print(file1)
            b.fill_benchmark(file1)  # read all xml files for this triple size
            print('Triple size: ' + str(i))
            # create a new folder for each triple size
            folder = 'dev_triple' + str(i)
            if not os.path.exists(folder):
                os.makedirs(folder)
            if option == 'all-delex':
                rplc_list = create_source_target(b,
                                                 i,
                                                 option,
                                                 folder,
                                                 part,
                                                 delex=True)
                print('Total of {} files processed in {} with {} mode'.format(
                    len(file1), part, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b,
                                                 i,
                                                 option,
                                                 folder,
                                                 part,
                                                 delex=False)
                print('Total of {} files processed in {} with {} mode'.format(
                    len(file1), part, option))
            # take each pred file for current triple size
            if relex and part == 'dev' and option == 'all-delex':
                sizeT = i
                filepath = folder + '/' + 'baseline_pred_triple' + str(
                    sizeT) + '.txt'
                print('Path to the file is', filepath)
                relexicalise(filepath, folder, sizeT, rplc_list)

    if ONLINE:
        with open('delex_dict_dbpd.json', 'w') as f:
            json.dump(delex_dict_dbpd, f)

    print('Files necessary for evaluating are written on disc.')
Example #7
def input_files(path, filepath=None, relex=False):
    """
    Read the corpus, write files for chosen DBpedia category.
    :param path: directory with the WebNLG benchmark
    :param filepath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :return:
    """
    part = 'dev'
    options = ['all-delex',
               'all-notdelex']  # generate files with/without delexicalisation
    categories = ['Astronaut', 'University', 'Food', 'SportsTeam', 'City']
    for category in categories:
        #create new folder for current category
        folder = 'dev_' + category
        if not os.path.exists(folder):
            os.makedirs(folder)
        for option in options:
            files = select_files(path, category, size=(1, 8))
            b = Benchmark()
            b.fill_benchmark(files)
            if option == 'all-delex':
                rplc_list = create_source_target(b,
                                                 option,
                                                 folder,
                                                 part,
                                                 delex=True)
                print('Total of {} files processed in {} with {} mode'.format(
                    len(files), category, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b,
                                                 option,
                                                 folder,
                                                 part,
                                                 delex=False)
                print('Total of {} files processed in {} with {} mode'.format(
                    len(files), category, option))
            # take the pred file for the current category
            if relex and part == 'dev' and option == 'all-delex':
                # filepath: path to the seq2seq prediction output
                filepath = folder + '/baseline_pred.txt'
                print('Path to the file is', filepath)
                relexicalise(filepath, folder, rplc_list)

    if ONLINE:
        with open('delex_dict_dbpd.json', 'w') as f:
            json.dump(delex_dict_dbpd, f)
    print('Files necessary for evaluating are written on disc.')
Example #8
            h = h.replace('"', '').replace('_', ' ')
            t = t.replace('"', '').replace('_', ' ')
            r = r.replace('"', '').replace('_', ' ')
            cur_triples.append((h, r, t))
        tgt = process_tgt_test(entry.lexs)
        src = process_src(cur_triples)
        wf_src.write(src + '\n')
        wf_tgt.write(json.dumps(tgt) + '\n')
    wf_tgt.close()
    wf_src.close()
    return eids


outdir = 'data/webnlg_tag'

b = Benchmark()
files = [('webnlg_challenge_2017/test', 'testdata_unseen_with_lex.xml')]
b.fill_benchmark(files)

pair_valid_src = os.path.join(outdir, "unseen.source")
pair_valid_tgt = os.path.join(outdir, "unseen.target")
eids = convert_dataset_test(pair_valid_src, pair_valid_tgt, b)
a = Benchmark()
files = [('webnlg_challenge_2017/test', 'testdata_with_lex.xml')]
a.fill_benchmark(files)
pair_valid_src = os.path.join(outdir, "seen.source")
pair_valid_tgt = os.path.join(outdir, "seen.target")
eids = convert_dataset_test_1(pair_valid_src, pair_valid_tgt, a, eids)

# files =[('webnlg_challenge_2017/test', 'testdata_with_lex.xml')]
# pair_valid_src = os.path.join(outdir, "test.source")
Example #9
def generate_files():
    # generate files per category
    for cat in categories:
        b = Benchmark()
        b.fill_benchmark([(path, goldfile)])
        # print(cat + ': ' + str(b.entry_count(cat=cat)))
        b_reduced = b.filter([], [cat])
        # print('reduced', b_reduced.entry_count(cat=cat))

        # metric files generation; we use three references
        bleu_ref_files_gen(b_reduced, cat)
        # meteor_ref_files_gen(b_reduced, cat)
        meteor_3ref_files_gen(b_reduced, cat)
        # ter_ref_files_gen(b_reduced, cat)
        ter_ref_files_gen(b_reduced, cat, True)
        # ter_3ref_space_files_gen(b_reduced, cat)

    # generate files per size
    for size in range(1, 8):
        b = Benchmark()
        b.fill_benchmark([(path, goldfile)])
        # print(str(size) + ': ' + str(b.entry_count(size=str(size))))
        b_reduced = b.filter([size], [])
        # print('reduced', b_reduced.entry_count(size=str(size)))
        bleu_ref_files_gen(b_reduced, str(size) + 'size')
        # meteor_ref_files_gen(b_reduced, str(size) + 'size')
        meteor_3ref_files_gen(b_reduced, str(size) + 'size')
        # ter_ref_files_gen(b_reduced, str(size) + 'size')
        ter_ref_files_gen(b_reduced, str(size) + 'size', True)
        # ter_3ref_space_files_gen(b_reduced, str(size) + 'size')

    # generate files per type: old, new, all categories
    b = Benchmark()
    b.fill_benchmark([(path, goldfile)])
    print('Gold count', b.entry_count())
    # metric files generation for all cats
    bleu_ref_files_gen(b, 'all-cat')
    ter_3ref_space_files_gen(
        b, 'all-cat')  # need this format for significance testing
    # meteor_ref_files_gen(b, 'all-cat')
    meteor_3ref_files_gen(b, 'all-cat')
    # ter_ref_files_gen(b, 'all-cat')
    ter_ref_files_gen(b, 'all-cat', True)

    b_reduced = b.filter([], new_categories)
    print('reduced (new)', b_reduced.entry_count())
    # metric files generation for new cats
    bleu_ref_files_gen(b_reduced, 'new-cat')
    ter_3ref_space_files_gen(
        b_reduced, 'new-cat')  # need this format for significance testing
    # meteor_ref_files_gen(b_reduced, 'new-cat')
    meteor_3ref_files_gen(b_reduced, 'new-cat')
    # ter_ref_files_gen(b_reduced, 'new-cat')
    ter_ref_files_gen(b_reduced, 'new-cat', True)

    bk = Benchmark()
    bk.fill_benchmark([(path, goldfile)])
    bk_reduced = bk.filter([], old_categories)
    print('reduced (old)', bk_reduced.entry_count())
    # metric files generation for old cats
    bleu_ref_files_gen(bk_reduced, 'old-cat')
    ter_3ref_space_files_gen(
        bk_reduced, 'old-cat')  # need this format for significance testing
    # meteor_ref_files_gen(bk_reduced, 'old-cat')
    meteor_3ref_files_gen(bk_reduced, 'old-cat')
    # ter_ref_files_gen(bk_reduced, 'old-cat')
    ter_ref_files_gen(bk_reduced, 'old-cat', True)
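generate_files takes no arguments and reads module-level globals; a sketch of the setup one would need before calling it (the global names come from the snippet, the values are assumptions; categories, old_categories and new_categories must be defined as well):

# Hypothetical setup for the globals this function reads.
path = 'webnlg_challenge_2017/test'
goldfile = 'testdata_with_lex.xml'
generate_files()  # writes BLEU/METEOR/TER reference files per category and size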
Example #10
def input_files(path,
                filepath=None,
                relex=False,
                parts=['train', 'dev'],
                doCategory=[],
                negraph=True,
                lowercased=True,
                fileid=None):
    """
    Read the corpus, write train and dev files.
    :param path: directory with the WebNLG benchmark
    :param filepath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :param parts: partitions to process
    :param negraph: whether to add edges for multi-word entities
    :param lowercased: whether to lowercase the notdelex version of the files
    :return:
    """

    rplc_list_dev_delex = None
    options = ['all-delex',
               'all-notdelex']  # generate files with/without delexicalisation
    for part in parts:
        for option in options:
            if part.startswith('test'):
                files = select_files(path + part, size=0)
            else:
                files = select_files(path + part, size=(1, 8))
            if doCategory == UNSEEN_CATEGORIES:
                files = [files[1]]
            b = Benchmark()
            b.fill_benchmark(files)
            if option == 'all-delex':
                rplc_list = create_source_target(b,
                                                 option,
                                                 part,
                                                 delex=True,
                                                 relex=relex,
                                                 doCategory=doCategory,
                                                 negraph=negraph,
                                                 lowercased=False)
                print('Total of {} files processed in {} with {} mode'.format(
                    len(files), part, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b,
                                                 option,
                                                 part,
                                                 delex=False,
                                                 relex=relex,
                                                 doCategory=doCategory,
                                                 negraph=negraph,
                                                 lowercased=lowercased)
                print('Total of {} files processed in {} with {} mode'.format(
                    len(files), part, option))
            if (part == 'dev'
                    or part.startswith('test')) and option == 'all-delex':
                rplc_list_dev_delex = rplc_list

    if relex and rplc_list_dev_delex:
        relexicalise(filepath,
                     rplc_list_dev_delex,
                     fileid,
                     part,
                     lowercased=lowercased)
    print('Files necessary for training/evaluating are written on disc.')
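A hypothetical call for this variant, which also handles test partitions (UNSEEN_CATEGORIES is a module-level constant in the snippet; the path and file names are assumptions):

# Hypothetical call: process the test split restricted to the unseen
# categories, then relexicalise using the supplied prediction file.
input_files('webnlg/', filepath='test_predictions.txt', relex=True,
            parts=['test'], doCategory=UNSEEN_CATEGORIES, fileid='1')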
Example #11
            entities[' '.join(word_tokenize(h.replace('_', ' ')))] = h
            entities[' '.join(word_tokenize(r.replace('_', ' ')))] = r
            all_e.append(h)
            all_e.append(r)
        tgt = process_tgt_test(entities, entry.lexs)
        if tgt == 0:
            continue
        src = process_src(cur_triples, majority.most_common(1)[0][0])
        wf_src.write(src + '\n')
        wf_tgt.write(json.dumps([tgt, all_e]) + '\n')
    wf_tgt.close()
    wf_src.close()


outdir = 'data/webnlg'
b = Benchmark()
files = select_files('webnlg_challenge_2017/train')
b.fill_benchmark(files)

pair_train_src = os.path.join(outdir, "pair_src.train")
pair_train_tgt = os.path.join(outdir, "pair_tgt.train")
convert_dataset(pair_train_src, pair_train_tgt, b)

b = Benchmark()
files = select_files('webnlg_challenge_2017/dev')
b.fill_benchmark(files)

pair_valid_src = os.path.join(outdir, "pair_src.valid")
pair_valid_tgt = os.path.join(outdir, "pair_tgt.valid")
convert_dataset(pair_valid_src, pair_valid_tgt, b)
def input_files(path,
                filepath=None,
                relex=False,
                parts=['train', 'dev'],
                doCategory=[],
                options=None,
                negraph=True,
                lowercased=True,
                fileid=None):
    """
    Read the corpus, write train and dev files.
    :param path: directory with the WebNLG benchmark
    :param filepath: path to the prediction file with sentences (for relexicalisation)
    :param relex: boolean; do relexicalisation or not
    :param parts: partitions to process
    :param options: which of 'all-delex'/'all-notdelex' to generate (default: both)
    :param negraph: whether to add edges for multi-word entities
    :param lowercased: whether to lowercase the notdelex version of the files
    :return:
    """

    rplc_list_relex = None
    # options = ['all-delex', 'all-notdelex']  # generate files with/without delexicalisation
    if options is None:
        options = ['all-delex', 'all-notdelex']  # generate files with/without delexicalisation
    else:
        assert isinstance(options, list)
        assert len(set(options) - {'all-notdelex', 'all-delex'}) == 0

    for part in parts:
        for option in options:
            files = select_files(path + part, size=(1, 8))
            b = Benchmark()
            b.fill_benchmark(files)
            if option == 'all-delex':
                rplc_list = create_source_target(b,
                                                 option,
                                                 part,
                                                 delex=True,
                                                 relex=relex,
                                                 doCategory=doCategory,
                                                 negraph=negraph,
                                                 lowercased=lowercased,
                                                 is_train=(part == "train"))
                print('Total of {} instances processed in {} with {} mode'.
                      format(len(rplc_list), part, option))
            elif option == 'all-notdelex':
                rplc_list = create_source_target(b,
                                                 option,
                                                 part,
                                                 delex=False,
                                                 relex=relex,
                                                 doCategory=doCategory,
                                                 negraph=negraph,
                                                 lowercased=lowercased,
                                                 is_train=(part == "train"))
                print('Total of {} instances processed in {} with {} mode'.
                      format(len(rplc_list), part, option))
            if option == 'all-delex':
                rplc_list_relex = rplc_list

    if relex and rplc_list_relex:
        relexicalise(filepath,
                     rplc_list_relex,
                     fileid,
                     part,
                     lowercased=lowercased,
                     doCategory=doCategory)
    print('Files necessary for training/evaluating are written on disc.')
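A hypothetical call for this last variant, which accepts an explicit options list (validated above against {'all-delex', 'all-notdelex'}):

# Hypothetical call: generate only the delexicalised files for train and dev.
input_files('webnlg/', parts=['train', 'dev'], options=['all-delex'])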