Example #1
def main():
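    # Sanity-check the released alignment files: load every alignments JSON
    # in the alignments directory, verify that each aligned node exists in
    # the matching AMR, and re-serialize the alignments. Assumes AMR_Reader
    # (amr_utils) and the project's load_from_json/write_to_json helpers.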
    align_dir = 'data-release/alignments'

    reader = AMR_Reader()
    dev_amrs = reader.load('data-release/amrs/leamr_dev.txt')
    test_amrs = reader.load('data-release/amrs/leamr_test.txt')
    all_amrs = reader.load('data-release/amrs/ldc+little_prince.txt')

    amr_map = {
        'leamr_dev': dev_amrs,
        'leamr_test': test_amrs,
        'ldc+little_prince': all_amrs
    }
    for filename in os.listdir(align_dir):
        file = os.path.join(align_dir, filename)
        if file.endswith('alignments.json') or file.endswith(
                'alignments.gold.json'):
            for k in amr_map:
                if filename.startswith(k):
                    amrs = amr_map[k]
                    aligns = load_from_json(file, amrs, unanonymize=True)
                    # run quick test
                    for amr in amrs:
                        for align in aligns[amr.id]:
                            for n in align.nodes:
                                if n not in amr.nodes:
                                    raise Exception(
                                        f'Failed to match alignments to AMR data. AMR "{amr.id}" has no node named "{n}".'
                                    )
                    # write output
                    write_to_json(file, aligns, amrs=amrs, anonymize=False)
                    break
Example #2
def main():
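    # Render a file of AMRs as HTML or LaTeX and write the result to the
    # output file. HTML_AMR and Latex_AMR are assumed to be imported from
    # amr_utils' styling helpers.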
    import argparse
    from amr_utils.amr_readers import AMR_Reader

    parser = argparse.ArgumentParser(description='Style AMRs as HTML or Latex')
    parser.add_argument('-f', '--files', type=str, nargs=2, required=True,
                        help='input and output files (AMRs in JAMR format)')
    parser.add_argument('--latex', action='store_true', help='style as latex')
    parser.add_argument('--html', action='store_true', help='style as html')

    args = parser.parse_args()
    file = args.files[0]
    outfile = args.files[1]

    cr = AMR_Reader(style='letters')
    amrs = cr.load(file, remove_wiki=True)

    # HTML takes precedence; LaTeX is the default when neither flag is given
    output = HTML_AMR.style(amrs) if args.html else Latex_AMR.style(amrs)
    with open(outfile, 'w+', encoding='utf8') as f:
        f.write(output)
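
A minimal usage sketch for this CLI (the script name here is hypothetical):

    python style_amrs.py -f amrs.txt amrs.html --html
    python style_amrs.py -f amrs.txt amrs.tex --latex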
Example #3
File: load_ccg.py Project: ablodge/leamr
def main():
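    # Match CCGbank sentences to AMRs by exact token string, write the
    # CCG-id/AMR-id maps to TSV (train vs. dev/test), and concatenate the
    # gold CCG parses and dependencies into single files under top_dir.
    # Assumes sys/os and the project's load_ccgbank helper are imported.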
    top_dir = sys.argv[1]
    parse_dir = sys.argv[2]
    dependency_dir = sys.argv[3]
    # os.chdir(top_dir)

    reader = AMR_Reader()
    amrs = reader.load('../data/split/train.txt', remove_wiki=True)
    amrs2 = reader.load('../data/split/dev.txt', remove_wiki=True)
    amrs3 = reader.load('../data/split/test.txt', remove_wiki=True)
    amr_ids = {'train':{' '.join(amr.tokens):amr.id for amr in amrs},
               'dev':{' '.join(amr.tokens):amr.id for amr in amrs2},
               'test':{' '.join(amr.tokens):amr.id for amr in amrs3}}
    # idx, deps = load_dependencies(r'C:\Users\Austin\OneDrive\Desktop\ccg rebank\data\PARG\00\wsj_0001.parg')
    # idx, deps = load_dependencies(r'C:\Users\Austin\OneDrive\Desktop\AMR-enhanced-alignments\data\test.ccg_dependencies.tsv', flavor='easysrl')

    ids = {}
    words = []
    for subdir_name in os.listdir(parse_dir):
        subdir = os.path.join(parse_dir, subdir_name)
        for file_name in os.listdir(subdir):
            file = os.path.join(subdir, file_name)
            idx, ccg_words, ccg_trees = load_ccgbank(file)
            for sent_id, ws in zip(idx, ccg_words):
                ids[' '.join(w[1] for w in ws)] = sent_id
            words.extend(ccg_words)

    with open('ids_map_train.tsv', 'w+', encoding='utf8') as f:
        for k in ['train']:
            common_sents = [(ids[i],amr_ids[k][i],i) for i in ids if i in amr_ids[k]]
            print(k, len(common_sents))
            for id1, id2, sent in common_sents:
                f.write(f'{id1}\t{id2}\t{sent}\n')
    with open('ids_map_test.tsv', 'w+', encoding='utf8') as f:
        for k in ['dev','test']:
            common_sents = [(ids[i],amr_ids[k][i],i) for i in ids if i in amr_ids[k]]
            print(k, len(common_sents))
            for id1, id2, sent in common_sents:
                f.write(f'{id1}\t{id2}\t{sent}\n')

    output_file = os.path.join(top_dir, 'ccgbank_parses.gold.txt')
    with open(output_file, 'w+', encoding='utf8') as fw:
        for subdir_name in os.listdir(parse_dir):
            subdir = os.path.join(parse_dir, subdir_name)
            for file_name in os.listdir(subdir):
                file = os.path.join(subdir, file_name)
                with open(file, 'r', encoding='utf8') as fr:
                    s = fr.read()
                    fw.write(s)

    output_file = os.path.join(top_dir, 'ccgbank_dependencies.gold.txt')
    with open(output_file, 'w+', encoding='utf8') as fw:
        for subdir_name in os.listdir(dependency_dir):
            subdir = os.path.join(dependency_dir, subdir_name)
            for file_name in os.listdir(subdir):
                file = os.path.join(subdir, file_name)
                with open(file, 'r', encoding='utf8') as fr:
                    s = fr.read()
                    fw.write(s)
    print()
Example #4
def main():
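    # Export AMRs that have gold subgraph alignments: write them as JAMR
    # strings, render an HTML view via ID_Display, and dump a TSV listing
    # every node, edge, and reentrancy with its aligned token ids.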
    amr_file = sys.argv[1]
    alignment_file = sys.argv[2]
    relation_alignment_file = sys.argv[3]

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)

    subgraph_alignments = reader.load_alignments_from_json(alignment_file, amrs)
    relation_alignments = reader.load_alignments_from_json(relation_alignment_file, amrs)

    amrs = [amr for amr in amrs if amr.id in subgraph_alignments]
    for amr in amrs:
        amr.alignments = subgraph_alignments[amr.id]
    # random.shuffle(amrs)
    # amrs = amrs[:100]
    # print('Sampling AMRs:')
    # for amr in amrs:
    #     print(amr.id)

    output_file = amr_file.replace('.txt','.gold.txt')
    with open(output_file, 'w+', encoding='utf8') as f:
        for amr in amrs:
            f.write(amr.jamr_string())

    output_file2 = output_file.replace('.txt','.html')
    ID_Display.style(amrs, output_file2)

    output_file3 = output_file.replace('.gold.txt','.gold_alignments.tsv')
    with open(output_file3, 'w+', encoding='utf8') as f:
        for amr in amrs:
            f.write('\t'.join(['amr',str(amr.id)])+'\n')
            reentrancies = []
            for n in amr.nodes:
                parents = [(s,r,t) for s,r,t in amr.edges if t==n]
                if len(parents)>1:
                    reentrancies.extend(parents)
            node_labels = get_node_labels(amr)
            edge_labels = get_edge_labels(amr)
            f.write('\t'.join(['tokens']+[f'{i}={token}' for i,token in enumerate(amr.tokens)])+'\n')
            for n in amr.nodes:
                nalign = amr.get_alignment(subgraph_alignments, node_id=n)
                if nalign:
                    token_ids = nalign.tokens
                    token_ids = ','.join(str(t) for t in token_ids)
                    f.write('\t'.join(['node',node_labels[n], amr.nodes[n], token_ids])+'\n')
                else:
                    f.write('\t'.join(['node', node_labels[n], amr.nodes[n], '']) + '\n')
            for s,r,t in amr.edges:
                ealign = amr.get_alignment(relation_alignments, edge=(s,r,t))
                if ealign:
                    token_ids = ealign.tokens
                    token_ids = ','.join(str(t) for t in token_ids)
                    f.write('\t'.join(['edge', edge_labels[(s,r,t)], f'{amr.nodes[s]} {r} {amr.nodes[t]}', token_ids])+'\n')
                else:
                    f.write('\t'.join(['edge', edge_labels[(s,r,t)], f'{amr.nodes[s]} {r} {amr.nodes[t]}', '']) + '\n')
            for s,r,t in reentrancies:
                f.write('\t'.join(['reentrancy', edge_labels[(s,r,t)], f'{amr.nodes[s]} {r} {amr.nodes[t]}']) + '\n')
Example #5
def main():
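    # Load AMRs and their JSON alignments, then render the first 5000 AMRs
    # to the output file with the module's style helper.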
    file = sys.argv[1]
    align_file = sys.argv[2]
    outfile = sys.argv[3]

    reader = AMR_Reader()
    amrs = reader.load(file, remove_wiki=True)
    alignments = reader.load_alignments_from_json(align_file, amrs)
    style(amrs[:5000], alignments, outfile)
Example #6
def main():
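    # Train a subgraph alignment model: each epoch re-aligns the training
    # AMRs, updates model parameters, and reports perplexity; if test data
    # is given, it is aligned and scored against gold alignments as well.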

    amr_file = args.train

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    add_nlp_data(amrs, amr_file)

    eval_amr_file, eval_amrs, gold_eval_alignments = None, None, None
    if args.test:
        eval_amr_file, eval_align_file = args.test
        eval_amrs = reader.load(eval_amr_file, remove_wiki=True)
        add_nlp_data(eval_amrs, eval_amr_file)
        gold_eval_alignments = load_from_json(eval_align_file,
                                              eval_amrs,
                                              unanonymize=True)
        eval_amr_ids = {amr.id for amr in eval_amrs}
        amrs = [amr for amr in amrs if amr.id not in eval_amr_ids]
    # amrs = amrs[:1000]

    if args.load_model:
        print('Loading model from:', args.load_model)
        align_model = Subgraph_Model.load_model(args.load_model)
    else:
        align_model = Subgraph_Model(amrs, align_duplicates=True)

    iters = args.iter

    alignments = None
    for i in range(iters):
        print(f'Epoch {i}: Training data')
        alignments = align_model.align_all(amrs)
        align_model.update_parameters(amrs, alignments)
        perplexity(align_model, amrs, alignments)
        report_progress(amr_file, alignments, reader, epoch=i)
        print()

        if eval_amrs:
            print(f'Epoch {i}: Evaluation data')
            eval_alignments = align_model.align_all(eval_amrs)
            perplexity(align_model, eval_amrs, eval_alignments)
            evaluate(eval_amrs, eval_alignments, gold_eval_alignments)
            evaluate_duplicates(eval_amrs, eval_alignments,
                                gold_eval_alignments)
            report_progress(eval_amr_file, eval_alignments, reader, epoch=i)
            print()

    report_progress(amr_file, alignments, reader)

    if args.save_model:
        align_model.save_model(args.save_model)
        print('Saving model to:', args.save_model)
Example #7
def main():
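    # Convert ISI alignments for the Szubert AMRs into token-level subgraph
    # and relation alignments, remapping node and edge ids from the ISI
    # graphs onto the reference AMRs, and save both alignment sets as JSON.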
    file = '../data/szubert/szubert_amrs.isi_alignments.txt'
    ids_file = '../data/szubert/szubert_ids.isi.txt'
    output = '../data/szubert/szubert_amrs.isi.txt'

    amr_file1 = '../data/ldc_train.txt'
    amr_file2 = '../data/szubert/szubert_amrs.txt'
    reader = AMR_Reader()
    amrs = reader.load(amr_file1, remove_wiki=True)
    szubert_amrs = reader.load(amr_file2, remove_wiki=True)
    szubert_amr_ids = [amr.id for amr in szubert_amrs]
    amrs += szubert_amrs
    amrs = {amr.id: amr for amr in amrs}

    amr_ids = []
    with open(ids_file, encoding='utf8') as f:
        for line in f:
            if line:
                amr_ids.append(line.strip())

    isi_amrs, isi_alignments = reader.load(file, output_alignments=True)

    subgraph_alignments = {}
    relation_alignments = {}
    for isi_amr in isi_amrs:
        if isi_amr.id not in szubert_amr_ids: continue
        amr = amrs[isi_amr.id]
        if len(amr.tokens) != len(isi_amr.tokens):
            raise Exception('Inconsistent Tokenization:', amr.id)
        node_labels = node_map(isi_amr, amr)
        edge_labels = edge_map(isi_amr, amr)
        isi_aligns = isi_alignments[amr.id]
        subgraph_alignments[amr.id] = []
        relation_alignments[amr.id] = []
        for i, tok in enumerate(amr.tokens):
            aligns = [align for align in isi_aligns if i in align.tokens]
            nodes = [node_labels[n] for align in aligns for n in align.nodes]
            edges = [edge_labels[e] for align in aligns for e in align.edges]
            subgraph_alignments[amr.id].append(
                AMR_Alignment(type='subgraph', tokens=[i], nodes=nodes))
            relation_alignments[amr.id].append(
                AMR_Alignment(type='relation', tokens=[i], edges=edges))
    reader.save_alignments_to_json(
        output.replace('.txt', '.subgraph_alignments.json'),
        subgraph_alignments)
    reader.save_alignments_to_json(
        output.replace('.txt', '.relation_alignments.json'),
        relation_alignments)

    for amr in szubert_amrs:
        if amr.id not in subgraph_alignments:
            raise Exception('Missing AMR:', amr.id)
Example #8
def main():
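    # Re-export the Szubert AMRs in JAMR format: an ::id line, an ::snt line
    # with @x@ tokens unescaped (e.g. @-@ -> -), and the graph string with
    # padded slashes.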
    file = 'data/szubert/szubert_amrs.txt'
    output = 'data/szubert/szubert_amrs.jamr.txt'

    reader = AMR_Reader()
    amrs = reader.load(file, remove_wiki=True)

    with open(output, 'w+', encoding='utf8') as f:
        for amr in amrs:
            f.write('# ::id '+amr.id+'\n')
            tokens = [t for t in amr.tokens]
            for i,t in enumerate(tokens):
                if t[0]=='@' and t[-1]=='@' and len(t)==3:
                    tokens[i] = t[1]
            f.write('# ::snt ' + ' '.join(tokens) + '\n')
            graph_string = amr.graph_string().replace('/',' / ')
            f.write(graph_string)
Example #9
def main():
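    # Write the training sentences as token|POS pairs, one sentence per
    # line, using the POS tags attached by add_nlp_data.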
    file = '../data/split/train.txt'
    file2 = '../data/train.sents.txt'

    reader = AMR_Reader()
    amrs = reader.load(file, remove_wiki=True)
    add_nlp_data(amrs, file)

    # amrs2 = reader.load('../data/split/test.txt', remove_wiki=True)
    # add_nlp_data(amrs2, '../data/split/test.txt')
    # amrs = amrs+amrs2

    with open(file2, 'w+', encoding='utf8') as f:
        for amr in amrs:
            for token, pos in zip(amr.tokens, amr.pos):
                f.write(f'{token}|{pos} ')
            f.write('\n')
Example #10
def load_szubert_data(amr_file):
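    # Rebuild the Szubert AMR set from its original sources (LDC splits and
    # Little Prince), fall back to the Szubert file itself for the rest, and
    # report duplicate and unmatched ids along the way.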

    # Szubert data
    reader = AMR_Reader()
    amrs1 = reader.load(amr_file, remove_wiki=True)
    amr_ids = [amr.id for amr in amrs1]
    for amr_id in amr_ids:
        if amr_ids.count(amr_id)>1:
            print('Repeated:', amr_id)

    # LDC data
    amrs2 = []
    amrs2 += reader.load('data/ldc_train.txt', remove_wiki=True)
    amrs2 += reader.load('data/ldc_dev.txt', remove_wiki=True)
    amrs2 += reader.load('data/ldc_test.txt', remove_wiki=True)
    amrs = [amr for amr in amrs2 if amr.id in amr_ids]
    ldc_ids = [amr.id for amr in amrs]

    # Little Prince data
    amrs3 = reader.load('data/little_prince.txt', remove_wiki=True)
    little_prince_ids = [amr.id for amr in amrs3 if amr.id in amr_ids]
    amrs += [amr for amr in amrs3 if amr.id in little_prince_ids]

    # other data
    other_ids = [amr_id for amr_id in amr_ids if amr_id not in ldc_ids and amr_id not in little_prince_ids]
    amrs += [amr for amr in amrs1 if amr.id in other_ids]

    print('Missing:', ' '.join(i for i in other_ids))
    print(len(amrs), '/', len(amrs1), 'AMRs printed')

    return amrs
Example #11
def main():
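    # Evaluate predicted relation alignments against gold: load predicted
    # and gold alignments (plus the corresponding subgraph alignment files)
    # and run evaluate() in 'edges' mode on the AMRs covered by both.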
    amr_file = sys.argv[1]
    align_file = sys.argv[2]
    gold_file = sys.argv[3]

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    add_nlp_data(amrs, amr_file)

    alignments = reader.load_alignments_from_json(align_file, amrs)
    gold_alignments = reader.load_alignments_from_json(gold_file, amrs)
    pred_subgraph_alignments = reader.load_alignments_from_json(
        align_file.replace('relation_', 'subgraph_'), amrs)
    gold_subgraph_alignments = reader.load_alignments_from_json(
        gold_file.replace('relation_', 'subgraph_'), amrs)

    # Display.style([amr for amr in amrs if amr.id in gold_alignments],
    #               gold_file.replace('.json', '') + f'.html',
    #               gold_alignments)

    if len(amrs) != len(alignments):
        amrs = [
            amr for amr in amrs
            if amr.id in alignments and amr.id in gold_alignments
        ]
    evaluate(amrs, alignments, gold_alignments, mode='edges')
Example #12
def load_data4():
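    # Load dev+test AMRs with their gold subgraph/relation/reentrancy
    # alignments and the gold CCG data, keep only AMRs listed in the CCG id
    # map, and return everything needed for evaluation.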
    amr_file1 = '../data/split/dev.txt'
    amr_file2 = '../data/split/test.txt'
    ccg_dependency_file = '../data/ccg/ccgbank_dependencies.gold.txt'
    ccgbank_file = '../data/ccg/ccgbank_parses.gold.txt'
    ids_file = '../data/ccg/ids_map_test.tsv'

    reader = AMR_Reader()
    amrs = reader.load(amr_file1, remove_wiki=True)
    add_nlp_data(amrs, amr_file1)
    amrs2 = reader.load(amr_file2, remove_wiki=True)
    add_nlp_data(amrs2, amr_file2)
    amrs += amrs2

    # gold data
    align_file = amr_file1.replace('.txt',
                                   '') + '.subgraph_alignments.gold.json'
    subgraph_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file1.replace('.txt',
                                   '') + '.relation_alignments.gold.json'
    relation_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file1.replace('.txt',
                                   '') + '.reentrancy_alignments.gold.json'
    reentrancy_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file2.replace('.txt',
                                   '') + '.subgraph_alignments.gold.json'
    subgraph_alignments.update(
        reader.load_alignments_from_json(align_file, amrs))
    align_file = amr_file2.replace('.txt',
                                   '') + '.relation_alignments.gold.json'
    relation_alignments.update(
        reader.load_alignments_from_json(align_file, amrs))
    align_file = amr_file2.replace('.txt',
                                   '') + '.reentrancy_alignments.gold.json'
    reentrancy_alignments.update(
        reader.load_alignments_from_json(align_file, amrs))

    ids, dependencies, ccg_lex, ccg_trees = load_gold_ccgs(
        ids_file, ccg_dependency_file, ccgbank_file)
    amrs = {amr.id: amr for amr in amrs}
    amrs = [amrs[amr_id] for amr_id in ids]
    return amrs, subgraph_alignments, relation_alignments, reentrancy_alignments, dependencies, ccg_lex, ccg_trees
Example #13
def load_data2():
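    # Load dev+test AMRs with gold alignments, then align EasySRL CCG
    # dependencies and CCGbank parses to the AMR sentences.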
    amr_file1 = '../data/split/dev.txt'
    amr_file2 = '../data/split/test.txt'
    ccg_dependency_file = '../data/test.ccg_dependencies.tsv'
    ccgbank_file = '../data/test.ccg_parse.txt'

    reader = AMR_Reader()
    amrs = reader.load(amr_file1, remove_wiki=True)
    add_nlp_data(amrs, amr_file1)
    amrs2 = reader.load(amr_file2, remove_wiki=True)
    add_nlp_data(amrs2, amr_file2)
    amrs += amrs2

    # gold data
    align_file = amr_file1.replace('.txt',
                                   '') + '.subgraph_alignments.gold.json'
    subgraph_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file1.replace('.txt',
                                   '') + '.relation_alignments.gold.json'
    relation_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file1.replace('.txt',
                                   '') + '.reentrancy_alignments.gold.json'
    reentrancy_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file2.replace('.txt',
                                   '') + '.subgraph_alignments.gold.json'
    subgraph_alignments.update(
        reader.load_alignments_from_json(align_file, amrs))
    align_file = amr_file2.replace('.txt',
                                   '') + '.relation_alignments.gold.json'
    relation_alignments.update(
        reader.load_alignments_from_json(align_file, amrs))
    align_file = amr_file2.replace('.txt',
                                   '') + '.reentrancy_alignments.gold.json'
    reentrancy_alignments.update(
        reader.load_alignments_from_json(align_file, amrs))

    sentences = [amr.tokens for amr in amrs]
    _, dependencies = align_dependencies_to_sentences(
        load_dependencies(ccg_dependency_file, flavor='easysrl'), sentences)
    _, ccg_lex, ccg_trees = align_ccgbank_to_sentences(
        load_ccgbank(ccgbank_file), sentences)

    return amrs, subgraph_alignments, relation_alignments, reentrancy_alignments, dependencies, ccg_lex, ccg_trees
Example #14
def main():
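    # Export deduplicated AMRs in the formats expected by the ISI aligner:
    # plain sentences with @x@ tokens unescaped, one-line graph strings, and
    # a parallel file of AMR ids.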
    file = 'data/szubert/szubert_amrs.txt'
    file2 = 'data/ldc_train.txt'

    output1 = 'data/szubert/szubert_sents.isi.txt'
    output2 = 'data/szubert/szubert_amrs.isi.txt'
    output3 = 'data/szubert/szubert_ids.isi.txt'

    reader = AMR_Reader()
    amrs = reader.load(file, remove_wiki=True)
    amrs += reader.load(file2, remove_wiki=True)
    unique_ids = set()
    amrs2 = []
    for amr in amrs:
        if amr.id not in unique_ids:
            unique_ids.add(amr.id)
            amrs2.append(amr)
    amrs = amrs2

    with open(output1, 'w+', encoding='utf8') as f:
        for amr in amrs:
            tokens = [t for t in amr.tokens]
            for i, t in enumerate(tokens):
                if t[0] == '@' and t[-1] == '@' and len(t) == 3:
                    tokens[i] = t[1]
            f.write(' '.join(tokens) + '\n')
    with open(output2, 'w+', encoding='utf8') as f:
        for amr in amrs:
            graph_string = amr.graph_string()\
                .replace('/', ' / ')\
                .replace('\n','')\
                .replace('\r','') \
                .replace('\t:', ' :') \
                .replace('\t','')
            f.write(graph_string+'\n')
    with open(output3, 'w+', encoding='utf8') as f:
        for amr in amrs:
            f.write(amr.id+'\n')
Example #15
def main():
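    # Convert JAMR-style alignments to this project's JSON format: mark each
    # alignment as a subgraph alignment, add singleton spans for any
    # unaligned tokens, and clean the result before saving.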
    amr_file = 'data/szubert/szubert_amrs.jamr_alignments.txt'
    # amr_file2 = 'data/szubert/szubert_amrs.txt'

    reader = AMR_Reader()
    amrs, alignments = reader.load(amr_file,
                                   remove_wiki=True,
                                   output_alignments=True)
    for amr in amrs:
        spans = set()
        taken = set()
        for align in alignments[amr.id]:
            align.type = 'subgraph'
            align.amr = amr
            spans.add(tuple(align.tokens))
            taken.update(align.tokens)
        for t in range(len(amr.tokens)):
            if t not in taken:
                spans.add((t, ))
        spans = [list(span) for span in sorted(spans, key=lambda x: x[0])]
        clean_alignments(amr, alignments, spans)

    reader.save_alignments_to_json(
        'data/szubert/szubert_amrs.jamr_alignments.json', alignments)
Example #16
def main():
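    # Run the full alignment pipeline on unaligned AMRs: predict subgraph,
    # then relation, then reentrancy alignments, feeding each stage's output
    # into the next model, and save each stage to JSON.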
    unaligned_amr_file = args.test

    reader = AMR_Reader()

    eval_amrs = reader.load(unaligned_amr_file, remove_wiki=True)
    add_nlp_data(eval_amrs, unaligned_amr_file)

    # subgraphs
    print(f'Loading model: {args.subgraph_model}')
    subgraph_model = Subgraph_Model.load_model(args.subgraph_model)

    sub_alignments = subgraph_model.align_all(eval_amrs)
    align_file = unaligned_amr_file.replace('.txt', '') + '.subgraph_alignments.json'
    print(f'Writing subgraph alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, sub_alignments)
    
    # relations
    print(f'Loading model: {args.relation_model}')
    rel_model = Relation_Model.load_model(args.relation_model)
    rel_model.subgraph_alignments = sub_alignments

    rel_alignments = rel_model.align_all(eval_amrs)
    align_file = unaligned_amr_file.replace('.txt', '') + '.relation_alignments.json'
    print(f'Writing relation alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, rel_alignments)

    # reentrancies
    print(f'Loading model: {args.reentrancy_model}')
    reent_model = Reentrancy_Model.load_model(args.reentrancy_model)
    reent_model.subgraph_alignments = sub_alignments
    reent_model.relation_alignments = rel_alignments

    reent_alignments = reent_model.align_all(eval_amrs)
    align_file = unaligned_amr_file.replace('.txt', '') + '.reentrancy_alignments.json'
    print(f'Writing reentrancy alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, reent_alignments)
Example #17
def main():
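    # Repackage subgraph/relation/reentrancy alignments under a new file
    # name, dropping empty alignments, then verify that every node and edge
    # is covered by exactly one alignment.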
    amr_file = sys.argv[1]
    output_file = sys.argv[2]

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)

    # subgraphs
    align_file = amr_file.replace('.txt', '') + '.subgraph_alignments.json'
    sub_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs:
        sub_aligns[amr.id] = [a for a in sub_aligns[amr.id] if a.nodes]
    align_file = output_file.replace('.txt', '') + '.subgraph_alignments.json'
    print('Writing subgraph alignments to:', align_file)
    write_to_json(align_file, sub_aligns, anonymize=True, amrs=amrs)

    # relations
    align_file = amr_file.replace('.txt', '') + '.relation_alignments.json'
    rel_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs:
        rel_aligns[amr.id] = [a for a in rel_aligns[amr.id] if a.edges]
    align_file = output_file.replace('.txt', '') + '.relation_alignments.json'
    print('Writing relation alignments to:', align_file)
    write_to_json(align_file, rel_aligns, anonymize=True, amrs=amrs)

    # reentrancies
    align_file = amr_file.replace('.txt', '') + '.reentrancy_alignments.json'
    reent_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs:
        reent_aligns[amr.id] = [a for a in reent_aligns[amr.id] if a.edges]
    align_file = output_file.replace('.txt',
                                     '') + '.reentrancy_alignments.json'
    print('Writing reentrancy alignments to:', align_file)
    write_to_json(align_file, reent_aligns, anonymize=True, amrs=amrs)

    for amr in amrs:
        for n in amr.nodes:
            n_aligned = [a for a in sub_aligns[amr.id] if n in a.nodes]
            if len(n_aligned) != 1:
                raise Exception('Bad node alignment', amr.id, n)
        for e in amr.edges:
            e_aligned = [a for a in sub_aligns[amr.id] if e in a.edges]+\
                        [a for a in rel_aligns[amr.id] if e in a.edges]
            if len(e_aligned) != 1:
                raise Exception('Bad edge alignment', amr.id, e)
Example #18
def load_data1():
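    # Load training AMRs with their predicted alignments, then align EasySRL
    # CCG dependencies and CCGbank parses to the AMR sentences.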
    amr_file = '../data/split/train.txt'
    ccg_dependency_file = '../data/train.ccg_dependencies.tsv'
    ccgbank_file = '../data/train.ccg_parse.txt'

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    add_nlp_data(amrs, amr_file)

    # predicted data
    align_file = amr_file.replace('.txt', '') + '.subgraph_alignments.json'
    subgraph_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file.replace('.txt', '') + '.relation_alignments.json'
    relation_alignments = reader.load_alignments_from_json(align_file, amrs)
    align_file = amr_file.replace('.txt', '') + '.reentrancy_alignments.json'
    reentrancy_alignments = reader.load_alignments_from_json(align_file, amrs)

    sentences = [amr.tokens for amr in amrs]
    _, dependencies = align_dependencies_to_sentences(
        load_dependencies(ccg_dependency_file, flavor='easysrl'), sentences)
    _, ccg_lex, ccg_trees = align_ccgbank_to_sentences(
        load_ccgbank(ccgbank_file), sentences)

    return amrs, subgraph_alignments, relation_alignments, reentrancy_alignments, dependencies, ccg_lex, ccg_trees
Example #19
def main():
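    # Train a relation alignment model on top of existing subgraph
    # alignments; on eval data, use predicted (or gold, if
    # USE_GOLD_SUBGRAPHS) subgraph spans and score each epoch against gold.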
    amr_file = args.train

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    add_nlp_data(amrs, amr_file)

    align_file = amr_file.replace('.txt', '') + '.subgraph_alignments.json'
    subgraph_alignments = reader.load_alignments_from_json(align_file, amrs)
    # amrs = amrs[:1000]

    eval_amr_file, eval_amrs, gold_eval_alignments = None, None, None
    if args.test:
        eval_amr_file, eval_align_file = args.test
        eval_amrs = reader.load(eval_amr_file, remove_wiki=True)
        add_nlp_data(eval_amrs, eval_amr_file)
        gold_eval_alignments = reader.load_alignments_from_json(
            eval_align_file, eval_amrs)
        eval_amr_ids = {amr.id for amr in eval_amrs}
        amrs = [amr for amr in amrs if amr.id not in eval_amr_ids]

        align_file = eval_amr_file.replace(
            '.txt', '') + '.subgraph_alignments.gold.json'
        gold_subgraph_alignments = reader.load_alignments_from_json(
            align_file, eval_amrs)
        align_file = eval_amr_file.replace('.txt',
                                           '') + '.subgraph_alignments.json'
        pred_subgraph_alignments = reader.load_alignments_from_json(
            align_file, eval_amrs)
        if USE_GOLD_SUBGRAPHS:
            pred_subgraph_alignments = gold_subgraph_alignments
        for amr_id in pred_subgraph_alignments:
            subgraph_alignments[amr_id] = pred_subgraph_alignments[amr_id]
        for amr in eval_amrs:
            spans = [
                align.tokens for align in pred_subgraph_alignments[amr.id]
                if align.type == 'subgraph'
            ]
            amr.spans = spans

    if args.load_model:
        print('Loading model from:', args.load_model)
        align_model = Relation_Model.load_model(args.load_model)
    else:
        align_model = Relation_Model(amrs, subgraph_alignments)

    iters = args.iter

    alignments = None
    for i in range(iters):
        print(f'Epoch {i}: Training data')
        alignments = align_model.align_all(amrs)
        align_model.update_parameters(amrs, alignments)
        report_progress(amr_file, alignments, reader, epoch=i)
        perplexity(align_model, amrs, alignments)
        print()

        if eval_amrs:
            print(f'Epoch {i}: Evaluation data')
            eval_alignments = align_model.align_all(eval_amrs)
            perplexity(align_model, eval_amrs, eval_alignments)
            evaluate_relations(eval_amrs, eval_alignments,
                               gold_eval_alignments, pred_subgraph_alignments,
                               gold_subgraph_alignments)
            report_progress(eval_amr_file, eval_alignments, reader, epoch=i)
            print()

    report_progress(amr_file, alignments, reader)

    if args.save_model:
        align_model.save_model(args.save_model)
        print('Saving model to:', args.save_model)
Example #20
def main():
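    # Migrate gold alignments from an old AMR release to a new one: detect
    # nodes whose ids changed by comparing concept labels and edge
    # neighborhoods, remap alignment node/edge ids accordingly, and verify
    # that every node and edge ends up aligned exactly once.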
    amr_file_old = sys.argv[1]
    amr_file_new = sys.argv[2]
    output_file = sys.argv[3]

    reader = AMR_Reader()
    amrs_old = reader.load(amr_file_old)
    amrs_new = reader.load(amr_file_new, remove_wiki=True)

    bad_node_map = {}
    for amr1 in amrs_new:
        amr2 = next(a for a in amrs_old if a.id == amr1.id)
        for n in amr1.nodes:
            amr1.nodes[n] = amr1.nodes[n].replace('"', '')
        for n in amr2.nodes:
            amr2.nodes[n] = amr2.nodes[n].replace('"', '')
        bad_nodes = []
        for n in amr1.nodes:
            if n not in amr2.nodes or amr1.nodes[n] != amr2.nodes[n]:
                bad_nodes.append(n)
                continue
            neighborhood = {
                f'{amr1.nodes[e[0]]} {e[1]} {amr1.nodes[e[2]]}'
                for e in amr1.edges if n in e
            }
            neighborhood2 = {
                f'{amr2.nodes[e[0]]} {e[1]} {amr2.nodes[e[2]]}'
                for e in amr2.edges if n in e
            }
            if neighborhood != neighborhood2:
                bad_nodes.append(n)
        if bad_nodes:
            bad_node_map[amr1.id] = {}
            for n in amr1.nodes:
                if n in bad_nodes:
                    new_n = [
                        n2 for n2 in amr2.nodes
                        if amr2.nodes[n2] == amr1.nodes[n]
                        and not (n2 in amr1.nodes
                                 and amr1.nodes[n2] == amr1.nodes[n])
                    ]
                    neighborhood = {
                        f'{amr1.nodes[e[0]]} {e[1]} {amr1.nodes[e[2]]}'
                        for e in amr1.edges if n in e
                    }
                    if len(new_n) > 1:
                        neighborhood2 = {
                            n2: [
                                f'{amr2.nodes[e[0]]} {e[1]} {amr2.nodes[e[2]]}'
                                for e in amr2.edges if n2 in e
                            ]
                            for n2 in new_n
                        }
                        new_n = [
                            n2 for n2 in new_n
                            if neighborhood == set(neighborhood2[n2])
                        ]
                    if len(new_n) == 1:
                        bad_node_map[amr1.id][new_n[0]] = n
                    else:
                        raise Exception('Bad node match', amr1.id, n)
                else:
                    bad_node_map[amr1.id][n] = n
    for amr1 in amrs_new:
        amr2 = next(a for a in amrs_old if a.id == amr1.id)
        for n2 in amr2.nodes:
            n = bad_node_map[amr1.id][n2] if amr1.id in bad_node_map else n2
            if amr1.nodes[n] != amr2.nodes[n2]:
                raise Exception('Bad node match', amr1.id, n)
        for e in amr2.edges:
            s2, r, t2 = e
            s = bad_node_map[amr1.id][s2] if amr1.id in bad_node_map else s2
            t = bad_node_map[amr1.id][t2] if amr1.id in bad_node_map else t2
            if (s, r, t) not in amr1.edges:
                raise Exception('Bad edge match', amr1.id, e, amr1.nodes[s],
                                r, amr1.nodes[t])

    print('Node id fixes:', len(bad_node_map), ' '.join(bad_node_map.keys()))
    # subgraphs
    align_file = amr_file_old.replace('.txt',
                                      '') + '.subgraph_alignments.gold.json'
    sub_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs_new:
        for align in sub_aligns[amr.id]:
            if amr.id in bad_node_map:
                align.nodes = [bad_node_map[amr.id][n] for n in align.nodes]
        for align in sub_aligns[amr.id]:
            if len(align.nodes) > 1:
                for s, r, t in amr.edges:
                    if s in align.nodes and t in align.nodes:
                        align.edges.append((s, r, t))
        sub_aligns[amr.id] = [a for a in sub_aligns[amr.id] if a.nodes]
    align_file = output_file.replace('.txt',
                                     '') + '.subgraph_alignments.gold.json'
    print('Writing subgraph alignments to:', align_file)
    write_to_json(align_file, sub_aligns, anonymize=True, amrs=amrs_new)

    # relations
    align_file = amr_file_old.replace('.txt',
                                      '') + '.relation_alignments.gold.json'
    rel_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs_new:
        for align in rel_aligns[amr.id]:
            if amr.id in bad_node_map:
                align.edges = [(bad_node_map[amr.id][s], r,
                                bad_node_map[amr.id][t])
                               for s, r, t in align.edges]
        for align in rel_aligns[amr.id]:
            sub_align = amr.get_alignment(sub_aligns, token_id=align.tokens[0])
            if sub_align.nodes:
                align.edges = [
                    e for e in align.edges if not (
                        e[0] in sub_align.nodes and e[-1] in sub_align.nodes)
                ]
        rel_aligns[amr.id] = [a for a in rel_aligns[amr.id] if a.edges]
    align_file = output_file.replace('.txt',
                                     '') + '.relation_alignments.gold.json'
    print('Writing relation alignments to:', align_file)
    write_to_json(align_file, rel_aligns, anonymize=True, amrs=amrs_new)

    # reentrancies
    align_file = amr_file_old.replace('.txt',
                                      '') + '.reentrancy_alignments.gold.json'
    reent_aligns = reader.load_alignments_from_json(align_file)
    for amr in amrs_new:
        reent_aligns[amr.id] = [a for a in reent_aligns[amr.id] if a.edges]
        for align in reent_aligns[amr.id]:
            if amr.id in bad_node_map:
                align.edges = [(bad_node_map[amr.id][s], r,
                                bad_node_map[amr.id][t])
                               for s, r, t in align.edges]
    align_file = output_file.replace('.txt',
                                     '') + '.reentrancy_alignments.gold.json'
    print('Writing reentrancy alignments to:', align_file)
    write_to_json(align_file, reent_aligns, anonymize=True, amrs=amrs_new)

    for amr in amrs_new:
        for n in amr.nodes:
            n_aligned = [a for a in sub_aligns[amr.id] if n in a.nodes]
            if len(n_aligned) != 1:
                raise Exception('Bad node alignment', amr.id, n)
        for e in amr.edges:
            e_aligned = [a for a in sub_aligns[amr.id] if e in a.edges]+\
                        [a for a in rel_aligns[amr.id] if e in a.edges]
            if len(e_aligned) != 1:
                raise Exception('Bad edge alignment', amr.id, e)
Example #21
def main():
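    # Tag each AMR sentence with Stanza and project lemmas and POS tags back
    # onto the original tokenization via character offsets, saving the
    # results as .lemmas.json and .pos.json sidecar files.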
    amr_file = sys.argv[1]
    # output_file = sys.argv[2]

    # stanza.download('en')
    nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)

    lemmas_json = {}
    pos_json = {}

    enum_amrs = list(enumerate(amrs))
    for amr_idx, amr in tqdm(enum_amrs):
        tokens = amr.tokens.copy()
        for i, tok in enumerate(tokens):
            if tok.startswith('@') and tok.endswith('@') and len(tok) == 3:
                tokens[i] = tok[1]
        doc = nlp(' '.join(tokens))
        start_idx = {}
        end_idx = {}
        i = 0
        for j, tok in enumerate(tokens):
            start_idx[j] = i
            end_idx[j] = i + len(tok)
            i += len(tok) + 1

        convert_ids = {}
        stanza_lemmas = {}
        stanza_pos = {}
        for s in doc.sentences:
            for token in s.tokens:
                start = token.start_char
                end = token.end_char
                idx = [k for k in start_idx if start >= start_idx[k] and end <= end_idx[k]]
                if len(idx) == 0:
                    idx = [k for k in start_idx if start <= start_idx[k] <= end]
                idx = idx[0]
                convert_ids[start] = idx
                for word in token.words:
                    if start not in stanza_lemmas:
                        stanza_lemmas[start] = ''
                    lemma = word.lemma
                    stanza_lemmas[start] += lemma
                    stanza_pos[start] = word.xpos

        lemmas = ['' for _ in amr.tokens]
        pos = ['' for _ in amr.tokens]
        for i in stanza_lemmas:
            lemmas[convert_ids[i]] += stanza_lemmas[i]
            pos[convert_ids[i]] = stanza_pos[i]
        for i, l in enumerate(lemmas):
            if not l and i > 0:
                lemmas[i] = lemmas[i - 1]
                pos[i] = pos[i - 1]

        lemmas_json[amr.id] = lemmas
        pos_json[amr.id] = pos

    filename = amr_file.replace('.txt', '')
    with open(filename + '.lemmas.json', 'w+', encoding='utf8') as f:
        json.dump(lemmas_json, f)
    with open(filename + '.pos.json', 'w+', encoding='utf8') as f:
        json.dump(pos_json, f)
Example #22
def main():
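    # Build the train/dev/test split (refusing to overwrite an existing
    # one): reserve the Szubert AMRs for test, sample 50 Little Prince AMRs
    # into dev, write the id lists, and save the corresponding AMR files.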

    split_file = Path('../data/split/train_ids.txt')
    if split_file.is_file():
        raise Exception(
            'Cannot create Train, Dev, Test split because split already exists:',
            str(split_file.resolve()))

    reader = AMR_Reader()

    ldc_train = '../data/ldc_train.txt'
    ldc_train = reader.load(ldc_train, remove_wiki=True)
    ldc_dev = '../data/ldc_dev.txt'
    ldc_dev = reader.load(ldc_dev, remove_wiki=True)
    ldc_test = '../data/ldc_test.txt'
    ldc_test = reader.load(ldc_test, remove_wiki=True)

    little_prince = '../data/little_prince.txt'
    little_prince = reader.load(little_prince, remove_wiki=True)
    szubert = '../data/szubert/szubert_amrs.txt'
    szubert = reader.load(szubert, remove_wiki=True)
    gold_dev = '../data/gold_dev/ldc_dev.gold.txt'
    gold_dev = reader.load(gold_dev, remove_wiki=True)

    szubert_ids = [amr.id for amr in szubert]
    train_ids = [amr.id for amr in ldc_train if amr.id not in szubert_ids]
    little_prince_ids = [
        amr.id for amr in little_prince if amr.id not in szubert_ids
    ]
    gold_dev_ids = [amr.id for amr in gold_dev if amr.id not in szubert_ids]

    shuffle(little_prince_ids)
    sample = little_prince_ids[:50]

    new_train_ids = train_ids + [
        n for n in little_prince_ids if n not in sample
    ]
    new_dev_ids = gold_dev_ids + sample
    new_test_ids = szubert_ids

    little_prince1 = [
        amr.id for amr in little_prince if amr.id in new_train_ids
    ]
    little_prince2 = [amr.id for amr in little_prince if amr.id in new_dev_ids]
    little_prince3 = [
        amr.id for amr in little_prince if amr.id in new_test_ids
    ]
    print('Split up little prince:', len(little_prince1), len(little_prince2),
          len(little_prince3))

    with open('../data/split/train_ids.txt', 'w+') as f:
        f.write(f'# {len(new_train_ids)} train AMRs\n')
        for n in sorted(new_train_ids):
            f.write(n + '\n')

    with open('../data/split/dev_ids.txt', 'w+') as f:
        f.write(f'# {len(new_dev_ids)} dev AMRs\n')
        for n in sorted(new_dev_ids):
            f.write(n + '\n')

    with open('../data/split/test_ids.txt', 'w+') as f:
        f.write(f'# {len(new_test_ids)} test AMRs\n')
        for n in sorted(new_test_ids):
            f.write(n + '\n')

    train_amrs = {amr.id: amr for amr in ldc_train + little_prince}
    reader.write_to_file('../data/split/train.txt',
                         [train_amrs[n] for n in sorted(new_train_ids)])
    dev_amrs = {amr.id: amr for amr in ldc_dev + little_prince}
    reader.write_to_file('../data/split/dev.txt',
                         [dev_amrs[n] for n in sorted(new_dev_ids)])
    test_amrs = {amr.id: amr for amr in szubert}
    reader.write_to_file('../data/split/test.txt',
                         [test_amrs[n] for n in sorted(new_test_ids)])
Example #23
File: nlp_data.py Project: ablodge/leamr
def main():
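    # Full NLP preprocessing for an AMR file: project Stanza lemmas, POS
    # tags, and NER spans onto the AMR tokenization, detect multiword
    # expressions and time expressions (optionally guided by the gold AMR),
    # optionally run coreference, and save everything as JSON sidecar files.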
    amr_file = sys.argv[1]
    # output_file = sys.argv[2]

    # stanza.download('en')
    nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,ner')

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)

    lemmas_json = {}
    pos_json = {}
    ner_spans = {}
    mwe_spans = {}
    multi_word_spans = {}
    coreferences = {}

    mwe_types = get_mwe_types_by_first_token()
    coref_parser = None
    try:
        coref_parser = get_coref_parser()
    except Exception as e:
        print(
            'Warning: Failed to parse coreference. '
            'Please install neuralcoref from source: https://github.com/huggingface/neuralcoref#install-neuralcoref-from-source',
            file=sys.stderr)

    enum_amrs = list(enumerate(amrs))
    for amr_idx, amr in tqdm(enum_amrs):
        tokens = amr.tokens.copy()
        for i, tok in enumerate(tokens):
            if tok.startswith('@') and tok.endswith('@') and len(tok) == 3:
                tokens[i] = tok[1]
        doc = nlp(' '.join(tokens))
        start_idx = {}
        end_idx = {}
        i = 0
        for j, tok in enumerate(tokens):
            start_idx[j] = i
            end_idx[j] = i + len(tok)
            i += len(tok) + 1

        convert_ids = {}
        stanza_lemmas = {}
        stanza_entity_type = []
        stanza_entity_spans = []
        stanza_pos = {}
        for s in doc.sentences:
            for token in s.tokens:
                start = token.start_char
                end = token.end_char
                idx = [
                    k for k in start_idx
                    if start >= start_idx[k] and end <= end_idx[k]
                ]
                if len(idx) == 0:
                    idx = [
                        k for k in start_idx if start <= start_idx[k] <= end
                    ]
                idx = idx[0]
                convert_ids[start] = idx
                for word in token.words:
                    if start not in stanza_lemmas:
                        stanza_lemmas[start] = ''
                    lemma = word.lemma
                    stanza_lemmas[start] += lemma
                    stanza_pos[start] = word.xpos
            for e in s.entities:
                stanza_entity_type.append(e.type)
                ent_type = e.type
                span = []
                for t in e.tokens:
                    start = t.start_char
                    span.append(start)
                # name = ' '.join(amr.tokens[convert_ids[t]] for t in span)
                # type = e.type
                pos = [stanza_pos[t] for t in span]
                if pos[0] in [
                        'DT', 'PDT', 'PRP$', 'RB', 'RP', 'JJ', 'JJR', 'JJS',
                        'IN'
                ]:
                    while pos and pos[0] in [
                            'DT', 'PDT', 'PRP$', 'RB', 'RP', 'JJ', 'JJR',
                            'JJS', 'IN'
                    ]:
                        pos = pos[1:]
                        span = span[1:]
                    if len(span) == 0:
                        stanza_entity_type.pop()
                        continue
                if pos and pos[-1] in ['POS', 'RB', 'RBR', 'RBS']:
                    span = span[:-1]
                    if len(span) == 0:
                        stanza_entity_type.pop()
                        continue
                # next_tok = convert_ids[span[-1]]
                # next_tok = [s for s,t in convert_ids.items() if t==next_tok+1]
                # prev_tok = convert_ids[span[0]]
                # prev_tok = [s for s, t in convert_ids.items() if t == prev_tok - 1]
                # if next_tok:
                #     next_tok = next_tok[0]
                #     next_pos = stanza_pos[next_tok]
                #     if next_pos == 'NNP':
                #         span.append(next_tok)
                # if prev_tok:
                #     prev_tok = prev_tok[0]
                #     prev_pos = stanza_pos[prev_tok]
                #     if prev_pos == 'NNP':
                #         span.insert(0,prev_tok)
                # if amr.id=='bolt12_10494_3592.5':
                #     print()
                if len(span) == 1:
                    stanza_entity_type.pop()
                    continue
                stanza_entity_spans.append(span)
                # if ent_type in ['DATE','TIME','MONEY','QUANTITY']:
                #     print()
                #     print(ent_type, ' '.join(amr.tokens[convert_ids[i]] for i in span))
                #     print()

        lemmas = ['' for _ in amr.tokens]
        pos = ['' for _ in amr.tokens]
        for i in stanza_lemmas:
            lemmas[convert_ids[i]] += stanza_lemmas[i]
            pos[convert_ids[i]] = stanza_pos[i]
        for i, l in enumerate(lemmas):
            if not l and i > 0:
                lemmas[i] = lemmas[i - 1]
                pos[i] = pos[i - 1]
        entities = []
        for span in stanza_entity_spans:
            span = [convert_ids[i] for i in span]
            start = min(span)
            end = max(span) + 1
            entities.append((start, end))
        lemmas_json[amr.id] = lemmas
        pos_json[amr.id] = pos
        ner_spans[amr.id] = entities

        # get MWE spans
        mwe_spans[amr.id] = []
        taken = []
        for i, token in enumerate(amr.tokens):
            if i in taken: continue
            found = False
            token = token.lower()
            lemma = lemmas[i].lower()
            if token in mwe_types:
                for mwe in mwe_types[token]:
                    size = len(mwe)
                    if i + size - 1 >= len(amr.tokens): continue
                    if all(amr.tokens[i +
                                      idx].lower().replace('@', '') == mwe[idx]
                           for idx in range(size)):
                        span = (i, i + size)
                        mwe_spans[amr.id].append(span)
                        for t in range(span[0], span[-1]):
                            taken.append(t)
                        found = True
                        break
            if found: continue
            if lemma in mwe_types:
                for mwe in mwe_types[lemma]:
                    size = len(mwe)
                    if i + size - 1 >= len(amr.tokens): continue
                    if all(lemmas[i + idx].lower().replace('@', '') == mwe[idx]
                           for idx in range(size)):
                        span = (i, i + size)
                        mwe_spans[amr.id].append(span)
                        for t in range(span[0], span[-1]):
                            taken.append(t)
                        break
            taken.append(i)

        # look for names matching gold amr
        name_spans = []
        if SEE_GOLD_AMR:
            for n in amr.nodes:
                if amr.nodes[n] == 'name':
                    parts = [(int(r[3:]), t) for s, r, t in amr.edges
                             if s == n and r.startswith(':op')]
                    parts = [t for r, t in sorted(parts, key=lambda x: x[0])]
                    label = ' '.join(amr.nodes[t].replace('"', '')
                                     for t in parts)
                    name_type = [
                        s for s, r, t in amr.edges if t == n and r == ':name'
                    ]
                    name_type = amr.nodes[name_type[0]] if name_type else None
                    if parts:
                        for start in range(len(amr.tokens)):
                            span = [
                                t for t in range(start, start + len(parts))
                            ]
                            if span[-1] >= len(amr.tokens): break
                            tokens = [amr.tokens[t] for t in span]
                            token_label = ' '.join(
                                [tok for tok in tokens if tok != '"'])
                            if token_label.lower() == label.lower():
                                next_tok = span[-1] + 1
                                if next_tok < len(amr.tokens) and amr.tokens[
                                        next_tok] == name_type:
                                    span += [next_tok]
                                if len(parts) > 1:
                                    name_spans.append((span[0], span[-1] + 1))
                                start = span[0]
                                end = span[-1] + 1
                                for span in ner_spans[amr.id][:]:
                                    if span[0] <= start < span[1] and span[
                                            0] < end <= span[1] and (
                                                start, end) != span:
                                        ner_spans[amr.id].remove(span)
                                        break
                                break
            for t in range(len(amr.tokens)):
                if t + 2 < len(amr.tokens) and amr.tokens[t + 1] == '@-@':
                    label1 = f'{lemmas[t]}{lemmas[t + 2]}'.lower(
                    )[:len(lemmas[t]) + 4]
                    label2 = f'{lemmas[t]}-{lemmas[t + 2]}'.lower(
                    )[:len(lemmas[t]) + 5]
                    if any(amr.nodes[n].startswith(label1)
                           or amr.nodes[n].startswith(label2)
                           for n in amr.nodes):
                        name_spans.append((t, t + 3))
        # times
        taken = set()
        for t in range(len(amr.tokens)):
            if t in taken: continue
            start = t
            if amr.tokens[t].isdigit() and len(
                    amr.tokens[t]) <= 2 and t + 2 < len(amr.tokens):
                if amr.tokens[t + 1] in [
                        '@:@', ':'
                ] and amr.tokens[t + 2].isdigit() and len(
                        amr.tokens[t + 2]) == 2:
                    end = t + 2
                    while end + 1 < len(amr.tokens) \
                            and (amr.tokens[end + 1] in ['am', 'pm', 'a.m.', 'p.m.', '@:@', ':', 'UTC', 'GMT', 'EST',
                                                         'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                                                         'Friday', 'Saturday', ]
                                 or (amr.tokens[end] in ['@:@', ':'] and amr.tokens[end + 1].isdigit() and len(
                                amr.tokens[end + 1]) == 2)):
                        end += 1
                    end += 1
                    time = ' '.join(amr.tokens[t] for t in range(start, end))
                    name_spans.append((start, end))
                    for span in ner_spans[amr.id]:
                        if start < span[1] < end and span[0] < start:
                            name_spans[-1] = (span[0], end)
                            break
                        elif start < span[0] < end and span[1] > end:
                            name_spans[-1] = (start, span[1])
                            break
                        elif span[0] <= start < span[1] and span[
                                0] < end <= span[1]:
                            name_spans[-1] = span
                            break
                    start, end = name_spans[-1]
                    for i in range(start, end):
                        taken.add(i)
        multi_word_spans[amr.id] = []
        taken = set()
        for i, tok in enumerate(amr.tokens):
            if i in taken: continue
            if any(i == span[0] for span in name_spans):
                span = [s for s in name_spans if s[0] <= i < s[1]][0]
                span = [i for i in range(span[0], span[1])]
                multi_word_spans[amr.id].append(span)
                taken.update(span)
            elif any(i == span[0] for span in ner_spans[amr.id]):
                span = [s for s in ner_spans[amr.id] if s[0] <= i < s[1]][0]
                span = [i for i in range(span[0], span[1])]
                multi_word_spans[amr.id].append(span)
                taken.update(span)
            elif any(i == span[0] for span in mwe_spans[amr.id]):
                span = [s for s in mwe_spans[amr.id] if s[0] <= i < s[1]][0]
                span = [i for i in range(span[0], span[1])]
                multi_word_spans[amr.id].append(span)
                taken.update(span)
            else:
                multi_word_spans[amr.id].append([i])
                taken.add(i)
        if coref_parser is not None:
            corefs = get_corefs(amr, coref_parser)
            coreferences[amr.id] = corefs

    # ner_spans = {k: v for k, v in ner_spans.items() if v}
    # mwe_spans = {k: v for k, v in mwe_spans.items() if v}

    filename = amr_file.replace('.txt', '')
    with open(filename + '.lemmas.json', 'w+', encoding='utf8') as f:
        json.dump(lemmas_json, f)
    with open(filename + '.pos.json', 'w+', encoding='utf8') as f:
        json.dump(pos_json, f)
    with open(filename + '.spans.json', 'w+', encoding='utf8') as f:
        json.dump(multi_word_spans, f)
    if coreferences:
        with open(filename + '.coref.json', 'w+', encoding='utf8') as f:
            json.dump(coreferences, f)
Example #24
def main():
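    # Parse hand-annotated alignments from a TSV whose rows are tagged
    # amr/node/edge/reentrancy into subgraph, relation, and reentrancy
    # alignment objects, validating spans and annotations against each AMR.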
    amr_file = sys.argv[1]
    hand_alignments_file = sys.argv[2]

    reader = AMR_Reader()
    amrs = reader.load(amr_file, remove_wiki=True)
    amrs = {amr.id: amr for amr in amrs}

    subgraph_alignments = {}
    relation_alignments = {}
    reentrancy_alignments = {}
    all_spans = {amr_id: set() for amr_id in amrs}

    amr = None
    node_labels = {}
    with open(hand_alignments_file, encoding='utf8') as f:
        hand_alignments = csv.reader(f, delimiter="\t")
        for row in hand_alignments:
            if row[0] == 'amr':
                amr_id = row[1]
                subgraph_alignments[amr_id] = []
                relation_alignments[amr_id] = []
                reentrancy_alignments[amr_id] = []
                amr = amrs[amr_id]
                taken = set()
                node_labels = get_node_labels(amr)
                node_labels = {v: k for k, v in node_labels.items()}
                edge_labels = get_edge_labels(amr)
                edge_labels = {v: k for k, v in edge_labels.items()}
            elif row[0] == 'node':
                type = 'subgraph'
                if row[3].startswith('*'):
                    type = 'dupl-subgraph'
                    row[3] = row[3].replace('*', '')
                if not row[3]:
                    raise Exception('Missing Annotation:', amr_id)
                node_id = row[1]
                if node_id not in node_labels:
                    raise Exception('Failed to parse node labels:', amr.id,
                                    node_id)
                n = node_labels[node_id]
                token_ids = [int(t) for t in row[3].split(',')]
                if any(t >= len(amr.tokens) for t in token_ids):
                    raise Exception('Bad Annotation:', amr_id)
                if tuple(token_ids) not in all_spans[amr_id] and any(
                        t in taken for t in token_ids):
                    raise Exception('Bad Span Annotation', amr_id)
                all_spans[amr_id].add(tuple(token_ids))
                taken.update(token_ids)
                align = amr.get_alignment(subgraph_alignments,
                                          token_id=token_ids[0])
                if align and align.type == type:
                    align.nodes.append(n)
                else:
                    new_align = AMR_Alignment(type=type,
                                              tokens=token_ids,
                                              nodes=[n],
                                              amr=amr)
                    subgraph_alignments[amr.id].append(new_align)
            elif row[0] == 'edge':
                type = 'relation'
                if row[3].startswith('*'):
                    row[3] = row[3].replace('*', '')
                if not row[3]:
                    raise Exception('Missing Annotation:', amr_id)
                edge_id = row[1]
                if edge_id not in edge_labels:
                    raise Exception('Failed to parse edge labels:', amr.id,
                                    edge_id)
                e = edge_labels[edge_id]
                token_ids = [int(t) for t in row[3].split(',')]
                if any(t >= len(amr.tokens) for t in token_ids):
                    raise Exception('Bad Annotation:', amr_id)
                if tuple(token_ids) not in all_spans[amr_id] and any(
                        t in taken for t in token_ids):
                    raise Exception('Bad Span Annotation', amr_id, token_ids)
                all_spans[amr_id].add(tuple(token_ids))
                taken.update(token_ids)
                align = amr.get_alignment(relation_alignments,
                                          token_id=token_ids[0])
                if align and align.type == type:
                    align.edges.append(e)
                else:
                    new_align = AMR_Alignment(type=type,
                                              tokens=token_ids,
                                              edges=[e],
                                              amr=amr)
                    relation_alignments[amr.id].append(new_align)
            elif row[0] == 'reentrancy':
                if not row[3]:
                    raise Exception('Missing Annotation:', amr_id)
                edge_id = row[1]
                e = edge_labels[edge_id]
                if row[3].startswith('*'):
                    row[3] = row[3].replace('*', '')
                if row[3] == '_':
                    token_ids = amr.get_alignment(relation_alignments,
                                                  edge=e).tokens
                else:
                    token_ids = [int(t) for t in row[3].split(',')]
                tag = row[4]
                if row[3] == '_':
                    tag = 'primary'
                if not tag:
                    raise Exception('Missing reentrancy tag:', amr.id)
                type = f'reentrancy:{tag}'
                if any(t >= len(amr.tokens) for t in token_ids):
                    raise Exception('Bad Annotation:', amr_id)
                if tuple(token_ids) not in all_spans[amr_id] and any(
                        t in taken for t in token_ids):
                    raise Exception('Bad Span Annotation', amr_id, token_ids)
                all_spans[amr_id].add(tuple(token_ids))
                taken.update(token_ids)
                new_align = AMR_Alignment(type=type,
                                          tokens=token_ids,
                                          edges=[e],
                                          amr=amr)
                reentrancy_alignments[amr.id].append(new_align)
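    # Validation pass: ensure every token falls in exactly one span, warn on
    # subgraph alignments that are not connected subgraphs, and reject relation
    # alignments whose span has no subgraph alignment sharing a node.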
    for amr_id in subgraph_alignments:
        amr = amrs[amr_id]
        for t in range(len(amr.tokens)):
            if not any(t in span for span in all_spans[amr_id]):
                all_spans[amr_id].add((t, ))
        spans = [
            list(span)
            for span in sorted(all_spans[amr_id], key=lambda x: x[0])
        ]

        for align in subgraph_alignments[amr_id]:
            if align.nodes and not is_subgraph(amr, align.nodes):
                print('Possible Bad align:',
                      amr.id,
                      align.tokens,
                      ' '.join(amr.tokens[t] for t in align.tokens),
                      file=sys.stderr)
        for align in relation_alignments[amr_id]:
            subgraph_aligns = [
                a for a in subgraph_alignments[amr.id]
                if a.tokens == align.tokens
            ]
            for s, r, t in align.edges:
                if subgraph_aligns and not any(
                        s in a.nodes or t in a.nodes or not a.nodes
                        for a in subgraph_aligns):
                    if r == ':manner' and amr.tokens[
                            align.tokens[0]] == 'without':
                        continue
                    raise Exception('Bad Relation align:', amr.id,
                                    align.tokens, s, r, t)
        dupl_sub_aligns = [
            align for align in subgraph_alignments[amr_id]
            if align.type.startswith('dupl')
        ]
        subgraph_alignments[amr_id] = [
            align for align in subgraph_alignments[amr_id]
            if not align.type.startswith('dupl')
        ]
        # dupl_rel_aligns = [align for align in relation_alignments[amr_id] if align.type.startswith('dupl')]
        # relation_alignments[amr_id] = [align for align in relation_alignments[amr_id] if not align.type.startswith('dupl')]
        clean_alignments(amr, subgraph_alignments, dupl_sub_aligns, spans)
        clean_alignments(amr, relation_alignments, [], spans, mode='relations')
        for t, _ in enumerate(amr.tokens):
            count = [span for span in spans if t in span]
            if len(count) != 1:
                raise Exception('Bad Span:', amr.id, count)

    # amr_file = amr_file.replace('.txt', '.jakob')
    align_file = amr_file.replace('.txt', '') + '.subgraph_alignments.gold.json'
    print(f'Writing subgraph alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, subgraph_alignments)

    align_file = amr_file.replace('.txt', '') + '.relation_alignments.gold.json'
    print(f'Writing relation alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, relation_alignments)

    align_file = amr_file.replace('.txt', '') + '.reentrancy_alignments.gold.json'
    print(f'Writing reentrancy alignments to: {align_file}')
    reader.save_alignments_to_json(align_file, reentrancy_alignments)
Example #25
import os

from amr_utils.amr_readers import AMR_Reader
# Assumed import path for AMR_Alignment (amr_utils); not shown in the snippet.
from amr_utils.alignments import AMR_Alignment

def main():
    dir = '../data/tamr'
    szubert_amrs = '../data/szubert/szubert_amrs.txt'
    output = '../data/szubert/szubert_amrs.tamr.subgraph_alignments.json'

    file2 = '../data/tamr/ldc_train_2017.txt'

    reader = AMR_Reader()
    amrs = reader.load(szubert_amrs, remove_wiki=True)
    amrs2 = reader.load(file2, remove_wiki=True)

    alignments = {}
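    # Assumed TAMR alignment file layout (inferred from this parser): each AMR
    # id appears on its own line, followed by a '# ::alignments' line of
    # space-separated 'start-end|node+node+...' items over token ranges.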
    for filename in os.listdir(dir):
        if filename.endswith(".tamr_alignment"):
            file = os.path.join(dir, filename)
            amr_id = ''
            with open(file) as f:
                for line in f:
                    if line.startswith('# ::alignments'):
                        aligns = line[len('# ::alignments '):].split()
                        aligns = [s.split('|') for s in aligns if '|' in s]
                        aligns = [(a[0], a[1].split('+')) for a in aligns]
                        for span, nodes in aligns:
                            start = int(span.split('-')[0])
                            end = int(span.split('-')[1])
                            span = list(range(start, end))
                            align = AMR_Alignment(type='subgraph',
                                                  tokens=span,
                                                  nodes=nodes)
                            alignments[amr_id].append(align)

                    elif line.strip():
                        amr_id = line.strip()
                        alignments[amr_id] = []

    amrs2 = {amr.id: amr for amr in amrs2}
    amrs = [amr for amr in amrs if amr.id in alignments and amr.id in amrs2]
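    # amrs3 gathers AMRs whose node and edge label sets agree between the two
    # loads; note that the code below keys off `amrs`, so this stricter check
    # does not further restrict the output.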
    amrs3 = []
    for amr in amrs[:]:
        amr2 = amrs2[amr.id]
        nodes = {amr.nodes[n] for n in amr.nodes}
        nodes2 = {amr2.nodes[n] for n in amr2.nodes}
        edges = {(amr.nodes[s], r, amr.nodes[t]) for s, r, t in amr.edges}
        edges2 = {(amr2.nodes[s], r, amr2.nodes[t]) for s, r, t in amr2.edges}
        if nodes == nodes2 and edges == edges2:
            amrs3.append(amr)

    amr_ids = {amr.id for amr in amrs}
    alignments = {
        amr_id: aligns
        for amr_id, aligns in alignments.items() if amr_id in amr_ids
    }
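    # The loops below appear to convert TAMR's zero-based dotted node ids
    # (e.g. '0.0.1') to the one-based ids used by amr_utils (e.g. '1.1.2');
    # nodes with no direct match fall back to unclaimed children of the mapped
    # parent, and AMRs that cannot be fully remapped are dropped.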
    for amr in amrs:
        node_map = {}
        nodes = [n for align in alignments[amr.id] for n in align.nodes]
        nodes = sorted(nodes, key=lambda x: (len(x), x))
        for n in nodes:
            prefix = '.'.join(n.split('.')[:-1])
            last = int(n.split('.')[-1])
            if prefix:
                if prefix not in node_map:
                    new_prefix = '.'.join(
                        str(int(i) + 1) for i in n.split('.')[:-1])
                    if new_prefix not in amr.nodes:
                        continue
                    node_map[prefix] = new_prefix
                new_n = node_map[prefix] + '.' + str(last + 1)
            else:
                new_n = str(last + 1)
            if new_n in amr.nodes:
                node_map[n] = new_n
        nodes = [
            n for align in alignments[amr.id] for n in align.nodes
            if n not in node_map
        ]
        nodes = sorted(nodes, key=lambda x: (len(x), x))
        for n in nodes:
            prefix = '.'.join(n.split('.')[:-1])
            if prefix not in node_map:
                new_prefix = '.'.join(
                    str(int(i) + 1) for i in n.split('.')[:-1])
                if new_prefix in amr.nodes:
                    node_map[prefix] = new_prefix
                else:
                    del alignments[amr.id]
                    break
            candidates = [t for s, r, t in amr.edges if s == node_map[prefix]]
            candidates = [t for t in candidates if t not in node_map.values()]
            candidates = sorted(candidates)
            if not candidates:
                del alignments[amr.id]
                break
            new_n = candidates[0]
            node_map[n] = new_n
        if amr.id in alignments:
            for align in alignments[amr.id]:
                align.nodes = [node_map[n] for n in align.nodes]
                align.amr = amr
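            # Give any token without an alignment an empty subgraph alignment,
            # then order the alignments by the first token of each span.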
            for t, _ in enumerate(amr.tokens):
                align = amr.get_alignment(alignments, token_id=t)
                if not align:
                    align = AMR_Alignment(type='subgraph',
                                          tokens=[t],
                                          nodes=[],
                                          amr=amr)
                    alignments[amr.id].append(align)
            alignments[amr.id] = sorted(alignments[amr.id],
                                        key=lambda a: a.tokens[0])

    reader.save_alignments_to_json(output, alignments)
Example #26
File: build_data.py Project: ablodge/leamr
import os
import sys

from amr_utils.amr_readers import AMR_Reader

def main():
    ldc_amrs_dir = sys.argv[1]
    lpp_amrs_file = 'data-release/amrs/little_prince.txt'
    additional_amrs_file = 'data-release/amrs/additional_amrs.txt'

    dev_ids_file = 'data-release/leamr_dev_ids.txt'
    with open(dev_ids_file) as f:
        dev_ids = [
            line.strip() for line in f if not line.strip().startswith('#')
        ]
    test_ids_file = 'data-release/leamr_test_ids.txt'
    with open(test_ids_file) as f:
        test_ids = [
            line.strip() for line in f if not line.strip().startswith('#')
        ]

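    # The LDC release is assumed to follow its standard layout, with aligned
    # splits under data/alignments/split/{training,dev,test}.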
    ldc_amrs_dir_train = os.path.join(ldc_amrs_dir, 'data/alignments/split',
                                      'training')
    ldc_amrs_dir_dev = os.path.join(ldc_amrs_dir, 'data/alignments/split',
                                    'dev')
    ldc_amrs_dir_test = os.path.join(ldc_amrs_dir, 'data/alignments/split',
                                     'test')

    reader = AMR_Reader()
    ldc_amrs_train = reader.load_from_dir(ldc_amrs_dir_train)
    ldc_amrs_dev = reader.load_from_dir(ldc_amrs_dir_dev)
    ldc_amrs_test = reader.load_from_dir(ldc_amrs_dir_test)
    lpp_amrs = reader.load(lpp_amrs_file)
    add_amrs = reader.load(additional_amrs_file)

    all_amrs = ldc_amrs_train + ldc_amrs_dev + ldc_amrs_test + lpp_amrs + add_amrs
    all_amrs = {amr.id: amr for amr in all_amrs}
    dev_amrs = [all_amrs[amr_id] for amr_id in dev_ids]
    test_amrs = [all_amrs[amr_id] for amr_id in test_ids]

    print()

    output_file = 'data-release/amrs/ldc_train.txt'
    print('Writing LDC training AMRs to:', output_file)
    reader.write_to_file(output_file, ldc_amrs_train)

    output_file = 'data-release/amrs/ldc_dev.txt'
    print('Writing LDC development AMRs to:', output_file)
    reader.write_to_file(output_file, ldc_amrs_dev)

    output_file = 'data-release/amrs/ldc_test.txt'
    print('Writing LDC test AMRs to:', output_file)
    reader.write_to_file(output_file, ldc_amrs_test)

    output_file = 'data-release/amrs/leamr_dev.txt'
    print('Writing LEAMR development data to:', output_file)
    reader.write_to_file(output_file, dev_amrs)

    output_file = 'data-release/amrs/leamr_test.txt'
    print('Writing LEAMR test data to:', output_file)
    reader.write_to_file(output_file, test_amrs)

    output_file = 'data-release/amrs/ldc+little_prince.txt'
    print('Writing LDC + Little Prince data to:', output_file)
    all_amrs = ldc_amrs_train + ldc_amrs_dev + ldc_amrs_test + lpp_amrs
    reader.write_to_file(output_file, all_amrs)