Ejemplo n.º 1
0
def load_and_resave_amr_files(glob_patterns, out_fpath):
    """Collect AMR graphs from files matching one or more glob patterns and
    re-save them, collated, into a single output file.

    :param glob_patterns: a single glob pattern string or a list of patterns
    :param out_fpath: path of the combined output file to write
    :return: the list of loaded graph strings (also written to out_fpath)
    :raises FileNotFoundError: if any pattern matches no files
    """
    if isinstance(glob_patterns, str):
        glob_patterns = [glob_patterns]
    # Find all the files.  Raise instead of assert (asserts are stripped
    # under `python -O`) so an invalid pattern in the list is always caught.
    fpaths = []
    for pattern in glob_patterns:
        glob_fpaths = glob(pattern)
        if not glob_fpaths:
            raise FileNotFoundError('No files match glob pattern: %s' % pattern)
        fpaths += glob_fpaths
    graphs = []
    # Load graphs sorted by filename for consistency
    for fpath in sorted(fpaths, key=lambda x: os.path.basename(x)):
        print('Loading', fpath)
        graphs.extend(load_raw_amr(fpath))
    print('Loaded {:,} graphs'.format(len(graphs)))
    # Save the collated data
    print('Saving data to', out_fpath)
    with open(out_fpath, 'w') as f:
        for graph in graphs:
            f.write('%s\n\n' % graph)
    print()
    return graphs
Ejemplo n.º 2
0
#!/usr/bin/python3
import setup_run_dir  # Set the working directory and python sys.path to 2 levels above
import os
from amrlib.graph_processing.amr_loading_raw import load_raw_amr

if __name__ == '__main__':
    base_dir = 'amrlib/data/amr_annotation_3.0/data/amrs/split'
    out_dir = 'amrlib/data/LDC2020T02'

    os.makedirs(out_dir, exist_ok=True)

    # Collate each split directory ('dev', 'test', 'training') into one file
    for dirname in ('dev', 'test', 'training'):
        src_dir = os.path.join(base_dir, dirname)
        print('Loading data from', src_dir)
        # Gather every entry from every file in this split directory
        entries = []
        for fname in os.listdir(src_dir):
            entries += load_raw_amr(os.path.join(src_dir, fname))
        print('Loaded {:,} entries'.format(len(entries)))
        # Save the collated data; the 'training' split is renamed 'train.txt'
        out_fn = 'train.txt' if dirname == 'training' else dirname + '.txt'
        out_path = os.path.join(out_dir, out_fn)
        print('Saving data to', out_path)
        with open(out_path, 'w') as f:
            for entry in entries:
                f.write('%s\n\n' % entry)
        print()
Ejemplo n.º 3
0
    # NOTE(review): this chunk is the interior of a larger function --
    # amr_dir, out_dir, dev_fp, test_fp, max_entries and get_graph_sent
    # are defined above/outside this view.
    os.makedirs(out_dir, exist_ok=True)

    # Get all the amr files and put dev-consensus.txt on top, followed by test-consensus.txt
    # to make scoring easier
    fpaths = [
        y for x in os.walk(amr_dir) for y in glob(os.path.join(x[0], '*.txt'))
    ]
    # Sort the remaining files for a deterministic order, then force the
    # dev/test consensus files to the front of the list
    fpaths = sorted([fp for fp in fpaths if fp not in (dev_fp, test_fp)])
    fpaths = [dev_fp, test_fp] + fpaths

    # Load all the entries
    print('Loading data')
    sents, gstrings = [], []
    for fpath in fpaths:
        amr_strings = load_raw_amr(fpath)
        # presumably a dict-like with parallel 'sent' and 'graph' lists -- verify
        entries = get_graph_sent(amr_strings)
        #entries = load_amr_graph_sent(fpath)
        # Append the data
        # Filter "(a / amr-empty)" in amr-release-1.0-proxy.txt that might be causing issues
        # So long as this is above the dev/test data (ends at index 200) it won't mess-up scoring
        for sent, graph in zip(entries['sent'], entries['graph']):
            if sent == '.':
                print('Removed empty entry at index %d from %s' %
                      (len(sents), fpath))
                assert len(sents) > 200  # this will mess-up scoring
                continue
            sents.append(sent)
            gstrings.append(graph)
            if max_entries and len(gstrings) >= max_entries:
                # NOTE(review): this break exits only the inner loop; the
                # outer file loop continues and may append one extra entry
                # per remaining file, overshooting max_entries -- confirm
                # whether the cap is meant to be exact
                break
Ejemplo n.º 4
0
#!/usr/bin/python3
import setup_run_dir  # Set the working directory and python sys.path to 2 levels above
import os
from glob import glob
from amrlib.graph_processing.amr_loading_raw import load_raw_amr

# Collect all the amr graphs from multiple files and create a gold test file.
# This simply concatenates files and cleans a few bad characters out.  The glob pattern
# needs to be exactly the same as what's in generate so the output graph ordering is the same.
if __name__ == '__main__':
    glob_pattern = 'amrlib/data/amr_annotation_3.0/data/amrs/split/test/*.txt'
    out_fpath = 'amrlib/data/model_parse_spring/test-gold.txt.wiki'

    # Load the data, iterating matching files in sorted order so the output
    # graph ordering is reproducible
    print('Loading data from', glob_pattern)
    graphs = []
    for fname in sorted(glob(glob_pattern)):
        for graph in load_raw_amr(fname):
            graphs.append(graph)
    print('Loaded {:,} graphs'.format(len(graphs)))

    # Save the collated data, one graph per record, blank-line separated
    print('Saving data to', out_fpath)
    with open(out_fpath, 'w') as f:
        f.writelines('%s\n\n' % g for g in graphs)
    print()