Example #1
import os
import json
from PyQt5.QtWidgets import QApplication
from amrlib.amr_view.main_window import MainWindow
from amrlib.utils.logging import silence_penman, setup_logging, WARN


# Note: config_fn and base_dir are defined at module level in the original script (not shown in this excerpt)
def main():
    setup_logging(level=WARN)
    silence_penman()

    # Open the config file
    with open(config_fn) as f:
        config = json.load(f)

    # Modify model paths to be absolute, relative to this file
    config['gtos_model_dir'] = os.path.realpath(
        os.path.join(base_dir, config['gtos_model_dir']))
    config['stog_model_dir'] = os.path.realpath(
        os.path.join(base_dir, config['stog_model_dir']))

    # For debug
    print('AMRView Config')
    for k, v in config.items():
        print('%s = %s' % (k, v))

    app = QApplication([])
    window = MainWindow(config)
    app.exec_()
Example #2
#!/usr/bin/python3
import setup_run_dir  # this import tricks the script into running from 2 levels up
import os
from amrlib.utils.logging import silence_penman, setup_logging, INFO, WARN
from amrlib.graph_processing.wiki_adder import WikiAdder

if __name__ == '__main__':
    silence_penman()
    setup_logging(logfname='./logs/spotlight_wiki_add.log', level=WARN)
    url = 'http://localhost:2222/rest/annotate'
    cache_fn = 'amrlib/data/tdata_gsii/spotlight_wiki.json'
    infn = 'amrlib/data/model_parse_gsii/epoch200.pt.test_generated'
    outfn = infn + '.wiki'

    wiki = WikiAdder(url=url, cache_fn=cache_fn)
    print('Wikifying', infn)
    wiki.wikify_file(infn, outfn)
    print('Data written to', outfn)
    wiki.save_cache(cache_fn)
    print('cache saved to', cache_fn)
    print()
    print(wiki.get_stat_string())
    print()
Example #3
import os
from amrlib.utils.logging import setup_logging, WARN, ERROR
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.models.generate_t5wtense.inference import Inference


# Get the sentence from an AMR graph string
def get_sentence(graph):
    for line in graph.splitlines():
        if line.startswith('# ::snt'):
            return line[len('# ::snt') + 1:].strip()
    assert False, 'Error, no sentence info in graph string'
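
# For illustration (not part of the original script): given a standard AMR entry,
# get_sentence() returns the text that follows the '# ::snt' metadata tag, e.g.
#   get_sentence('# ::snt The boy wants to go.\n(w / want-01 ...)')
#   -> 'The boy wants to go.'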


if __name__ == '__main__':
    setup_logging(logfname='logs/generate_t5wtense.log', level=ERROR)
    device = 'cuda:0'
    model_dir = 'amrlib/data/model_generate_t5wtense/'
    corpus_dir = 'amrlib/data/tdata_generate_t5wtense/'
    test_fn = 'test.txt.features.nowiki'  # standard AMR graphs
    # Works using GTX TitanX (12GB)
    # greedy (num_beams=1, batch_size=32) run-time =  4min
    #        (num_beams=8,  batch_size=8) run-time = 16min
    #        (num_beams=16, batch_size=4) run-time = 29min
    batch_size = 4
    num_beams = 16
    use_tense = True
    rm_clips = True

    # Create the filenames based on above parameters
    extension = '.tagged' if use_tense else '.nowiki'
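
    # --- Not part of the original script: a minimal sketch using amrlib's documented
    # high-level generation API, which loads the default installed graph-to-sentence
    # model. `graph` below is a stand-in AMR string.
    import amrlib
    graph = '(w / want-01 :ARG0 (b / boy) :ARG1 (g / go-02 :ARG0 b))'
    gtos = amrlib.load_gtos_model()
    sents, clips = gtos.generate([graph])
    print(sents[0])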
Example #4
#!/usr/bin/python3
import setup_run_dir  # Set the working directory and python sys.path to 2 levels above
import os
from amrlib.utils.logging import silence_penman, setup_logging, WARN, ERROR
from amrlib.evaluate.smatch_enhanced import get_entries, compute_smatch
from amrlib.models.parse_xfm.inference import Inference
from amrlib.models.parse_xfm.penman_serializer import load_and_serialize

if __name__ == '__main__':
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    setup_logging(logfname='logs/test_model_parse_xfm.log', level=WARN)
    silence_penman()
    device = 'cuda:0'
    corpus_dir = 'amrlib/data/tdata_xfm/'
    ref_in_fn = 'test.txt.nowiki'  # 1898 amr entries
    model_dir = 'amrlib/data/model_parse_xfm_bart_large-v0_1_0'
    gold_fpath = os.path.join(model_dir, 'test-gold.txt')
    pred_fpath = os.path.join(model_dir, 'test-pred.txt')
    num_beams = 4  # use 4 for formal testing (batch_size=16 for 24GB GPU)
    batch_size = 16
    max_entries = None  # max test data to generate (use None for everything)

    fpath = os.path.join(corpus_dir, ref_in_fn)
    print('Loading test data', fpath)
    entries = load_and_serialize(fpath)
    ref_graphs = entries['graphs'][:max_entries]
    ref_serials = entries['serials'][:max_entries]
    ref_sents = entries['sents'][:max_entries]

    print('Loading model, tokenizer and data')
    inference = Inference(model_dir,
Example #5
import logging
from multiprocessing import Pool
from functools import partial
import spacy
import penman
from penman.models.noop import NoOpModel
from amrlib.utils.logging import setup_logging, silence_penman, WARN
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.graph_processing.annotator import load_spacy, add_lemmas
from amrlib.alignments.rbw_aligner import RBWAligner

logger = logging.getLogger(__name__)

# Run the aligner on the LDC files with existing alignments for comparison
# The ISI hand alignments are for LDC2014T12 (AMR 1.0), specifically the test-consensus.txt and dev-consensus.txt files
if __name__ == '__main__':
    setup_logging(level=WARN, logfname='logs/rbw_aligner.log')
    silence_penman()

    in_fname = 'amrlib/data/amr_annotation_1.0/data/split/test/amr-release-1.0-test-consensus.txt'
    out_fname = 'amrlib/data/alignments/test-aligned.txt'
    # in_fname  = 'amrlib/data/amr_annotation_1.0/data/split/dev/amr-release-1.0-dev-consensus.txt'
    # out_fname = 'amrlib/data/alignments/dev-aligned.txt'

    # Load and convert to a penman graph
    print('Loading', in_fname)
    entries = load_amr_entries(in_fname)
    print('Loaded %d entries' % len(entries))

    # Convert to penman and add lemmas
    print('Annotating')
    load_spacy(
Example #6
#!/usr/bin/python3
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
import logging
import penman
from   amrlib.utils.logging import setup_logging, silence_penman, DEBUG
from   amrlib.graph_processing.amr_loading import load_amr_entries
from   amrlib.alignments.rbw_aligner import RBWAligner
from   amrlib.alignments.penman_utils import strip_surface_alignments

logger = logging.getLogger(__name__)


if __name__ == '__main__':
    setup_logging(level=DEBUG, logfname='logs/rbw_aligner_debug.log')
    silence_penman()

    fname = 'amrlib/data/alignments/test_realigned.txt'
    index = 0

    entries = load_amr_entries(fname)
    entry   = entries[index]
    entry   = strip_surface_alignments(entry)

    # Run the aligner
    aligner = RBWAligner.from_string_w_json(entry) #, align_str_name='rbw_alignments')
    print(aligner.get_graph_string())
    print()
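
    # --- Not part of the original script: for reference, amrlib's documented per-graph
    # usage of the RBW aligner annotates the entry with lemmas first, then aligns.
    # (`entry` here is a single AMR string with '# ::snt' metadata, as loaded above.)
    from amrlib.graph_processing.annotator import add_lemmas
    penman_graph = add_lemmas(entry, snt_key='snt')
    aligner2 = RBWAligner.from_penman_w_json(penman_graph)
    print(aligner2.get_graph_string())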
Example #7
import os
import logging
from tqdm import tqdm
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.utils.logging import setup_logging, silence_penman, WARN
from amrlib.models.generate_t5wtense.model_input_helper import ModelInputHelper

logger = logging.getLogger(__name__)

# Nomenclature
# xx.nowiki           # standard AMR
# xx.nowiki.tagged    # pos tags added
# xx.nowiki.tdata     # the above 2 combined
# Take graphs that are annotated (tokens, pos, ...), align them, and then tag the graphs.
# Save files with the tagged and untagged data together in a single training file
if __name__ == '__main__':
    setup_logging(level=WARN, logfname='logs/create_td_gen_t5wtense.log')
    silence_penman()
    data_dir = 'amrlib/data/tdata_generate_t5wtense'
    base_fns = ('dev.txt', 'test.txt', 'train.txt')

    # Loop through the files
    for base_fn in base_fns:
        infn = os.path.join(data_dir, base_fn + '.features.nowiki')
        print('Loading and processing', infn)
        entries = load_amr_entries(infn)
        tagged_entries = []
        for entry in tqdm(entries, ncols=100):
            tagged_entry = ModelInputHelper(entry).get_tagged_with_meta()
            tagged_entries.append(tagged_entry)
        # Save tagged data only to a new file
        # outfn = infn + '.tagged'
Example #8
#!/usr/bin/env python3
import setup_run_dir  # Set the working directory and python sys.path to 2 levels above
import sys
# Add BLINK to python search path if needed (there is no pip install for BLINK)
sys.path.append('/home/bjascob/Libraries/BLINK-2021_12_02')
import warnings
warnings.simplefilter('ignore')  # BLINK emits harmless warnings
import json
import penman
from amrlib.utils.logging import setup_logging, silence_penman, WARN
from amrlib.graph_processing.wiki_adder_blink import WikiAdderBlink

if __name__ == '__main__':
    setup_logging('logs/blink_wikify.log', level=WARN)
    silence_penman()

    model_dir = 'amrlib/data/BLINK_Model'
    infpath = 'amrlib/data/model_parse_spring/test-pred.txt'
    outfpath = 'amrlib/data/model_parse_spring/test-pred.txt.wiki'

    # Load the BLINK models
    wa = WikiAdderBlink(model_dir)
    wa.wikify_file(infpath, outfpath)
Example #9
#!/usr/bin/python3
import setup_run_dir  # this import tricks the script into running from 2 levels up
import warnings
warnings.simplefilter('ignore')
import os
from amrlib.utils.logging import silence_penman, setup_logging, WARN, ERROR
from amrlib.models.parse_t5.inference import Inference
from amrlib.models.parse_t5.penman_serializer import load_and_serialize

# Note tdata_gsii was created with 30_Model_Parse_GSII/10_Annotate_Corpus.py and 12_RemoveWikiData.py
# This can be changed.  The corpus doesn't need to be annotated (you can skip running 10_x) but
# wikidata should be removed since the model doesn't produce those tags and these graphs will be
# copied as the reference data to be scored in the next step.
if __name__ == '__main__':
    setup_logging(logfname='logs/parse_t5_generate.log', level=ERROR)
    silence_penman()
    device = 'cuda:0'
    corpus_dir = 'amrlib/data/tdata_gsii/'
    ref_in_fn = 'test.txt.features.nowiki'  # 1898 amr entries
    model_dir = 'amrlib/data/model_parse_t5'
    ref_out_fn = 'test.txt.reference'
    gen_out_fn = 'test.txt.generated'
    # Works using GTX TitanX (12GB)
    # Note that the more beams, the better chance of getting a correctly deserialized graph
    # greedy (num_beams=1, batch_size=32) run-time =  12m
    #        (num_beams=4, batch_size=12) run-time =  50m
    #        (num_beams=8,  batch_size=6) run-time = 1h20
    #        (num_beams=16, batch_size=3) run-time = 2h30m
    num_beams = 4
    batch_size = 12
    max_entries = None  # max test data to generate (use None for everything)
Example #10
#!/usr/bin/python3
import setup_run_dir  # this import tricks the script into running from 2 levels up
import os
from amrlib.utils.logging import setup_logging, WARN
from amrlib.models.parse_gsii.create_vocabs import create_vocabs

# Create vocabs from the training data
if __name__ == '__main__':
    setup_logging(logfname='logs/create_vocabs.log', level=WARN)
    train_data = 'amrlib/data/tdata_gsii/train.txt.features.nowiki'
    vocab_dir = 'amrlib/data/model_parse_gsii/vocabs'

    os.makedirs(vocab_dir, exist_ok=True)

    create_vocabs(train_data, vocab_dir)
Example #11
#!/usr/bin/python3
import setup_run_dir    # this import tricks the script into running from 2 levels up
from amrlib.utils.logging import setup_logging, WARN
from amrlib.utils.config import Config
from amrlib.models.parse_gsii import trainer
from amrlib.utils.log_splitter import LogSplitter


# Train the model
if __name__ == '__main__':
    setup_logging(logfname='logs/train_gsii.log', level=WARN)
    args = Config.load('configs/model_parse_gsii.json')
    ls = LogSplitter('train.log')
    trainer.run_training(args, ls)
Example #12
#!/usr/bin/python3
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
import json
from   PyQt5.QtWidgets import *
from   amrlib.amr_view.main_window import MainWindow
from   amrlib.utils.logging import silence_penman, setup_logging, INFO, WARN


if __name__ == '__main__':
    setup_logging(level=WARN)
    silence_penman()

    with open('amrlib/amr_view/amr_view.json') as f:
        config = json.load(f)

    app = QApplication([])
    window = MainWindow(config)
    app.exec_()
Example #13
        lines = f.readlines()
    clips = [bool(int(l[0])) for l in lines]
    clips = set([i for i, c in enumerate(clips) if c is True])
    gstrings = [l[2:].strip() for l in lines]
    return clips, gstrings


# This code is for debug only
# It deserializes graphs that were generated with the deserialization step skipped (raw output)
#
# Note tdata_gsii was created with 30_Model_Parse_GSII/10_Annotate_Corpus.py and 12_RemoveWikiData.py
# This can be changed.  The corpus doesn't need to be annotated (you can skip running 10_x) but
# wikidata should be removed since the model doesn't produce those tags and these graphs will be
# copied as the reference data to be scored at the end.
if __name__ == '__main__':
    setup_logging(logfname='logs/post_process.log', level=WARN)
    corpus_dir = 'amrlib/data/tdata_gsii/'
    ref_in_fn = 'test.txt.features.nowiki'
    test_dir = 'amrlib/data/test_parse_t5'
    gen_in_fn = 'test.txt.generated'
    ref_out_fn = 'test.txt.reference.post'
    gen_out_fn = gen_in_fn + '.post'

    # Load the reference graphs
    fname = os.path.join(corpus_dir, ref_in_fn)
    print('Loading', fname)
    ref_amr_entries = load_amr_entries(fname)
    ref_in_graphs = [get_graph_only(e) for e in ref_amr_entries]
    print('Loaded %d reference graphs' % len(ref_in_graphs))

    # Load the generated graphs
Example #14
import logging
import json
import random
import numpy
import torch
from amrlib.models.parse_spring.trainer import Trainer
from amrlib.utils.logging import setup_logging, silence_penman, WARN

# Seed the random generators for consistent results
random.seed(0)
torch.manual_seed(0)
numpy.random.seed(0)

# For bart-large
#   There are ~16068 batches in the training data for batch_size = 500
#   On a Titan X (12GB, fp32=6.7 TFlops) training takes 80 minutes/epoch including
#   about  6 minutes for prediction/smatch testing.
if __name__ == '__main__':
    logging.getLogger('transformers.tokenization_utils_base').setLevel(
        logging.ERROR)  # skip tokenizer warning
    setup_logging(logfname='logs/train_parse_spring.log', level=WARN)
    silence_penman()

    # Paths
    config_fn = 'configs/model_parse_spring.json'
    #checkpoint = 'data/model_parse_spring/checkpoint_epoch_08_smatch_8422.pt'
    checkpoint = None  # start from scratch

    # Load the config file
    with open(config_fn) as f:
        config = json.load(f)

    # Setup the training data locations
    config['train'] = 'amrlib/data/amr_annotation_3.0/data/amrs/split/training/*.txt'
    config['dev'] = 'amrlib/data/amr_annotation_3.0/data/amrs/split/dev/*.txt'
Example #15
#!/usr/bin/python3
import setup_run_dir  # this import tricks the script into running from 2 levels up
import warnings
warnings.simplefilter('ignore')
import os
import json
from amrlib.utils.logging import setup_logging, WARN
from amrlib.models.generate_t5.trainer import Trainer

if __name__ == '__main__':
    setup_logging(logfname='logs/train_t5gen.log', level=WARN)
    config_fn = 'configs/model_generate_t5.json'

    with open(config_fn) as f:
        args = json.load(f)
    trainer = Trainer(args)
    trainer.train()
Example #16
import os
import logging
import penman
from   penman.models.noop import NoOpModel
from   amrlib.graph_processing.amr_loading import load_amr_entries, get_graph_only
from   amrlib.utils.logging import setup_logging, WARN, DEBUG
from   amrlib.evaluate.smatch_enhanced import get_entries, compute_smatch
from   amrlib.models.parse_t5.penman_serializer import PenmanDeSerializer, load_and_serialize  

logger = logging.getLogger(__name__)


# Code to take the reference graphs, serialize them and then deserialize
# The only purpose of this code is to test the effectiveness of penman_serializer.py code
# Ideally the process would be lossless, giving a SMATCH score of 1.0
if __name__ == '__main__':
    setup_logging(logfname='logs/serial_deserial.log', level=WARN)
    corpus_dir = 'amrlib/data/LDC2020T02'
    in_fn      = 'test.txt'
    out_dir    = 'amrlib/data/test_parse_t5'
    ref_out_fn = in_fn + '.roundtrip_ref'
    gen_out_fn = in_fn + '.roundtrip_gen'

    # Make the out directory
    os.makedirs(out_dir, exist_ok=True)

    # Load the reference graphs
    fname = os.path.join(corpus_dir, in_fn)
    print('Loading', fname)
    ref_amr_entries = load_amr_entries(fname)
    ref_in_graphs   = [get_graph_only(e) for e in ref_amr_entries]
    print('Loaded %d reference graphs' % len(ref_in_graphs))
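
    # --- Not part of the original excerpt: a minimal sketch of how the round-trip could be
    # scored with amrlib's smatch_enhanced helpers, assuming the serialized/deserialized
    # graphs were written to ref_out_fn and gen_out_fn under out_dir.
    gold_entries = get_entries(os.path.join(out_dir, ref_out_fn))
    test_entries = get_entries(os.path.join(out_dir, gen_out_fn))
    precision, recall, f_score = compute_smatch(test_entries, gold_entries)
    print('SMATCH -> P: %.3f  R: %.3f  F: %.3f' % (precision, recall, f_score))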
Example #17
import warnings
warnings.simplefilter('ignore')
import os
from amrlib.utils.logging import silence_penman, setup_logging, WARN, ERROR
from amrlib.models.parse_t5.inference import Inference
from amrlib.models.parse_t5.penman_serializer import load_and_serialize

# This code is for debug only
# This will run the model inference (aka generate) but it bypasses the deserialize process so
# these graphs can be saved in raw format.  This makes testing changes to the deserializer
# much easier since generate takes about 40 minutes but deserialization can be done in seconds.
#
# Note tdata_gsii was created with 30_Model_Parse_GSII/10_Annotate_Corpus.py and 12_RemoveWikiData.py
# This can be changed and the raw LDC data could be used.
if __name__ == '__main__':
    setup_logging(logfname='logs/generate_no_deserialize.log', level=ERROR)
    silence_penman()
    device = 'cuda:0'
    corpus_dir = 'amrlib/data/tdata_gsii/'
    ref_in_fn = 'test.txt.features.nowiki'  # 1898 amr entries
    model_dir = 'amrlib/data/model_parse_t5'
    save_dir = 'amrlib/data/test_parse_t5'
    gen_out_fn = 'test.txt.generated'
    num_beams = 4
    batch_size = 12
    max_entries = None  # max test data to generate (use None for everything)

    # Make the out directory
    os.makedirs(save_dir, exist_ok=True)

    # Load and serialize the reference sentences to parse
Example #18
#!/usr/bin/python3
import setup_run_dir  # this import tricks the script into running from 2 levels up
import warnings
warnings.simplefilter('ignore')
import os
import json
from amrlib.utils.logging import setup_logging, WARN
from amrlib.models.generate_t5wtense.trainer import Trainer

if __name__ == '__main__':
    setup_logging(logfname='logs/train_generate_t5wtense.log', level=WARN)
    config_fn = 'configs/model_generate_t5wtense.json'

    with open(config_fn) as f:
        args = json.load(f)
    trainer = Trainer(args)
    trainer.train()
Example #19
#!/usr/bin/python3
import setup_run_dir  # this import tricks the script into running from 2 levels up
import os
from amrlib.utils.logging import setup_logging, WARN
from amrlib.models.parse_gsii.inference import Inference

if __name__ == '__main__':
    setup_logging(logfname='logs/generate.log', level=WARN)
    device = 'cuda:0'
    model_dir = 'amrlib/data/model_parse_gsii'
    model_fn = 'epoch200.pt'
    data_dir = 'amrlib/data/tdata_gsii'
    test_data = 'test.txt.features.nowiki'
    out_fn = model_fn + '.test_generated'

    infer = Inference(model_dir, model_fn, device=device)
    infer.reparse_annotated_file(data_dir, test_data, model_dir, out_fn)
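
    # --- Not part of the original script: for comparison, amrlib's documented high-level
    # parse API loads the default installed sentence-to-graph model and parses directly.
    import amrlib
    stog = amrlib.load_stog_model()
    graphs = stog.parse_sents(['This is a test of the system.'])
    print(graphs[0])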