import os
import json
from PyQt5.QtWidgets import QApplication
from amrlib.amr_view.main_window import MainWindow
from amrlib.utils.logging import silence_penman, setup_logging, WARN

# Module-level paths (assumed; the original fragment references these without defining them).
# The config file is expected to live next to this file.
base_dir = os.path.dirname(os.path.realpath(__file__))
config_fn = os.path.join(base_dir, 'amr_view.json')

def main():
    setup_logging(level=WARN)
    silence_penman()
    # Open the config file
    with open(config_fn) as f:
        config = json.load(f)
    # Modify model paths to be absolute, relative to this file
    config['gtos_model_dir'] = os.path.realpath(
        os.path.join(base_dir, config['gtos_model_dir']))
    config['stog_model_dir'] = os.path.realpath(
        os.path.join(base_dir, config['stog_model_dir']))
    # For debug
    print('AMRView Config')
    for k, v in config.items():
        print('%s = %s' % (k, v))
    app = QApplication([])
    window = MainWindow(config)    # keep a reference so the window isn't garbage collected
    app.exec_()

if __name__ == '__main__':
    main()
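# Aside: a minimal sketch of the JSON config main() above loads. The two keys are the
# ones the code rewrites to absolute paths; the values are hypothetical placeholders and
# the real file may carry additional settings.
#
#   {
#       "gtos_model_dir": "../data/model_generate_t5wtense",
#       "stog_model_dir": "../data/model_parse_xfm_bart_large"
#   }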
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up
import os
from amrlib.utils.logging import silence_penman, setup_logging, INFO, WARN
from amrlib.graph_processing.wiki_adder import WikiAdder

if __name__ == '__main__':
    silence_penman()
    setup_logging(logfname='./logs/spotlight_wiki_add.log', level=WARN)
    url = 'http://localhost:2222/rest/annotate'
    cache_fn = 'amrlib/data/tdata_gsii/spotlight_wiki.json'
    infn = 'amrlib/data/model_parse_gsii/epoch200.pt.test_generated'
    outfn = infn + '.wiki'
    wiki = WikiAdder(url=url, cache_fn=cache_fn)
    print('Wikifying', infn)
    wiki.wikify_file(infn, outfn)
    print('Data written to', outfn)
    wiki.save_cache(cache_fn)
    print('Cache saved to', cache_fn)
    print()
    print(wiki.get_stat_string())
    print()
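# Aside: what wikification adds. WikiAdder queries a DBpedia Spotlight server (the 'url'
# above) and attaches ':wiki' attributes to named entities in each graph. Illustrative
# before/after fragment:
#
#   Before:  (p / person :name (n / name :op1 "Barack" :op2 "Obama"))
#   After:   (p / person :wiki "Barack_Obama"
#               :name (n / name :op1 "Barack" :op2 "Obama"))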
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up (assumed header; the other scripts here use it)
import os
from amrlib.utils.logging import setup_logging, WARN, ERROR
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.models.generate_t5wtense.inference import Inference

# Get the sentence from an AMR graph string
def get_sentence(graph):
    for line in graph.splitlines():
        if line.startswith('# ::snt'):
            return line[len('# ::snt'):].strip()
    assert False, 'Error, no sentence info in graph string'

if __name__ == '__main__':
    setup_logging(logfname='logs/generate_t5wtense.log', level=ERROR)
    device = 'cuda:0'
    model_dir = 'amrlib/data/model_generate_t5wtense/'
    corpus_dir = 'amrlib/data/tdata_generate_t5wtense/'
    test_fn = 'test.txt.features.nowiki'    # standard AMR graphs
    # Works using GTX TitanX (12GB)
    # greedy (num_beams=1,  batch_size=32) run-time =  4min
    #        (num_beams=8,  batch_size=8)  run-time = 16min
    #        (num_beams=16, batch_size=4)  run-time = 29min
    batch_size = 4
    num_beams = 16
    use_tense = True
    rm_clips = True
    # Create the filenames based on above parameters
    extension = '.tagged' if use_tense else '.nowiki'
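# Aside: a self-contained demo of the '# ::snt' metadata convention that get_sentence()
# above relies on. The toy entry is illustrative.
entry = '# ::snt The boy wants to go.\n(w / want-01 :ARG0 (b / boy))'
snt = next(l[len('# ::snt'):].strip() for l in entry.splitlines() if l.startswith('# ::snt'))
print(snt)    # -> The boy wants to go.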
#!/usr/bin/python3
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
import os
from amrlib.utils.logging import silence_penman, setup_logging, WARN, ERROR
from amrlib.evaluate.smatch_enhanced import get_entries, compute_smatch
from amrlib.models.parse_xfm.inference import Inference
from amrlib.models.parse_xfm.penman_serializer import load_and_serialize

if __name__ == '__main__':
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    setup_logging(logfname='logs/test_model_parse_xfm.log', level=WARN)
    silence_penman()
    device = 'cuda:0'
    corpus_dir = 'amrlib/data/tdata_xfm/'
    ref_in_fn = 'test.txt.nowiki'    # 1898 amr entries
    model_dir = 'amrlib/data/model_parse_xfm_bart_large-v0_1_0'
    gold_fpath = os.path.join(model_dir, 'test-gold.txt')
    pred_fpath = os.path.join(model_dir, 'test-pred.txt')
    num_beams = 4       # use 4 for formal testing (batch_size=16 for 24GB GPU)
    batch_size = 16
    max_entries = None  # max test data to generate (use None for everything)

    fpath = os.path.join(corpus_dir, ref_in_fn)
    print('Loading test data', fpath)
    entries = load_and_serialize(fpath)
    ref_graphs = entries['graphs'][:max_entries]
    ref_serials = entries['serials'][:max_entries]
    ref_sents = entries['sents'][:max_entries]

    print('Loading model, tokenizer and data')
    # The original call is truncated after the first argument; the keyword arguments
    # below are an assumption based on the settings defined above.
    inference = Inference(model_dir, batch_size=batch_size, num_beams=num_beams,
                          device=device)
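# Aside: once test-pred.txt and test-gold.txt are written, scoring is a short separate
# step. A minimal sketch using the helpers imported above, assuming compute_smatch()
# returns (precision, recall, f_score) as in the amrlib scoring docs:
from amrlib.evaluate.smatch_enhanced import get_entries, compute_smatch

gold_entries = get_entries('amrlib/data/model_parse_xfm_bart_large-v0_1_0/test-gold.txt')
test_entries = get_entries('amrlib/data/model_parse_xfm_bart_large-v0_1_0/test-pred.txt')
precision, recall, f_score = compute_smatch(test_entries, gold_entries)
print('SMATCH -> P: %.3f  R: %.3f  F: %.3f' % (precision, recall, f_score))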
#!/usr/bin/python3
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above (assumed header)
import logging
from multiprocessing import Pool
from functools import partial
import spacy
import penman
from penman.models.noop import NoOpModel
from amrlib.utils.logging import setup_logging, silence_penman, WARN
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.graph_processing.annotator import load_spacy, add_lemmas
from amrlib.alignments.rbw_aligner import RBWAligner

logger = logging.getLogger(__name__)

# Run the aligner on the LDC files with existing alignments for comparison.
# The ISI hand alignments are for LDC2014T12 (AMR1): the test-consensus.txt and dev-consensus.txt files
if __name__ == '__main__':
    setup_logging(level=WARN, logfname='logs/rbw_aligner.log')
    silence_penman()
    in_fname = 'amrlib/data/amr_annotation_1.0/data/split/test/amr-release-1.0-test-consensus.txt'
    out_fname = 'amrlib/data/alignments/test-aligned.txt'
    # in_fname = 'amrlib/data/amr_annotation_1.0/data/split/dev/amr-release-1.0-dev-consensus.txt'
    # out_fname = 'amrlib/data/alignments/dev-aligned.txt'

    # Load and convert to a penman graph
    print('Loading', in_fname)
    entries = load_amr_entries(in_fname)
    print('Loaded %d entries' % len(entries))

    # Convert to penman and add lemmas
    print('Annotating')
    load_spacy()    # call truncated in the original; loading the default spaCy model is assumed
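# Aside: the per-entry pipeline this script applies, shown on a single graph string.
# This follows the documented RBWAligner usage; the toy entry is illustrative.
from amrlib.graph_processing.annotator import add_lemmas
from amrlib.alignments.rbw_aligner import RBWAligner

graph_string = '# ::snt The boy wants to go.\n(w / want-01 :ARG0 (b / boy))'
penman_graph = add_lemmas(graph_string, snt_key='snt')     # tokenize/lemmatize the ::snt text
aligner = RBWAligner.from_penman_w_json(penman_graph)      # rule-based word alignment
print(aligner.get_graph_string())                          # graph string with alignment metadata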
#!/usr/bin/python3
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
import logging
import penman
from amrlib.utils.logging import setup_logging, silence_penman, DEBUG
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.alignments.rbw_aligner import RBWAligner
from amrlib.alignments.penman_utils import strip_surface_alignments

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    setup_logging(level=DEBUG, logfname='logs/rbw_aligner_debug.log')
    silence_penman()
    fname = 'amrlib/data/alignments/test_realigned.txt'
    index = 0

    entries = load_amr_entries(fname)
    entry = entries[index]
    entry = strip_surface_alignments(entry)

    # Run the aligner
    aligner = RBWAligner.from_string_w_json(entry)  # , align_str_name='rbw_alignments')
    print(aligner.get_graph_string())
    print()
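# Aside: what strip_surface_alignments() removes. ISI-style surface alignments attach
# '~e.N' token indices to concepts; stripping them gives the aligner a clean slate.
# Illustrative fragment:
#
#   (w / want-01~e.2 :ARG0 (b / boy~e.1))   ->   (w / want-01 :ARG0 (b / boy))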
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up (assumed header)
import os
import logging
from tqdm import tqdm
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.utils.logging import setup_logging, silence_penman, WARN
from amrlib.models.generate_t5wtense.model_input_helper import ModelInputHelper

logger = logging.getLogger(__name__)

# Nomenclature
#   xx.nowiki         # standard AMR
#   xx.nowiki.tagged  # pos tags added
#   xx.nowiki.tdata   # the above 2 combined

# Take graphs that are annotated (tokens, pos, ...), align them and then tag the graphs.
# Save the tagged and untagged data together in a single training file (see the sketch
# after this script).
if __name__ == '__main__':
    setup_logging(level=WARN, logfname='logs/create_td_gen_t5wtense.log')
    silence_penman()
    data_dir = 'amrlib/data/tdata_generate_t5wtense'
    base_fns = ('dev.txt', 'test.txt', 'train.txt')

    # Loop through the files
    for base_fn in base_fns:
        infn = os.path.join(data_dir, base_fn + '.features.nowiki')
        print('Loading and processing', infn)
        entries = load_amr_entries(infn)
        tagged_entries = []
        for entry in tqdm(entries, ncols=100):
            tagged_entry = ModelInputHelper(entry).get_tagged_with_meta()
            tagged_entries.append(tagged_entry)
        # Save tagged data only to a new file
        # outfn = infn + '.tagged'
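# Aside: a minimal sketch of combining the untagged and tagged entries into the single
# xx.nowiki.tdata training file described in the nomenclature above. AMR files separate
# entries with a blank line; 'infn', 'entries' and 'tagged_entries' come from the loop
# above, and the exact ordering/interleaving the original script uses is an assumption.
out_fn = infn + '.tdata'
with open(out_fn, 'w') as f:
    f.write('\n\n'.join(entries + tagged_entries) + '\n')
print('Combined data written to', out_fn)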
#!/usr/bin/env python3
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
import sys
# Add BLINK to python search path if needed (there is no pip install for BLINK)
sys.path.append('/home/bjascob/Libraries/BLINK-2021_12_02')
import warnings
warnings.simplefilter('ignore')    # BLINK produces a harmless warning
import json
import penman
from amrlib.utils.logging import setup_logging, silence_penman, WARN
from amrlib.graph_processing.wiki_adder_blink import WikiAdderBlink

if __name__ == '__main__':
    setup_logging('logs/blink_wikify.log', level=WARN)
    silence_penman()
    model_dir = 'amrlib/data/BLINK_Model'
    infpath = 'amrlib/data/model_parse_spring/test-pred.txt'
    outfpath = 'amrlib/data/model_parse_spring/test-pred.txt.wiki'

    # Load the BLINK models and wikify the file
    wa = WikiAdderBlink(model_dir)
    wa.wikify_file(infpath, outfpath)
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up
import warnings
warnings.simplefilter('ignore')
import os
from amrlib.utils.logging import silence_penman, setup_logging, WARN, ERROR
from amrlib.models.parse_t5.inference import Inference
from amrlib.models.parse_t5.penman_serializer import load_and_serialize

# Note tdata_gsii was created with 30_Model_Parse_GSII/10_Annotate_Corpus.py and 12_RemoveWikiData.py
# This can be changed. The corpus doesn't need to be annotated (you can skip running 10_x) but
# wikidata should be removed, since the model doesn't produce those tags and these graphs will be
# copied as the reference data to be scored in the next step.
if __name__ == '__main__':
    setup_logging(logfname='logs/parse_t5_generate.log', level=ERROR)
    silence_penman()
    device = 'cuda:0'
    corpus_dir = 'amrlib/data/tdata_gsii/'
    ref_in_fn = 'test.txt.features.nowiki'    # 1898 amr entries
    model_dir = 'amrlib/data/model_parse_t5'
    ref_out_fn = 'test.txt.reference'
    gen_out_fn = 'test.txt.generated'
    # Works using GTX TitanX (12GB)
    # Note that the more beams, the better the chance of getting a correctly deserialized graph
    # greedy (num_beams=1,  batch_size=32) run-time = 12m
    #        (num_beams=4,  batch_size=12) run-time = 50m
    #        (num_beams=8,  batch_size=6)  run-time = 1h20m
    #        (num_beams=16, batch_size=3)  run-time = 2h30m
    num_beams = 4
    batch_size = 12
    max_entries = None    # max test data to generate (use None for everything)
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up
import os
from amrlib.utils.logging import setup_logging, WARN
from amrlib.models.parse_gsii.create_vocabs import create_vocabs

# Create vocabs from the training data
if __name__ == '__main__':
    setup_logging(logfname='logs/create_vocabs.log', level=WARN)
    train_data = 'amrlib/data/tdata_gsii/train.txt.features.nowiki'
    vocab_dir = 'amrlib/data/model_parse_gsii/vocabs'
    os.makedirs(vocab_dir, exist_ok=True)
    create_vocabs(train_data, vocab_dir)
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up
from amrlib.utils.logging import setup_logging, WARN
from amrlib.utils.config import Config
from amrlib.models.parse_gsii import trainer
from amrlib.utils.log_splitter import LogSplitter

# Train the model
if __name__ == '__main__':
    setup_logging(logfname='logs/train_gsii.log', level=WARN)
    args = Config.load('configs/model_parse_gsii.json')
    ls = LogSplitter('train.log')
    trainer.run_training(args, ls)
#!/usr/bin/python3
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above
import json
from PyQt5.QtWidgets import QApplication
from amrlib.amr_view.main_window import MainWindow
from amrlib.utils.logging import silence_penman, setup_logging, INFO, WARN

if __name__ == '__main__':
    setup_logging(level=WARN)
    silence_penman()
    with open('amrlib/amr_view/amr_view.json') as f:
        config = json.load(f)
    app = QApplication([])
    window = MainWindow(config)
    app.exec_()
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up (assumed header)
import os
from amrlib.utils.logging import setup_logging, WARN
from amrlib.graph_processing.amr_loading import load_amr_entries, get_graph_only

# Load a raw generated file: each line is a 0/1 clip flag, a space, then the serialized
# graph. The script is truncated at the top, so this header and the function name are
# assumptions reconstructed from the body.
def load_raw_generated(fname):
    with open(fname) as f:
        lines = f.readlines()
    clips = [bool(int(l[0])) for l in lines]
    clips = set([i for i, c in enumerate(clips) if c is True])
    gstrings = [l[2:].strip() for l in lines]
    return clips, gstrings

# This code is for debug only.
# It deserializes graphs that were generated with the deserialization step bypassed.
#
# Note tdata_gsii was created with 30_Model_Parse_GSII/10_Annotate_Corpus.py and 12_RemoveWikiData.py
# This can be changed. The corpus doesn't need to be annotated (you can skip running 10_x) but
# wikidata should be removed since the model doesn't produce those tags and these graphs will be
# copied as the reference data to be scored at the end.
if __name__ == '__main__':
    setup_logging(logfname='logs/post_process.log', level=WARN)
    corpus_dir = 'amrlib/data/tdata_gsii/'
    ref_in_fn = 'test.txt.features.nowiki'
    test_dir = 'amrlib/data/test_parse_t5'
    gen_in_fn = 'test.txt.generated'
    ref_out_fn = 'test.txt.reference.post'
    gen_out_fn = gen_in_fn + '.post'

    # Load the reference graphs
    fname = os.path.join(corpus_dir, ref_in_fn)
    print('Loading', fname)
    ref_amr_entries = load_amr_entries(fname)
    ref_in_graphs = [get_graph_only(e) for e in ref_amr_entries]
    print('Loaded %d reference graphs' % len(ref_in_graphs))

    # Load the generated graphs
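# Aside: a tiny writer sketch of the raw format that load_raw_generated() above parses,
# reconstructed from the parsing code: one serialized graph per line, prefixed with a
# 0/1 clip flag and a space. Whether the flag marks graphs clipped at the model's max
# generation length is an assumption; the file name and graph string are illustrative.
gen_graphs = ['( w / want-01 :ARG0 ( b / boy ) )']
clipped = [False]
with open('raw_generated.txt', 'w') as f:
    for clip, gstring in zip(clipped, gen_graphs):
        f.write('%d %s\n' % (int(clip), gstring))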
#!/usr/bin/python3
import setup_run_dir    # Set the working directory and python sys.path to 2 levels above (assumed header)
import logging
import json
import random
import numpy
import torch
from amrlib.models.parse_spring.trainer import Trainer
from amrlib.utils.logging import setup_logging, silence_penman, WARN

# Seed random generators for consistent results
random.seed(0)
torch.manual_seed(0)
numpy.random.seed(0)

# For bart-large
# There are ~16068 batches in the training data for batch_size = 500
# On a Titan X (12GB, fp32=6.7 TFlops) training takes 80 minutes/epoch including
# about 6 minutes for prediction/smatch testing.
if __name__ == '__main__':
    logging.getLogger('transformers.tokenization_utils_base').setLevel(logging.ERROR)   # skip tokenizer warning
    setup_logging(logfname='logs/train_parse_spring.log', level=WARN)
    silence_penman()

    # Paths
    config_fn = 'configs/model_parse_spring.json'
    # checkpoint = 'data/model_parse_spring/checkpoint_epoch_08_smatch_8422.pt'
    checkpoint = None    # start from scratch

    # Load the config file
    with open(config_fn) as f:
        config = json.load(f)

    # Setup the training data locations
    config['train'] = 'amrlib/data/amr_annotation_3.0/data/amrs/split/training/*.txt'
    config['dev'] = 'amrlib/data/amr_annotation_3.0/data/amrs/split/dev/*.txt'
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up
import warnings
warnings.simplefilter('ignore')
import os
import json
from amrlib.utils.logging import setup_logging, WARN
from amrlib.models.generate_t5.trainer import Trainer

if __name__ == '__main__':
    setup_logging(logfname='logs/train_t5gen.log', level=WARN)
    config_fn = 'configs/model_generate_t5.json'
    with open(config_fn) as f:
        args = json.load(f)
    trainer = Trainer(args)
    trainer.train()
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up (assumed header)
import os
import logging
import penman
from penman.models.noop import NoOpModel
from amrlib.graph_processing.amr_loading import load_amr_entries, get_graph_only
from amrlib.utils.logging import setup_logging, WARN, DEBUG
from amrlib.evaluate.smatch_enhanced import get_entries, compute_smatch
from amrlib.models.parse_t5.penman_serializer import PenmanDeSerializer, load_and_serialize

logger = logging.getLogger(__name__)

# Code to take the reference graphs, serialize them and then deserialize them.
# The only purpose of this code is to test the effectiveness of the penman_serializer.py code.
# Ideally the process would be lossless, giving a SMATCH score of 1.0.
if __name__ == '__main__':
    setup_logging(logfname='logs/serial_deserial.log', level=WARN)
    corpus_dir = 'amrlib/data/LDC2020T02'
    in_fn = 'test.txt'
    out_dir = 'amrlib/data/test_parse_t5'
    ref_out_fn = in_fn + '.roundtrip_ref'
    gen_out_fn = in_fn + '.roundtrip_gen'

    # Make the out directory
    os.makedirs(out_dir, exist_ok=True)

    # Load the reference graphs
    fname = os.path.join(corpus_dir, in_fn)
    print('Loading', fname)
    ref_amr_entries = load_amr_entries(fname)
    ref_in_graphs = [get_graph_only(e) for e in ref_amr_entries]
    print('Loaded %d reference graphs' % len(ref_in_graphs))
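# Aside: what load_and_serialize() produces, i.e. the input side of the roundtrip test.
# The dict keys below are the same ones used by the parse_xfm/parse_t5 test scripts in
# this collection; the file path is the one defined above.
from amrlib.models.parse_t5.penman_serializer import load_and_serialize

entries = load_and_serialize('amrlib/data/LDC2020T02/test.txt')
print(entries['sents'][0])      # source sentence
print(entries['serials'][0])    # linearized form the model is trained to emit
print(entries['graphs'][0])     # original penman graph string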
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up (assumed header)
import warnings
warnings.simplefilter('ignore')
import os
from amrlib.utils.logging import silence_penman, setup_logging, WARN, ERROR
from amrlib.models.parse_t5.inference import Inference
from amrlib.models.parse_t5.penman_serializer import load_and_serialize

# This code is for debug only.
# It runs the model inference (aka generate) but bypasses the deserialize process so
# the graphs can be saved in raw format. This makes testing changes to the deserializer
# much easier, since generate takes about 40 minutes but deserialization can be done in seconds.
#
# Note tdata_gsii was created with 30_Model_Parse_GSII/10_Annotate_Corpus.py and 12_RemoveWikiData.py
# This can be changed and the raw LDC data could be used.
if __name__ == '__main__':
    setup_logging(logfname='logs/generate_no_deserialize.log', level=ERROR)
    silence_penman()
    device = 'cuda:0'
    corpus_dir = 'amrlib/data/tdata_gsii/'
    ref_in_fn = 'test.txt.features.nowiki'    # 1898 amr entries
    model_dir = 'amrlib/data/model_parse_t5'
    save_dir = 'amrlib/data/test_parse_t5'
    gen_out_fn = 'test.txt.generated'
    num_beams = 4
    batch_size = 12
    max_entries = None    # max test data to generate (use None for everything)

    # Make the out directory
    os.makedirs(save_dir, exist_ok=True)

    # Load and serialize the reference sentences to parse
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up
import warnings
warnings.simplefilter('ignore')
import os
import json
from amrlib.utils.logging import setup_logging, WARN
from amrlib.models.generate_t5wtense.trainer import Trainer

if __name__ == '__main__':
    setup_logging(logfname='logs/train_generate_t5wtense.log', level=WARN)
    config_fn = 'configs/model_generate_t5wtense.json'
    with open(config_fn) as f:
        args = json.load(f)
    trainer = Trainer(args)
    trainer.train()
#!/usr/bin/python3
import setup_run_dir    # this import tricks script to run from 2 levels up
import os
from amrlib.utils.logging import setup_logging, WARN
from amrlib.models.parse_gsii.inference import Inference

if __name__ == '__main__':
    setup_logging(logfname='logs/generate.log', level=WARN)
    device = 'cuda:0'
    model_dir = 'amrlib/data/model_parse_gsii'
    model_fn = 'epoch200.pt'
    data_dir = 'amrlib/data/tdata_gsii'
    test_data = 'test.txt.features.nowiki'
    out_fn = model_fn + '.test_generated'
    infer = Inference(model_dir, model_fn, device=device)
    infer.reparse_annotated_file(data_dir, test_data, model_dir, out_fn)