def run(args):
    """Build a PCFG from the treebank and run the stages selected in ``args``.

    The treebank at ``TREEBANK_PATH`` is loaded and split with
    ``loader.train_test_split`` (called with 0.8, 0.1, 0.1). A ``PCFG`` is
    constructed from and trained on the first part of the split, then
    ``set_oov`` is called with ``OOV`` and the word embeddings loaded from
    ``EMBEDDING_PATH``.

    Args:
        args: Object exposing the attributes ``generate_output``,
            ``evaluation`` and ``parse`` (each tested for truthiness) and,
            when ``parse`` is truthy, ``txt_path`` (path of the text file
            to read).

    Returns:
        None. The results of the individual stages are not returned.
    """
    treebank = loader.load_treebanks(TREEBANK_PATH)
    # The middle part of the split is unpacked but never used here.
    train_set, _dev_set, test_set = loader.train_test_split(
        treebank, 0.8, 0.1, 0.1)
    vocab, vectors = loader.load_word_embeddings(EMBEDDING_PATH)

    grammar = PCFG(train_set)
    grammar.train(train_set)
    grammar.set_oov(OOV, vocab, vectors)

    if args.generate_output:
        # The return value is not used by this function.
        grammar.generate_output(test_set)

    if args.evaluation:
        # NOTE(review): only the first two test items are evaluated; this
        # looks like a debugging leftover -- confirm before relying on it.
        # The call must return exactly two values; neither is used here.
        _accs, _nb_no_parse = grammar.predict(test_set[:2])

    if args.parse:
        with open(args.txt_path, 'r') as txt_file:
            # Splitting on '\n' keeps a trailing empty string when the
            # file ends with a newline.
            sentences = txt_file.read().split('\n')
        grammar.parse_from_txt(sentences)
# Command-line script: build a PCFG from the corpus given by --corpus, call
# parse_corpus() on it, then call predict() with the --sentences and
# --outfile values.
import argparse

from pcfg import PCFG

parser = argparse.ArgumentParser()
# All three options are optional strings; argparse leaves an omitted option
# as None.
for _flag, _help in (
    ("--corpus", "training treebank corpus"),
    ("--sentences", "raw token sentences"),
    ("--outfile", "name of the output file"),
):
    parser.add_argument(_flag, help=_help, type=str)
args = parser.parse_args()

grammar = PCFG(args.corpus)
grammar.parse_corpus()
grammar.predict(args.sentences, args.outfile)