def setUp(self):
    """Build the shared fixtures: abstract expressions, extended wrappers,
    a bilingual phrase set, two parallel MWEs, a bilingual phrase, and the
    class-level synonym dictionary used by ParallelMWE."""
    unittest.TestCase.setUp(self)
    # Plain abstract-syntax expressions (some containing wildcard functions).
    self.myexpr = pgf.readExpr("AdjCN (PositA crucial_A) (UseN item_N)")
    self.myexprw = pgf.readExpr("AdjCN (PositA crucial_A) (UseN wildcard_1)")
    self.myexprq = pgf.readExpr("CompoundCN ? wildcard_3 (AdjCN (PositA wildcard_1) (UseN wildcard_2))")
    self.myexprs = pgf.readExpr('(PredVP (DetCN (DetQuant IndefArt NumSg) (PossNP (AdjCN (PositA complete_A) (UseN collapse_N)) (UseQuantPN DefArt (SymbPN (MkSymb "U"))))) (UseComp (CompNP (MassNP (UseN dollar_N)))))')
    # Extended wrappers around the raw expressions.
    self.extExpr = ExtendedExpr(self.myexpr, None)
    self.extExprW = ExtendedExpr(self.myexprw, None)
    self.extExprQ = ExtendedExpr(self.myexprq, None)
    self.extExprS = ExtendedExpr(self.myexprs, None)
    self.bilingualPhraseSet = BilingualPhraseSet()
    self.bilingualPhraseSet.add("NATO ||| la OTAN ||| 0-0 0-1")
    self.mwe1 = ParallelMWE()
    self.mwe1.parse("( MassNP ( UseN safety_N ) ) | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( UseN security_N ) )")
    self.mwe2 = ParallelMWE()
    self.mwe2.parse("( PossNP ( UseN wildcard_1 ) ( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) ) ) | ( PossNP ( UseN wildcard_1 ) ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) ) )")
    self.bilphrase = BilingualExpr()
    self.bilphrase.parse("( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) ) | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) )", ignoreFreq=True)
    # Class-level synonym mapping shared by all ParallelMWE instances.
    synonyms = {"politics_N": {"policy_N"}}
    ParallelMWE.synonymDict = synonyms
def _condition_subtree(api_response):
    """Turn the weather-condition description of *api_response* into an
    abstract-syntax expression.

    Assumes an OpenWeatherMap-style payload with api_response["weather"][0]
    ["description"] — TODO confirm against the caller.  Spaces in the
    description are replaced with underscores to form a GF function name;
    a missing key yields the fallback expression ``UnknownCondition``.
    """
    try:
        name = api_response["weather"][0]["description"].replace(" ", "_")
        return pgf.readExpr(name)
    except KeyError:
        return pgf.readExpr("UnknownCondition")
def testPrint(self):
    """The string form of an ExtendedExpr must re-parse to the original expr."""
    for extended, plain in ((self.extExpr, self.myexpr),
                            (self.extExprS, self.myexprs)):
        reparsed = pgf.readExpr(str(extended))
        self.assertEqual(str(plain), str(reparsed))
def parse(self, rawstr, ignoreFreq=False):
    """Parse a " | "-separated record into this object.

    Layout is ``freq | sl-expr | tl-expr``; with ``ignoreFreq=True`` the
    leading frequency field is absent and only the two expressions are read.
    Sets self.freq (unless ignored), self.slexpr and self.tlexpr, and
    precomputes the leaf functions of both extended expressions.
    """
    parts = rawstr.split(" | ")
    if ignoreFreq:
        offset = -1
    else:
        self.freq = int(parts[0])
        offset = 0
    self.slexpr = ExtendedExpr(pgf.readExpr(parts[1 + offset]), None)
    self.slexpr.compute_leaf_functions_recursively()
    self.tlexpr = ExtendedExpr(pgf.readExpr(parts[2 + offset]), None)
    self.tlexpr.compute_leaf_functions_recursively()
def test_linearize_rus(rus, linearized, parsed, only):
    """Check that linearizing *parsed* in Russian yields *linearized*.

    Skipped unless "l" is among the selected test kinds in *only*
    (an empty/None *only* runs everything).  NOTE(review): uses the
    Python 2 ``unicode`` builtin, like the surrounding code.
    """
    if only and "l" not in only:
        return
    expr = pgf.readExpr(parsed)
    actual = unicode(rus.linearize(expr), "utf-8")
    # Normalize sentence-final period spacing before comparing.
    assert actual.replace(" .", ".") == linearized.replace(" .", ".")
def _temperature_subtree(api_response):
    """Build a numeric subtree for the temperature in *api_response*.

    Reads api_response["main"]["temp"] (Kelvin, per the 273.15 offset),
    converts to Celsius and delegates to ``float_tree``; a missing key
    yields the fallback ``UnknownNum`` expression.
    """
    try:
        celsius = api_response["main"]["temp"] - 273.15
        return float_tree(celsius)
    except KeyError:
        return pgf.readExpr("UnknownNum")
def _float_subtree(api_response, keys):
    """Build a numeric subtree for the value found under *keys* in
    *api_response* (lookup delegated to ``lookup_value``); a missing key
    yields the fallback ``UnknownNum`` expression."""
    try:
        return float_tree(lookup_value(api_response, keys))
    except KeyError:
        return pgf.readExpr("UnknownNum")
def parse_line(line):
    """Parse one TSV treebank line ``sentid<TAB>parsetime<TAB>prob<TAB>tree``.

    Returns (sentid, parsetime, parseprob, expr) where expr is the parsed
    abstract tree or None when the tree field is empty.  Returns None for a
    malformed line (after reporting it on stderr).

    BUG FIX: the original fell through after the except-branch, crashing
    with NameError on the unbound ``parserepr``; it now returns None.
    Stray C-style semicolons removed.
    """
    try:
        sentid, parsetime, parserepr = line.strip('\n').split('\t', 2)
    except ValueError:
        print("Line not in proper format: %s" % (line), file=stderr)
        return None
    parseprob, abstree = parserepr.split('\t') if parserepr.strip() \
        else (0, '')
    return (int(sentid), float(parsetime), float(parseprob),
            pgf.readExpr(abstree) if abstree else None)
def parse_line(line):
    """Parse one TSV treebank line ``sentid<TAB>parsetime<TAB>prob<TAB>tree``.

    Returns (sentid, parsetime, parseprob, expr) where expr is the parsed
    abstract tree or None when the tree field is empty.  Returns None for a
    malformed line (after reporting it on stderr).

    BUG FIX: the original fell through after the except-branch, crashing
    with NameError on the unbound ``parserepr``; it now returns None.
    """
    try:
        sentid, parsetime, parserepr = line.strip('\n').split('\t', 2)
    except ValueError:
        print("Line not in proper format: %s" % (line), file=stderr)
        return None
    parseprob, abstree = parserepr.split('\t') if parserepr.strip() \
        else (0, '')
    return (int(sentid), float(parsetime), float(parseprob),
            pgf.readExpr(abstree) if abstree else None)
def regression_test(grammar, treelin_pairs, lang_code):
    """Linearize each (tree, expected) pair in the concrete language
    ``grammar.abstractName + lang_code`` and collect mismatches.

    Returns a list of (tree, expected, actual) triples; empty means all
    pairs linearized as expected.
    """
    conc = grammar.languages[grammar.abstractName + lang_code]
    failures = []
    for tree, expected in treelin_pairs:
        actual = conc.linearize(pgf.readExpr(tree))
        if actual != expected:
            failures.append((tree, expected, actual))
    return failures
def country_texts_embedded(factsys, data):
    """Build one three-sentence country document per row of *data* using the
    embedded grammar module ``G``.

    Each document states the country's population, its area, and its capital
    and currency.  Rows are presumably namedtuple-like records with
    ``capital``/``continent``/``currency``/``population``/``area`` fields and
    the country name at index 0 — TODO confirm against the caller.

    FIXES: the loop variable no longer shadows the builtin ``tuple``; the
    unused local ``fields = factsys.fieldnames.split()`` was removed.
    """
    factsys.grammar.embed("G")
    import G
    facts = []
    for row in data:
        countr = factsys.str2exp("Name", row[0])
        cap = factsys.str2exp('Name', row.capital)
        cont = factsys.str2exp('Name', row.continent).unpack()[1][0]  # CDNAME
        curr = factsys.str2exp('Name', row.currency)
        pop = pgf.readExpr(str(row.population))
        are = pgf.readExpr(str(row.area))
        # Sentence 1: "<country> is a <continent> country with N inhabitants".
        doc = G.OneSentenceDoc(
            G.FactSentence(
                G.KindFact(
                    G.NameObject(countr),
                    G.ModifierKind(
                        G.PropertyKind(G.cdProperty(cont), G.country_Kind),
                        G.NumericKindModifier(G.IntNumeric(pop),
                                              G.inhabitant_Kind)))))
        # Sentence 2: area attribute.
        doc = G.AddSentenceDoc(
            doc,
            G.FactSentence(
                G.AttributeFact(G.area_Attribute, G.PronObject(countr),
                                G.NumericValue(G.IntNumeric(are)))))
        # Sentence 3: capital and currency, conjoined.
        doc = G.AddSentenceDoc(
            doc,
            G.ConjSentence(
                G.FactSentence(
                    G.AttributeFact(G.capital_Attribute, G.NameObject(countr),
                                    G.NameValue(cap))),
                G.FactSentence(
                    G.AttributeFact(G.currency_Attribute, G.PronObject(countr),
                                    G.NameValue(curr)))))
        facts.append(doc)
    return facts
def _winddirection_subtree(api_response):
    """Map the wind bearing in *api_response* to a compass-direction expression.

    Reads api_response["wind"]["deg"] and buckets it into 45-degree sectors
    centered on the eight compass points; a missing key yields ``UnknownDir``.
    """
    try:
        degrees = api_response["wind"]["deg"]
    except KeyError:
        return pgf.readExpr("UnknownDir")
    # North wraps around 0/360.
    if degrees < 22.5 or degrees > 337.5:
        return pgf.readExpr("North")
    sectors = ((67.5, "NorthEast"), (112.5, "East"), (157.5, "SouthEast"),
               (202.5, "South"), (247.5, "SouthWest"), (292.5, "West"),
               (337.5, "NorthWest"))
    for upper, name in sectors:
        if degrees < upper:
            return pgf.readExpr(name)
    return pgf.readExpr("North")
def main():
    """Linearize numerals with the first language of Numeral.pgf.

    Reads one number from stdin; an empty input falls back to a fixed
    demonstration list.
    """
    grammar = pgf.readPGF("Numeral.pgf")
    eng = list(grammar.languages.values())[0]
    raw = input()
    if raw:
        numbers = [int(raw)]
    else:
        numbers = [1, 2, 8, 10, 11, 20, 21, 40, 95,
                   100, 101, 234, 1000, 1001, 2021, 630511]
    for n in numbers:
        print(n, eng.linearize(int2numeral_tree(pgf.readExpr(str(n)))))
def listen():
    """Dialogue endpoint: advance the order-filling state machine by one step.

    State lives in the (Flask-style) ``session``: IDLE parses a fresh order,
    RECEIVING fills one missing slot from the user's reply, FILLING looks for
    the next missing slot, DONE confirms the order and resets to IDLE.
    NOTE(review): reconstructed from a whitespace-mangled original — verify
    branch nesting against version control.
    """
    msg = request.args.get('msg', '')
    if msg == 'START':
        session['cur_state'] = STATE_IDLE
        return cgr.printName('Order')
    if session['cur_state'] == STATE_IDLE:
        org_order = cgr.parse(msg).next()[1]
        session['unpacked_order'] = unpack(org_order)
        session['cur_state'] = STATE_FILLING
    if session['cur_state'] == STATE_RECEIVING:
        # Parse the reply in the category of the slot we asked about,
        # then graft it into the order tree at the remembered position.
        parsed_msg = cgr.parse(msg, cat=pgf.Type(session['cur_type'])).next()[1]
        cur_t = traverse(session['unpacked_order'], session['cur_tid'][:-1])
        cur_t[1][session['cur_tid'][-1]] = unpack(parsed_msg)
        session['cur_state'] = STATE_FILLING
    if session['cur_state'] == STATE_FILLING:
        session['cur_tid'] = fill_missing(session['unpacked_order'])
        print(session['cur_tid'])
        if session['cur_tid'] is None:
            session['cur_state'] = STATE_DONE
        else:
            cur_t = traverse(session['unpacked_order'], session['cur_tid'][:-1])
            session['cur_state'] = STATE_RECEIVING
            session['cur_type'] = str(
                gr.functionType(cur_t[0]).unpack()[0][session['cur_tid'][-1]][2])
            print(session['cur_type'])
            if session['cur_type'] == 'ListItem':
                return cgr.printName(session['cur_type'])
            else:
                pre_str = cgr.linearize(pgf.readExpr(repack(cur_t))).replace('? ', '')
                return 'For ' + pre_str + '<br/>' + cgr.printName(session['cur_type'])
    if session['cur_state'] == STATE_DONE:
        confirm_order = ('confirm', session['unpacked_order'][1])
        confirm = cgr.linearize(pgf.readExpr(repack(confirm_order)))
        session['cur_state'] = STATE_IDLE
        return confirm
def translateWord(grammar, language, tgtlanguages, word):
    """Translate a single word from *language* into each of *tgtlanguages*.

    Strategy, in order: chunk-level translation; parsing the word as a
    'Chunk' and linearizing the first parse; morphological lookup, accepted
    when more than half the target languages can linearize the candidate;
    finally the word itself, untranslated.  Returns a list of
    (language, translation) pairs.  NOTE(review): Python 2 semantics —
    ``len(filter(...))`` requires filter to return a list.
    """
    chunk_translations = translateWordsAsChunks(grammar, language, tgtlanguages, word)
    if len(chunk_translations):
        return chunk_translations
    lowerword = word.lower()
    try:
        for expr in grammar.languages[language].parse(word, cat='Chunk'):
            # Only the first (best) parse is used.
            return [(lang, gf_utils.gf_postprocessor(
                        grammar.languages[lang].linearize(expr[1])))
                    for lang in tgtlanguages]
    except pgf.ParseError:
        morphAnalysis = (grammar.languages[language].lookupMorpho(word)
                         + grammar.languages[language].lookupMorpho(lowerword))
        for morph in morphAnalysis:
            countPositiveLanguages = filter(
                None,
                [grammar.languages[lang].hasLinearization(morph[0])
                 for lang in tgtlanguages])
            if len(countPositiveLanguages) > 0.5 * len(tgtlanguages):
                return [(lang, gf_utils.gf_postprocessor(
                            grammar.languages[lang].linearize(
                                pgf.readExpr(morph[0]))))
                        for lang in tgtlanguages]
    # Fall back to the untranslated word.
    return [(lang, word) for lang in tgtlanguages]
def pgf_linearize(args): grammar = pgf.readPGF(args.pgfgrammar); outputPrinter = postprocessor; inputSet = []; for line in args.inputstream: try: sentid, parsetime, parserepr = line.strip('\n').split('\t', 2); except ValueError: print line.strip(); parseprob, abstree = parserepr.split('\t') if parserepr.strip() else (0, ''); inputSet.append( (int(sentid), float(parsetime), float(parseprob), pgf.readExpr(abstree) if abstree else None) ); linearizer = grammar.languages[args.tgtlang].linearize; for sentid, _, _, abstree in inputSet: if abstree: print >>args.outputstream, str(outputPrinter(linearizer(abstree))); else: print >>args.outputstream, ""; return;
def generate_image(tree, grammar, filenamebase, outdirpath):
    """Render the abstract tree *tree* as a PNG under ``outdirpath/images``.

    Writes a Graphviz .dot file and invokes the ``dot`` command on it.
    Returns the PNG file name (not the full path).

    FIX: the dot file handle was leaked if writing raised; it is now
    managed with a ``with`` statement.
    """
    imagepath = os.path.join(outdirpath, "images")
    try:
        os.makedirs(imagepath)
    except FileExistsError:
        pass
    expr = pgf.readExpr(tree)
    dotfilepath = os.path.join(imagepath, filenamebase + '.dot')
    pngfilename = filenamebase + '.png'
    pngfilepath = os.path.join(imagepath, pngfilename)
    with open(dotfilepath, 'w') as dotfile:
        dotfile.write(grammar.graphvizAbstractTree(expr))
    subprocess.run(["dot", "-Tpng", dotfilepath, "-o", pngfilepath])
    return pngfilename
def readJohnsonRerankerTrees(inputStream):
    """Generator over Johnson-reranker output blocks.

    Each block starts with a header line "<count> <sentidx>", followed by
    *count* (probability, tree) line pairs.  Yields (sentidx, parsesBlock)
    where parsesBlock is a list of (float prob, parsed expr).  A blank
    probability line marks a truncated block; otherwise one separator line
    is consumed after each block.  Python 2 (``xrange``, ``.next()``).
    """
    endOfParse = False
    while True:
        sentheader = inputStream.next()
        if sentheader == '':
            break
        parsescount, sentidx = map(int, sentheader.strip().split())
        parsesBlock = []
        for _i in xrange(parsescount):
            parseprob = inputStream.next()
            if parseprob.strip() == '':
                endOfParse = True
                break
            parse = inputStream.next()
            parsesBlock.append((float(parseprob.strip()),
                                pgf.readExpr(parse.strip())))
        yield sentidx, parsesBlock
        if not endOfParse:
            # consume the blank separator line between blocks
            _ = inputStream.next()
        endOfParse = False
def gen_translations(args, lang_source, lang_target, line):
    """Yield translation records for one input line.

    In 'gold' mode the line is a 4-field TSV record whose tree (field 3) is
    re-linearized in the target language unless its score field is '-1'.
    Otherwise the line is (optionally tokenized and) parsed in the source
    language and each of the n-best trees is linearized.  Yields
    (source, target, score, tree) tuples.
    """
    if args.type == 'gold':
        fields = line.split('\t')
        if fields[2] == '-1':
            yield fields[0], fields[1], fields[2], fields[3]
        else:
            tree = pgf.readExpr(fields[3])
            utt_target = lang_target.linearize(tree).decode('utf8')
            yield fields[0], utt_target, fields[2], fields[3]
    else:
        utt_source = line
        if args.tokenize:
            if args.tokenize == 'simple':
                utt_source = tokenize_simple(line)
            elif args.tokenize == 'nltk':
                utt_source = tokenize_nltk(line)
        # Same n-best loop with or without a category restriction.
        if args.cat:
            parses = lang_source.parse(utt_source, n=args.n_best, cat=args.cat)
        else:
            parses = lang_source.parse(utt_source, n=args.n_best)
        for prob, tree in parses:
            utt_target = lang_target.linearize(tree).decode('utf8')
            yield utt_source, utt_target, prob, tree
#!/usr/bin/env python # coding=utf-8 # -*- encoding: utf-8 -*- import sys,pgf,argparse from lib.abstractLearningLib import ExtendedExpr if __name__ == "__main__": parser = argparse.ArgumentParser(description='prints only fun name.') parser.add_argument('--offset',default='0') args = parser.parse_args(sys.argv[1:]) offset=int(args.offset) for line in sys.stdin: parts = line.split("|") sl=parts[0+offset].strip() tl=parts[1+offset].strip() slrawexpr=pgf.readExpr(sl) tlrawexpr=pgf.readExpr(tl) slexpr=ExtendedExpr(slrawexpr,None) tlexpr=ExtendedExpr(tlrawexpr,None) print " | ".join( [ parts[i].strip() for i in range(offset)] + [slexpr.str_with_children_fun(),tlexpr.str_with_children_fun()])
out = [] for a in argx: out.extend(get_terminals(a)) return out def get_type(gr, fun): try: return gr.functionType(fun).cat except KeyError: return None if __name__ == "__main__": gr = pgf.readPGF('../data/translate-pgfs/TranslateEng.pgf') eng = gr.languages['TranslateEng'] lin = lambda fun: eng.linearize(pgf.readExpr(fun)) with open('../data/treebanks/rgl-api-trees.txt') as f: trees = [l.strip() for l in f] alts = [] for tree in trees: exp = pgf.readExpr(tree) terms = get_terminals(exp) lins = [eng.linearize(pgf.readExpr(w)) for w in terms] alts.append({ s: { x for x,_,_ in eng.lookupMorpho(l) if get_type(gr, x) == get_type(gr, s) if lin(x) == lin(s) }
treebank = json.loads(treebank_str) import pgf gr = pgf.readPGF(args.grammar) conc = gr.languages[args.concrete] results = [] times = [] import time for o in treebank: tree = o["Abs"] gold = o["Afr"] try: e = pgf.readExpr(tree) start = time.clock() l = conc.linearize(e) end = time.clock() if unicode(l,'utf8') == gold: results.append((o,l,"s",end - start)) else: results.append((o,l,"f",end - start)) except pgf.PGFError: pass summary_str = print_summary(args.grammar, args.concrete, results) result_file = codecs.open(args.result, 'w', 'utf8') result_file.write(summary_str) for (o,l,r,t) in results:
def string_year(s):
    """Build an ``inYearDate`` expression from the 4-character year prefix of *s*."""
    year = pgf.readExpr(s[:4])
    return pgf.Expr('inYearDate', [year])
if DEBUG:
    print >> sys.stderr, "source trees:"
    for t in sourceTreesRaw:
        print >> sys.stderr, t
    print >> sys.stderr, "target trees:"
    for t in targetTreesRaw:
        print >> sys.stderr, t

# Ignore trees with non-ascii characters, which make the GF bindings crash.
sourceExprs = []
for rawTree in sourceTreesRaw:
    try:
        rawTree.decode('ascii')  # rejects non-ascii data
        sourceExprs.append(pgf.readExpr(rawTree))
    except (pgf.PGFError, UnicodeDecodeError):
        print >> sys.stderr, "Could not parse SL expr: " + rawTree

targetExprs = []
for rawTree in targetTreesRaw:
    try:
        rawTree.decode('ascii')  # rejects non-ascii data
        targetExprs.append(pgf.readExpr(rawTree))
    except (pgf.PGFError, UnicodeDecodeError):
        print >> sys.stderr, "Could not parse TL expr: " + rawTree

# Ignore trees which cannot be linearized, since we won't be able to align them.
# Demo script: load the Crud grammar, parse two Russian examples (the
# second, lacking a final period, is expected to fail) and dump the
# full-form lexicons.  Python 2 (``unicode`` builtin).
gr = read_gf(PORTABLE_GRAMMAR_FILE)
print("Languages: {}".format(", ".join(gr.languages.keys())))
eng = gr.languages["CrudEng"]
fin = gr.languages["CrudFin"]
rus = gr.languages["CrudRus"]

PARSE_EXAMPLE = u"штука получилась ."
print(u"Parse: {}".format(PARSE_EXAMPLE))
parse_iter = rus.parse(PARSE_EXAMPLE.encode("utf-8"))
expr = [k for k in parse_iter][0][1]
# Round-trip the expression through its string form.
e = pgf.readExpr(str(expr))
print(u"Linearized: {}".format(unicode(eng.linearize(e), "utf-8")))

PARSE_EXAMPLE_2 = u"штука получилась"
print(u"Parse: {}".format(PARSE_EXAMPLE_2))
try:
    parse_iter = rus.parse(PARSE_EXAMPLE_2.encode("utf-8"))
except pgf.ParseError as x:
    print("expected ERROR: {}".format(x))

print("Lexicons")
for lang in eng, fin, rus:
    print("\n{}\n".format(lang.name))
    print(", ".join(entry[0] for entry in lang.fullFormLexicon()))
def replace_subtree_with_MWE_ref(self, subtreeindex, mweid=0):
    """Replace the child at *subtreeindex* with a wildcard-subtree
    reference expression carrying *mweid*."""
    ref = pgf.readExpr(ExtendedExpr.WILDCARD_SUBTREE_PREFIX + str(mweid))
    self.children[subtreeindex] = ExtendedExpr(ref, None)
import pgf import sys import sets import readline import locale sys.stdout.write("loading...") sys.stdout.flush() gr = pgf.readPGF("../../../treebanks/PennTreebank/ParseEngAbs.pgf") sys.stdout.write("\n") source_lang = gr.languages["ParseEng"] target_lang = gr.languages["ParseBul"] we = pgf.readExpr("UttImpSg PPos (ImpVP (UseV try_V))") print source_lang.linearize(we) sys.stdout.write("start cat: " + gr.startCat + "\n\n") class Completer(): def __init__(self, lang): self.gr = lang def complete(self, prefix, state): if state == 0: line = readline.get_line_buffer() line = line[0:readline.get_begidx()] self.i = source_lang.complete(line, prefix=prefix) self.tokens = sets.Set()
import pgf import sys import sets import readline import locale sys.stdout.write("loading...") sys.stdout.flush(); gr = pgf.readPGF("../../../treebanks/PennTreebank/ParseEngAbs.pgf") sys.stdout.write("\n") source_lang = gr.languages["ParseEng"] target_lang = gr.languages["ParseBul"] we = pgf.readExpr("UttImpSg PPos (ImpVP (UseV try_V))") print source_lang.linearize(we) sys.stdout.write("start cat: "+gr.startCat+"\n\n") class Completer(): def __init__(self, lang): self.gr = lang def complete(self, prefix, state): if state == 0: line = readline.get_line_buffer() line = line[0:readline.get_begidx()] self.i = source_lang.complete(line, prefix=prefix) self.tokens = sets.Set() if len(self.tokens) > 50:
def round_int(s, r):
    """Truncate the integer written in *s* to its first *r* digits,
    zeroing the remainder, and return it as an expression.

    E.g. s="12345", r=2 -> expression for 12000.
    """
    power = 10 ** (len(s) - r)
    truncated = (int(s) // power) * power
    return pgf.readExpr(str(truncated))
# Interactive-transcript demo: each command is echoed (">>> ...") and then
# executed, showing how to build and linearize Facts.pgf expressions.
print(">>> gr = pgf.readPGF('Facts.pgf')")
gr = pgf.readPGF('Facts.pgf')

print('>>> print(list(gr.languages.keys()))')
print(list(gr.languages.keys()))

print(">>> eng = gr.languages['FactsEng']")
eng = gr.languages['FactsEng']

print(">>> attr = pgf.Expr('area_Attribute',[])")
attr = pgf.Expr('area_Attribute', [])

print(">>> eng.linearize(attr)")
print(eng.linearize(attr))

print(">>> obj = pgf.readExpr('NameObject (StringName \"France\")')")
obj = pgf.readExpr('NameObject (StringName "France")')

print(">>> val = pgf.readExpr('123')")
val = pgf.readExpr('123')

print(">>> fact = pgf.Expr('AttributeFact',[attr,obj,val])")
fact = pgf.Expr('AttributeFact', [attr, obj, val])

print(">>> print(fact)")
print(fact)

print(">>> print(eng.linearize(fact))")
print(eng.linearize(fact))
def string_expr(s):
    """Read *s* as a GF string-literal expression by wrapping it in
    double quotes.

    FIX: dropped the redundant ``str()`` call — concatenating str
    literals already yields a str.
    """
    return pgf.readExpr('"' + s + '"')
def string_value(s):
    """Build a ``StringValue`` expression wrapping *s* as a quoted
    string literal.

    FIX: dropped the redundant ``str()`` call — concatenating str
    literals already yields a str.
    """
    return pgf.Expr('StringValue', [pgf.readExpr('"' + s + '"')])
def mkInt(s):
    """Read *s* (a number, or its string form) as a numeric literal expression."""
    return pgf.readExpr(str(s))