def __init__(self, **kwargs):
    """Initialize from keyword options and load the two grammars.

    Expects at least ``language``, ``translate`` and ``verbose`` among the
    keyword arguments; all of them become instance attributes.
    """
    self.__dict__.update(kwargs)
    # The first (and only) concrete language of each PGF is used.
    source_langs = pgf.readPGF(self.language).languages
    self.in_grammar = list(source_langs.values())[0]
    target_langs = pgf.readPGF(self.translate).languages
    self.out_grammar = list(target_langs.values())[0]
    if self.verbose:
        logging.basicConfig(level=logging.DEBUG)
def main():
    """Generate one HTML article page per concrete language in Countries.pgf.

    Each page contains a language-link navigation bar, a world article, one
    article per continent and one per country.
    """
    countries = get_countries()
    continents = {c.continent for c in countries}
    worldtree = continent_article(countries, (lambda x: True), "world")
    continenttrees = [
        continent_article(countries, (lambda x: x.continent == c), c)
        for c in continents
    ]
    countrytrees = [country_article(c) for c in countries]
    gr = pgf.readPGF("Countries.pgf")
    langs = list(gr.languages.values())
    links = ' '.join([
        '<a href="py_' + lang.name + '.html">' + lang.name + '</a>'
        for lang in langs
    ])
    for lang in langs:
        text = [links, lang.linearize(worldtree)]
        for tree in continenttrees:
            text.append(lang.linearize(tree))
        for tree in countrytrees:
            text.append(lang.linearize(tree))
        # FIX: use a context manager so the file is closed even on error,
        # and avoid shadowing the (Python 2) builtin name `file`.
        with open("py_" + lang.name + ".html", "w") as out:
            out.write('\n<p>'.join(text))
def pgf_linearize(args):
    """Linearize abstract trees into args.tgtlang.

    Input lines are tab-separated: sentid, parsetime, then optionally
    parseprob and the abstract tree.  Sentences without a tree produce an
    empty output line.
    """
    grammar = pgf.readPGF(args.pgfgrammar)

    def parse_line(line):
        try:
            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2)
        except ValueError:
            print("Line not in proper format: %s" % (line), file=stderr)
            # FIX: the original fell through here and crashed with a
            # NameError on the unassigned `parserepr`; skip the line instead.
            return None
        parseprob, abstree = parserepr.split('\t') if parserepr.strip() \
            else (0, '')
        return (int(sentid), float(parsetime), float(parseprob),
                pgf.readExpr(abstree) if abstree else None)

    inputSet = (rec for rec in map(parse_line, args.inputstream)
                if rec is not None)
    outputPrinter = postprocessor
    linearizer = grammar.languages[args.tgtlang].linearize
    for sentid, _, _, abstree in inputSet:
        if abstree:
            print(str(outputPrinter(linearizer(abstree))),
                  file=args.outputstream)
        else:
            print("", file=args.outputstream)
def main():
    """Run the Countries fact system over the countries TSV data file."""
    grammar = pgf.readPGF('Countries.pgf')
    fields = 'country capital area population continent currency'
    factsys = FactSystem(fields, grammar, 'CountriesEng')
    factsys.run('../data/countries.tsv', world_texts)
def compile_grammar(name, abstract, instances):
    """Compile an abstract GF grammar plus concrete instances into a PGF.

    Writes the sources into a temporary directory, shells out to `gf`,
    and returns the loaded pgf.PGF object, or None when compilation fails.
    """
    with TemporaryDirectory() as tmpdir:
        logger.debug("Created temp dir: {}".format(tmpdir))
        abstract_path = "{0}/{1}.gf".format(tmpdir, name)
        with open(abstract_path, "w") as f:
            logger.debug("Wrote tmp file: {}".format(abstract_path))
            f.write(abstract["content"])
        concrete_grammars = list(
            compile_concrete_grammar(tmpdir, name, instances))
        logger.info("Compiling")
        cmd = ("gf -i /opt/gf/lang-utils/ -i /opt/gf/concept-net/ "
               "--output-dir={path} -make {abstract} {other}").format(
                   abstract=abstract_path,
                   path=tmpdir,
                   other=" ".join(concrete_grammars))
        logger.debug("Compile command: {}".format(cmd))
        proc = subprocess.Popen(cmd, shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        result, error = proc.communicate()
        if proc.returncode != 0:
            logger.error(error)
            return None
        logger.debug("Compiled successfuly! Message: {}".format(result))
        grammar = pgf.readPGF("{0}/{1}.pgf".format(tmpdir, name))
        logger.debug("Languages: {}".format(grammar.languages))
        return grammar
def main():
    """Run the Nobel-prize fact system over the nobel TSV data file."""
    grammar = pgf.readPGF('Nobel.pgf')
    fields = 'winner sex prize year born city country dead'
    factsys = FactSystem(fields, grammar, 'NobelEng')
    factsys.run('../data/nobel.tsv', nobel_texts)
def compile_grammar(name, content):
    """Compile a set of GF modules (mapping module name -> source) into a PGF.

    `name` is the main module; all other keys of `content` are passed as
    additional files.  Returns (grammar, None) on success or (None, error)
    when the `gf` compiler fails.
    """
    with TemporaryDirectory() as tmpdir:
        logger.debug("Created temp dir: {}".format(tmpdir))
        files = [
            "{0}/{1}.gf".format(tmpdir, k) for k in content.keys()
            if k != name
        ]
        for k, v in content.items():
            with open("{0}/{1}.gf".format(tmpdir, k), "w",
                      encoding="UTF-8") as f:
                # FIX: the original caught UnicodeEncodeError, but in
                # Python 3 bytes.decode raises UnicodeDecodeError and str
                # has no .decode() (AttributeError), so the fallback never
                # ran and could try to write bytes to a text-mode file.
                f.write(v.decode('utf-8') if isinstance(v, bytes) else v)
        logger.info("Compiling")
        cmd = ("gf -i /opt/gf/lang-utils/ -i /opt/gf/concept-net/ "
               "--output-dir={path} -make {files} {main}").format(
                   path=tmpdir,
                   main="{0}/{1}.gf".format(tmpdir, name),
                   files=" ".join(files))
        logger.debug("Compile command: {}".format(cmd))
        proc = subprocess.Popen(cmd, shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        (result, error) = proc.communicate()
        if proc.returncode != 0:
            logger.error(error)
            return None, error
        logger.debug("Compiled successfuly! Message: {}".format(result))
        grammar = pgf.readPGF("{0}/{1}.pgf".format(tmpdir, name))
        logger.debug("Languages: {}".format(grammar.languages))
        return grammar, None
def main():
    """Map GF abstract entries read from stdin to dictionary senses.

    Prints one tab-separated row per matched sense.  Modernized from
    Python-2-only print statements to the print() function used by the
    rest of the file; `== None` replaced with `is None`.
    """
    dictionary = readDictTable(sys.argv[1])
    if len(sys.argv) > 2:
        import pgf
        gfGrammar, langname = pgf.readPGF(sys.argv[2]), sys.argv[3]
        dictEntries_GFmap = getGFAbstractNames(dictionary,
                                               gfGrammar.languages[langname])
    else:
        gfGrammar = None
    gf_entries = readGFEntries(sys.stdin)
    pool = multiprocessing.Pool(3)
    for line in gf_entries:
        entry = findClosestEntrytoAbstractFunction(line.strip(), dictionary,
                                                   pool=pool)
        if entry is None:
            print('%s\tNone' % (line.strip()))
        else:
            matchedLemma = entry[0]
            lemmaCategory = '%s_%s' % (entry[1][0], entry[1][1])
            lemmaCategory = lemmaCategory.strip('_')
            for idx, sense in enumerate(entry[2]):
                lemma, category = line.strip().rsplit('_', 1)
                # Number the new name only when the entry has several senses.
                newname = ('%s_%d_%s' % (lemma, idx + 1, category)
                           if len(entry[2]) > 1 else line)
                print('%s\t%s\t%s\t%s\t%s\t%s' % (line, newname, matchedLemma,
                                                  lemmaCategory, sense[0],
                                                  sense[1]))
    # Tell the generator-based reader we are done.
    gf_entries.send('finished')
def readPGF(pgfPath, verbose=False):
    """Load a PGF grammar from *pgfPath*, optionally reporting progress.

    Returns the loaded pgf.PGF object.
    """
    if verbose:
        # FIX: the original referenced the undefined name `pgfArg` here,
        # raising NameError whenever verbose=True.  Also modernized the
        # Python 2 print statements to print().
        print("loading " + pgfPath + "...", end="")
        sys.stdout.flush()
    p = pgf.readPGF(pgfPath)
    if verbose:
        print("done")
    return p
def main():
    """Print linearized country facts in every language of the grammar."""
    grammar = pgf.readPGF(pgf_file)
    countries = get_countries(country_file)
    for lang in list(grammar.languages.values()):
        lines = [
            lang.linearize(tree)
            for country in countries
            for tree in country_facts(country)
        ]
        print('\n'.join(lines))
def main():
    """Execute one line of input as a draw command; quit on a second Enter."""
    window = GraphWin("GF Draw", 1000, 1000)
    grammar = pgf.readPGF(absmodule + ".pgf")
    eng = grammar.languages[langname]
    sentence = input()
    parses = eng.parse(sentence.lower())
    _, tree = next(parses)
    execute(tree, window)
    # Keep the window open until the user presses Enter again.
    input()
def main():
    """Read one line, parse it in the source language, print its translation."""
    # Load the grammar and pick the source/target concrete syntaxes.
    grammar = pgf.readPGF(absmodule + ".pgf")
    source = grammar.languages[fromname]
    target = grammar.languages[toname]
    # Parse a line of input in the "from" language, linearize in the "to" one.
    sentence = input("")
    _, tree = next(source.parse(sentence))
    print(target.linearize(tree))
def _preprocess(grammar, prefix=DEFAULT_PREFIX, name=DEFAULT_NAME,
                override=False):
    """Export *grammar* to GF, compile it, and load the concrete language.

    NOTE: on compile failure only a message is printed and loading is still
    attempted (original behavior preserved).
    """
    exported = export(grammar, prefix, name, override)
    rc = compile_gf_grammar(prefix, exported)
    if rc != 0:
        print("Grammar could not be compiled! (return code", rc, ")")
    pgf_path = os.path.join(prefix, exported + COMPILED_SUFFIX)
    return pgf.readPGF(pgf_path).languages[exported + LANGUAGE]
def pgf_klinearize(args):
    """Linearize K-best reranker parse blocks and print Moses n-best output.

    Modernized from Python-2-only constructs (`itertools.imap`,
    `print >> stream`) to match the Python 3 variant of this function
    elsewhere in the file.
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    outputPrinter = printMosesNbestFormat
    inputSet = [(sentid, parsesBlock)
                for sentid, parsesBlock
                in readJohnsonRerankerTrees(args.inputstream)]
    sentIdsList = map(itemgetter(0), inputSet)
    parsesBlocks = map(itemgetter(1), inputSet)
    for transBlock in getKLinearizations(grammar, args.tgtlang,
                                         parsesBlocks, args.K):
        strTrans = str(outputPrinter(transBlock, sentIdsList))
        if strTrans:
            print(strTrans, file=args.outputstream)
def main():
    """Read one line from stdin, parse it, and print the computed answer."""
    # Load the grammar and select the input language.
    grammar = pgf.readPGF(absmodule + ".pgf")
    lang = grammar.languages[langname]
    # Parse a single line and answer it.
    sentence = input("")
    _, tree = next(lang.parse(sentence))
    print(answer(tree))
def parse_text(name, content, text):
    """Parse *text* with a freshly compiled grammar or the default PGF.

    Returns a list of (language_name, [parse_tree_str, ...]) pairs; raises
    GFError when grammar compilation failed.
    """
    error = None
    if content:
        grammar, error = compile_grammar(name, content)
    else:
        grammar = pgf.readPGF("/grammars/LangEng.pgf")
    logger.debug("Grammar: {}".format(grammar))
    if grammar:
        logger.info("Parsing")
        results = []
        for lang_name, concrete in grammar.languages.items():
            trees = [str(expr) for _, expr in concrete.parse(text)]
            results.append((lang_name, trees))
        return results
    elif error:
        raise GFError(error)
def main():
    """Linearize numerals: one number read from stdin, or a demo list."""
    grammar = pgf.readPGF("Numeral.pgf")
    eng = list(grammar.languages.values())[0]
    entered = input()
    if entered:
        numbers = [int(entered)]
    else:
        numbers = [1, 2, 8, 10, 11, 20, 21, 40, 95, 100, 101, 234,
                   1000, 1001, 2021, 630511]
    for number in numbers:
        tree = int2numeral_tree(pgf.readExpr(str(number)))
        print(number, eng.linearize(tree))
def pgf_parse(args):
    """Parse each input sentence, printing index, parse time and best parse.

    Modernized from Python-2-only constructs (`itertools.imap`,
    `print >> stream`) to the print() function used elsewhere in the file.
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline
    preprocessor = lexer()
    inputSet = translation_pipeline.web_lexer(
        grammar, args.srclang, map(preprocessor, args.inputstream))
    outputPrinter = lambda X: "%f\t%s" % (X[0], str(X[1]))
    parser = getKBestParses(grammar, args.srclang, 1)
    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        print("%d\t%f\t%s" % (sentidx, time,
                              str(outputPrinter(parsesBlock[0]))
                              if len(parsesBlock) else ''),
              file=args.outputstream)
def pgf_klinearize(args):
    """Print Moses n-best translations for K-best reranker parse blocks."""
    grammar = pgf.readPGF(args.pgfgrammar)
    inputSet = [(sid, block) for sid, block
                in readJohnsonRerankerTrees(args.inputstream)]
    outputPrinter = printMosesNbestFormat
    sentIdsList = map(itemgetter(0), inputSet)
    parsesBlocks = map(itemgetter(1), inputSet)
    translations = getKLinearizations(grammar, args.tgtlang,
                                      parsesBlocks, args.K)
    for transBlock in translations:
        rendered = str(outputPrinter(transBlock, sentIdsList))
        if rendered:
            print(rendered, file=args.outputstream)
def pgf_klinearize(args):
    """Linearize reranked parse blocks and emit Moses n-best formatted text."""
    grammar = pgf.readPGF(args.pgfgrammar)
    pairs = [(sid, block) for sid, block
             in readJohnsonRerankerTrees(args.inputstream)]
    ids = map(itemgetter(0), pairs)
    blocks = map(itemgetter(1), pairs)
    printer = printMosesNbestFormat
    for transBlock in getKLinearizations(grammar, args.tgtlang, blocks,
                                         args.K):
        line = str(printer(transBlock, ids))
        if line:
            print(line, file=args.outputstream)
def main():
    """Translate stdin lines from source to target language, TSV output.

    Modernized: Python 3 stdin already yields str, so the Python-2-only
    `raw_line.decode('utf8')` was dropped; `exit` replaced with `sys.exit`.
    """
    args = get_args()
    gr = pgf.readPGF(args.pgf)
    try:
        lang_source = gr.languages[args.source]
        lang_target = gr.languages[args.target]
    except KeyError as e:
        print_utf8('Error: no such language: {0}'.format(e), file=sys.stderr)
        sys.exit(1)
    for raw_line in sys.stdin:
        line = raw_line.strip()
        try:
            for utt_source, utt_target, prob, tree in gen_translations(
                    args, lang_source, lang_target, line):
                print_utf8("{0}\t{1}\t{2}\t{3}".format(utt_source, utt_target,
                                                       prob, tree))
        except pgf.ParseError as e:
            # Report unparsable lines with probability -1 and the error text.
            print_utf8('{0}\t{1}\t{2}\t{3}'.format(line, '', -1, e))
def pgf_kparse(args):
    """Print K-best parses for each input sentence in reranker format.

    Modernized from Python-2-only constructs (`itertools.imap`,
    `print >> stream`) to match the Python 3 variants in this file.
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline
    preprocessor = lexer()
    inputSet = translation_pipeline.web_lexer(
        grammar, args.srclang, map(preprocessor, args.inputstream))
    outputPrinter = printJohnsonRerankerFormat
    parser = getKBestParses(grammar, args.srclang, args.K)
    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        strParses = str(outputPrinter(parsesBlock))
        # Skip blocks that rendered to an empty line.
        if strParses != '\n':
            print(strParses, file=args.outputstream)
def main():
    """Parse a stdin line and print the answer; optional language-code argv."""
    grammar = pgf.readPGF(absmodule + ".pgf")
    # An optional command-line suffix selects an alternative language.
    langcode = absmodule + sys.argv[1] if len(sys.argv) > 1 else langname
    lang = grammar.languages[langcode]
    sentence = input("")
    parseresult = lang.parse(sentence)
    _, tree = next(parseresult)
    print(answer(tree))
    # Warn once per additional parse of the same sentence.
    for _ in parseresult:
        print("WARNING: ambiguous")
def pgf_kparse(args):
    """Emit K-best parses in Johnson reranker format, one block per sentence."""
    grammar = pgf.readPGF(args.pgfgrammar)
    # Two-stage lexing: a basic tokenizer followed by the web-aware one.
    tokenize = Lexer().tokenize
    sentences = map(tokenize, args.inputstream)
    web_tokenize = Lexer('Web', grammar, args.srclang).tokenize
    sentences = map(web_tokenize, sentences)
    printer = printJohnsonRerankerFormat
    parser = getKBestParses(grammar, args.srclang, args.K)
    sentidx = 0
    for time, parsesBlock in map(parser, sentences):
        sentidx += 1
        rendered = str(printer(parsesBlock))
        if rendered != '\n':
            print(rendered, file=args.outputstream)
def pgf_parse(args):
    """Parse each sentence; print index, parse time and the best parse."""
    grammar = pgf.readPGF(args.pgfgrammar)
    # Basic tokenization followed by the web-aware lexer.
    stream = map(Lexer().tokenize, args.inputstream)
    stream = map(Lexer('Web', grammar, args.srclang).tokenize, stream)

    def render(best):
        # Probability followed by the parse expression, tab-separated.
        return "%f\t%s" % (best[0], str(best[1]))

    parser = getKBestParses(grammar, args.srclang, 1)
    for sentidx, (time, parsesBlock) in enumerate(map(parser, stream),
                                                  start=1):
        best = str(render(parsesBlock[0])) if len(parsesBlock) else ''
        print("%d\t%f\t%s" % (sentidx, time, best), file=args.outputstream)
def pgf_parse(args):
    """Write one 'idx<TAB>time<TAB>best-parse' line per input sentence."""
    grammar = pgf.readPGF(args.pgfgrammar)
    basic = Lexer().tokenize
    web = Lexer('Web', grammar, args.srclang).tokenize
    inputSet = map(web, map(basic, args.inputstream))
    outputPrinter = lambda X: "%f\t%s" % (X[0], str(X[1]))
    parser = getKBestParses(grammar, args.srclang, 1)
    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        best = str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''
        print("%d\t%f\t%s" % (sentidx, time, best), file=args.outputstream)
def pgf_kparse(args):
    """Write Johnson-reranker-formatted K-best parse blocks to the output."""
    grammar = pgf.readPGF(args.pgfgrammar)
    # Chain the two lexing stages over the raw input stream.
    stages = [Lexer().tokenize, Lexer('Web', grammar, args.srclang).tokenize]
    inputSet = args.inputstream
    for stage in stages:
        inputSet = map(stage, inputSet)
    parser = getKBestParses(grammar, args.srclang, args.K)
    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        block = str(printJohnsonRerankerFormat(parsesBlock))
        if block != '\n':
            print(block, file=args.outputstream)
def pgf_linearize(args):
    """Linearize tab-separated abstract-tree records into args.tgtlang.

    Modernized from Python-2-only print statements; malformed lines no
    longer crash the reader.
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    outputPrinter = postprocessor
    inputSet = []
    for line in args.inputstream:
        try:
            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2)
        except ValueError:
            # FIX: the original printed the line and then crashed with a
            # NameError on the unassigned `parserepr`; skip such lines.
            print(line.strip())
            continue
        parseprob, abstree = (parserepr.split('\t') if parserepr.strip()
                              else (0, ''))
        inputSet.append((int(sentid), float(parsetime), float(parseprob),
                         pgf.readExpr(abstree) if abstree else None))
    linearizer = grammar.languages[args.tgtlang].linearize
    for sentid, _, _, abstree in inputSet:
        if abstree:
            print(str(outputPrinter(linearizer(abstree))),
                  file=args.outputstream)
        else:
            print("", file=args.outputstream)
def main():
    """Interactive draw loop: parse each stdin line as a command and apply it
    to the window; an optional argv language suffix selects the grammar."""
    win = GraphWin("GF Draw", hsize, vsize)
    refs = []
    latest = "it"
    grammar = pgf.readPGF(absmodule + ".pgf")
    lang = langname
    if len(sys.argv) > 1:
        lang = absmodule + sys.argv[1]
    eng = grammar.languages[lang]
    while True:
        try:
            line = input("")
        except EOFError:
            break
        if not line:
            continue
        try:
            parses = eng.parse(line.lower())
            _, tree = next(parses)
            key, obj, command = execute(tree, win)
            if command == "drawCommand":
                refs = addRef(key, obj, refs)
            elif command == "removeCommand":
                ref, shape, i = lookupRef(key, refs)
                if shape is None:
                    print("no such object")
                else:
                    shape.undraw()
                    refs = removeRef(key, refs)
            elif command == "moveCommand":
                dx, dy = obj
                ref, shape, i = lookupRef(key, refs)
                if shape is None:
                    print("no such object")
                else:
                    shape.move(dx, dy)
                    # Move the touched object to the front of the ref list.
                    refs = [(ref, shape)] + refs[:i] + refs[i + 1:]
            else:
                print("nothing else can be done")
            print(refs)  # debugging
        except pgf.ParseError:
            print("# NO PARSE", line)
def main():
    """Translate stdin line by line between two concrete syntaxes of a PGF."""
    if len(sys.argv) < 4:
        print(
            " usage: python3 translator.py <pgf-file-prefix> <from-lang-suffix> <to-lang-suffix> -debug?"
        )
        print(" e.g. python3 translator.py CSETranslator Eng Swe")
        print(" The program reads and writes stdio line by line, e.g")
        print(
            " cat ../course_plans/TIN214Eng.txt | python3 translator.py CSEShallow Eng Swe"
        )
        return
    prefix = sys.argv[1]
    pgff = prefix + ".pgf"
    ceng = prefix + sys.argv[2]
    cswe = prefix + sys.argv[3]
    debug = len(sys.argv) == 5 and sys.argv[4] == "-debug"
    grammar = pgf.readPGF(pgff)
    eng = grammar.languages[ceng]
    swe = grammar.languages[cswe]
    print("#", "translating with", pgff, "from", ceng, "to", cswe)
    while True:
        try:
            line = input("")
        except EOFError:
            break
        # Mark words missing from the source lexicon before parsing.
        (missing, tokens) = markMissing(eng, trim(line))
        sentence = ' '.join(tokens)
        if not sentence:
            continue
        try:
            parses = eng.parse(sentence, heuristics=0.2,
                               callbacks=[("Symb",
                                           lambda w: recognizeSymb(w))])
            _, tree = next(parses)
            if debug:
                print("# TREE", tree)
            print(swe.linearize(tree))
        except pgf.ParseError:
            print("# NO PARSE", sentence)
            if debug:
                print("# MISSING", missing)
def pgf_kparse(args):
    """K-best parse input sentences (with PN/Symb callbacks), reranker format.

    Modernized from Python-2-only constructs (`itertools.imap`,
    `print >> stream`) to match the Python 3 variants in this file.
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline
    preprocessor = lexer()
    inputSet = translation_pipeline.web_lexer(
        grammar, args.srclang, map(preprocessor, args.inputstream))
    outputPrinter = printJohnsonRerankerFormat
    # Callbacks recover proper names and unknown symbols during parsing.
    callbacks = [('PN', translation_pipeline.parseNames(grammar,
                                                        args.srclang)),
                 ('Symb', translation_pipeline.parseUnknown(grammar,
                                                            args.srclang))]
    parser = getKBestParses(grammar, args.srclang, args.K,
                            callbacks=callbacks)
    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        strParses = str(outputPrinter(parsesBlock))
        if strParses != '\n':
            print(strParses, file=args.outputstream)
def pgf_parse(args):
    """Parse each sentence (with PN/Symb callbacks); print idx, time, parse.

    Modernized from Python-2-only constructs (`itertools.imap`,
    `print >> stream`) to match the Python 3 variants in this file.
    """
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline
    preprocessor = lexer()
    inputSet = translation_pipeline.web_lexer(
        grammar, args.srclang, map(preprocessor, args.inputstream))
    outputPrinter = lambda X: "%f\t%s" % (X[0], str(X[1]))
    callbacks = [('PN', translation_pipeline.parseNames(grammar,
                                                        args.srclang)),
                 ('Symb', translation_pipeline.parseUnknown(grammar,
                                                            args.srclang))]
    parser = getKBestParses(grammar, args.srclang, 1, callbacks)
    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        print("%d\t%f\t%s" % (sentidx, time,
                              str(outputPrinter(parsesBlock[0]))
                              if len(parsesBlock) else ''),
              file=args.outputstream)
def __init__(self, grammar, input=None, save_preprocessing=None,
             load_preprocessing=None, heuristics=-1.0):
    """Set up the parser.

    With *input* given, preprocess (or reuse) the GF grammar and parse
    immediately.  Otherwise either load a previously saved preprocessed
    grammar or preprocess now, optionally saving the result.
    """
    self.grammar = grammar
    self._goal = None
    self._best = None
    self._heuristics = heuristics
    if input is not None:
        # Reuse an already-preprocessed grammar when one is cached.
        if grammar.tmp_gf is None:
            self.preprocess_grammar(grammar)
        self.gf_grammar = grammar.tmp_gf
        self.input = input
        self.parse()
        return
    if load_preprocessing is not None:
        path = self.resolve_path(load_preprocessing)
        self.gf_grammar = pgf.readPGF(path).languages[
            load_preprocessing[1] + LANGUAGE]
        return
    if save_preprocessing is not None:
        prefix, name, override = (save_preprocessing[0],
                                  save_preprocessing[1], True)
    else:
        prefix, name, override = DEFAULT_PREFIX, DEFAULT_NAME, False
    self.gf_grammar = self._preprocess(grammar, prefix=prefix, name=name,
                                       override=override)
def pgf_linearize(args):
    """Linearize abstract trees read as tab-separated records from the input.

    Records are 'sentid<TAB>parsetime<TAB>[parseprob<TAB>abstree]'; records
    without a tree produce an empty output line.
    """
    grammar = pgf.readPGF(args.pgfgrammar)

    def parse_line(line):
        try:
            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2)
        except ValueError:
            # FIX: the original fell through and raised NameError on the
            # unassigned `parserepr`; report and skip malformed lines.
            print("Line not in proper format: %s" % (line), file=stderr)
            return None
        parseprob, abstree = (parserepr.split('\t') if parserepr.strip()
                              else (0, ''))
        return (int(sentid), float(parsetime), float(parseprob),
                pgf.readExpr(abstree) if abstree else None)

    records = (r for r in map(parse_line, args.inputstream) if r is not None)
    outputPrinter = postprocessor
    linearizer = grammar.languages[args.tgtlang].linearize
    for sentid, _, _, abstree in records:
        if abstree:
            print(str(outputPrinter(linearizer(abstree))),
                  file=args.outputstream)
        else:
            print("", file=args.outputstream)
def translation_pipeline(props):
    # Full pipeline: parse source-language input, linearize into all target
    # languages, and write one output file per language plus the abstract trees.
    # NOTE(review): this is Python 2 code (itertools.imap, list-returning
    # filter, `print >>`, .next()) — do not run under Python 3 as-is.
    if props.propsfile:
        props = readTranslationPipelineOptions(props.propsfile, props);
    # UGLY HACK FOR K-best translation: if K-best translation output format is only txt
    if props.bestK != 1:
        props.format = 'txt';
    if not os.path.isdir( props.exp_directory ):
        logging.info("Creating output directory: %s" %(props.exp_directory));
        os.makedirs(props.exp_directory);
    if not props.srclang:
        logging.critical("Mandatory option source-lang missing. Can not determine source language.");
        sys.exit(1);
    grammar = pgf.readPGF(props.pgffile);
    # Pick the concrete language whose 3-letter suffix matches props.srclang.
    sourceLanguage = filter(None, [lang if lang[-3:] == props.srclang else '' for lang in grammar.languages.keys()])[0];
    logging.info("Translating from %s" %(sourceLanguage));
    if len(props.tgtlangs):
        target_langs = props.tgtlangs;
    else:
        # Default: every language in the grammar except the source one.
        target_langs = filter(None, [lang[-3:] if lang != sourceLanguage else '' for lang in grammar.languages.keys()]);
    targetLanguages = filter(None, [lang if lang[-3:] in target_langs else '' for lang in grammar.languages.keys()]);
    logging.info("Translating into the following languages: %s" %(','.join(targetLanguages)));
    K = props.bestK if props.bestK != 1 else 20; # by default we look for 20 best parses
    bestK = props.bestK;
    if not props.input:
        logging.info( "Input file name missing. Reading input from stdin." );
        inputStream = sys.stdin;
        outputPrefix = os.getpid();
    else:
        inputStream = codecs.open(props.input, 'r');
        outputPrefix = os.path.splitext( os.path.split(props.input)[1] )[0];
    # Format-specific reader/writer hooks (sgm = SGML documents, txt = lines).
    if props.format == 'sgm':
        inputDoc = etree.parse(inputStream);
        reader = sgmReader;
        skeletonDoc = getXMLSkeleton;
        addItem = addToSgm;
        writer = sgmWriter;
    elif props.format == 'txt':
        logging.info("Input format is txt. Assuming one-sentence-per-line format.");
        inputDoc = inputStream;
        reader = lambda X: X;
        skeletonDoc = lambda X, lang: list();
        addItem = lambda X, y: list.append(X, y);
        writer = lambda X: ('\n'.join(X) if bestK == 1 else '\n'.join(map(gf_utils.printMosesNbestFormat, X)));
    translationBlocks = {};
    for tgtlang in targetLanguages+['abstract']:
        translationBlocks[tgtlang] = skeletonDoc(inputDoc, tgtlang);
    preprocessor = pipeline_lexer;
    postprocessor = clean_gfstrings;
    logging.info( "Parsing text in %s" %(sourceLanguage) );
    # 1. Get Abstract Trees for sentences in source language.
    tokenized_sentences = itertools.imap(preprocessor, reader(inputDoc));
    absParses = [parsesBlock for parsesBlock in pipelineParsing(grammar, sourceLanguage, web_lexer(grammar, sourceLanguage, tokenized_sentences), K)];
    logging.info( "Linearizing into %s" %(','.join(targetLanguages)) );
    # 2. Linearize in all target Languages
    for idx, parsesBlock in enumerate( itertools.imap(operator.itemgetter(1), absParses) ):
        translationBuffer = {};
        if not len(parsesBlock):
            # failed to parse; translate using lookup
            for tgtlang, translation in translationByLookup(grammar, sourceLanguage, targetLanguages, absParses[idx][0]):
                if bestK == 1:
                    addItem(translationBlocks[tgtlang], postprocessor(translation));
                else:
                    addItem(translationBlocks[tgtlang], [((0,), postprocessor(translation))]);
            addItem(translationBlocks['abstract'], '');
        else:
            # Find the first parse whose postprocessed linearization is
            # non-empty, and use it as the best translation index.
            bestTranslationIdx = 0;
            for tgtlang in targetLanguages:
                translationBuffer[tgtlang] = gf_utils.getKLinearizations(grammar, tgtlang, [parsesBlock], K=bestK).next();
                if bestK == 1:
                    for tidx, translation in enumerate(translationBuffer[tgtlang]):
                        if postprocessor(translation[1]).strip():
                            if tidx > bestTranslationIdx:
                                bestTranslationIdx = tidx;
                            break;
            for tgtlang in targetLanguages:
                if bestK == 1:
                    translation = postprocessor(translationBuffer[tgtlang][bestTranslationIdx][1]) if len(translationBuffer[tgtlang]) > bestTranslationIdx else ((None,), '');
                    abstract = str(parsesBlock[bestTranslationIdx][1]);
                else:
                    translation = translationBuffer[tgtlang] if len(translationBuffer[tgtlang]) else [];
                    abstract = parsesBlock;
                addItem(translationBlocks[tgtlang], translation);
                addItem(translationBlocks['abstract'], abstract);
    # Write one output file per target language (and the abstract trees).
    for tgtlang in targetLanguages+['abstract']:
        outputFile = os.path.join( props.exp_directory, '%s-%s.%s' %(outputPrefix, tgtlang[-3:] if tgtlang!='abstract' else 'abstract', props.format) );
        logging.info( "Writing translations for %s to %s" %(tgtlang, outputFile) );
        with codecs.open(outputFile, 'w') as outputStream:
            print >>outputStream, writer(translationBlocks[tgtlang]);
    return;
f.write("concrete DictSlv of DictSlvAbs = CatSlv ** open ParadigmsSlv, Prelude in {\n"); f.write("lin\n"); for key in sloleks: (pos,pos2,lemma,forms,version) = sloleks[key] lin = mkLin(pos,pos2,lemma,forms,version) if lin != None: f.write(" "+quote(key)+" = "+lin+" ;\n") f.write("}"); f.close() sys.stdout.write("\n") sys.stdout.write("Compiling DictSlv.gf ("+str(stage)+") ...") sys.stdout.flush() try: os.remove("DictSlvAbs.pgf") except OSError: pass subprocess.call(["gf","--make","-s","DictSlv.gf","+RTS","-K128M"]) sys.stdout.write("\n") sys.stdout.write("Checking DictSlv.gf ("+str(stage)+") ...") sys.stdout.flush() slv = pgf.readPGF("DictSlvAbs.pgf").languages["DictSlv"] count = updateVersions(sloleks,slv) sys.stdout.write(" "+str(count)+"\n") if count == 0: break stage = stage + 1
def read_gf(filepath):
    """Load and return the PGF grammar stored at *filepath*."""
    grammar = pgf.readPGF(filepath)
    return grammar
import argparse parser = argparse.ArgumentParser() parser.add_argument('treebank', help='input file: json file with treebank') parser.add_argument('grammar', help='input file: PGF grammar file for parsing treebank') parser.add_argument('concrete', help='input: grammar module name for linearising treebank') parser.add_argument('result', help="output file: text file with results") args = parser.parse_args() import codecs import json treebank_str = codecs.open(args.treebank, 'r', 'utf8').read() treebank = json.loads(treebank_str) import pgf gr = pgf.readPGF(args.grammar) conc = gr.languages[args.concrete] results = [] times = [] import time for o in treebank: tree = o["Abs"] gold = o["Afr"] try: e = pgf.readExpr(tree) start = time.clock() l = conc.linearize(e) end = time.clock()
parser.add_argument('--source_pgf',required=True) parser.add_argument('--target_pgf',required=True) parser.add_argument('--with_bilingual_phrases',action='store_true') parser.add_argument('--create_bilingual_dictionary') parser.add_argument('--only_count_parsed_words',action='store_true') parser.add_argument('--debug', action='store_true') args = parser.parse_args(sys.argv[1:]) set_debug(args.debug) DEBUG=Debugger.is_debug_enabled() bilingualDictionary=GFProbabilisticBilingualDictionary() bilingualDictionaryInv=GFProbabilisticBilingualDictionary() #read PGFs sourcePGF=pgf.readPGF(args.source_pgf) sourceLanguage=list(sourcePGF.languages.keys())[0] targetPGF=pgf.readPGF(args.target_pgf) targetLanguage=list(targetPGF.languages.keys())[0] for line in sys.stdin: parts=line.split("~") sourcePart=parts[0] targetPart=parts[1] bilingualPhrases=BilingualPhraseSet() if args.with_bilingual_phrases: bilingualPhraseList=parts[2] for bil in bilingualPhraseList.split("\t"): bilingualPhrases.add(bil.strip())
import pgf import sys import sets import readline import locale sys.stdout.write("loading...") sys.stdout.flush(); gr = pgf.readPGF("../../../treebanks/PennTreebank/ParseEngAbs.pgf") sys.stdout.write("\n") source_lang = gr.languages["ParseEng"] target_lang = gr.languages["ParseBul"] we = pgf.readExpr("UttImpSg PPos (ImpVP (UseV try_V))") print source_lang.linearize(we) sys.stdout.write("start cat: "+gr.startCat+"\n\n") class Completer(): def __init__(self, lang): self.gr = lang def complete(self, prefix, state): if state == 0: line = readline.get_line_buffer() line = line[0:readline.get_begidx()] self.i = source_lang.complete(line, prefix=prefix) self.tokens = sets.Set() if len(self.tokens) > 50: