Example #1
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)
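        # each PGF may contain several concrete syntaxes; take the first one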
        self.in_grammar = list(pgf.readPGF(
            self.language).languages.values())[0]
        self.out_grammar = list(
            pgf.readPGF(self.translate).languages.values())[0]

        if self.verbose:
            logging.basicConfig(level=logging.DEBUG)
Example #2
def main():
    countries = get_countries()
    continents = {c.continent for c in countries}

    worldtree = continent_article(countries, (lambda x: True), "world")
    continenttrees = [
        continent_article(countries, (lambda x: x.continent == c), c)
        for c in continents
    ]
    countrytrees = [country_article(c) for c in countries]

    gr = pgf.readPGF("Countries.pgf")
    langs = list(gr.languages.values())
    links = ' '.join([
        '<a href="py_' + lang.name + '.html">' + lang.name + '</a>'
        for lang in langs
    ])

    for lang in langs:
        text = []
        text.append(links)
        text.append(lang.linearize(worldtree))
        for tree in continenttrees:
            text.append(lang.linearize(tree))
        for tree in countrytrees:
            text.append(lang.linearize(tree))
        with open("py_" + lang.name + ".html", "w") as f:
            f.write('\n<p>'.join(text))
Example #3
def pgf_linearize(args):
    grammar = pgf.readPGF(args.pgfgrammar)

    def parse_line(line):
        try:
            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2)
        except ValueError:
            print("Line not in proper format: %s" % line, file=stderr)
            raise  # re-raise: sentid/parserepr would be unbound below
        parseprob, abstree = parserepr.split('\t') if parserepr.strip() \
            else (0, '')
        return (int(sentid), float(parsetime), float(parseprob),
                pgf.readExpr(abstree) if abstree else None)

    #if sys.version_info < (3, 0):
    #  args.inputstream = codecs.getreader('utf-8')(args.inputstream);
    inputSet = map(parse_line, args.inputstream)
    outputPrinter = postprocessor
    linearizer = grammar.languages[args.tgtlang].linearize
    for sentid, _, _, abstree in inputSet:
        if abstree:
            print(str(outputPrinter(linearizer(abstree))),
                  file=args.outputstream)
        else:
            print("", file=args.outputstream)
    return
Example #4
def main():
    gr = pgf.readPGF('Countries.pgf')

    factsys = FactSystem('country capital area population continent currency',
                         gr, 'CountriesEng')

    factsys.run('../data/countries.tsv', world_texts)
Example #5
def compile_grammar(name, abstract, instances):
    with TemporaryDirectory() as tmpdir:
        logger.debug("Created temp dir: {}".format(tmpdir))
        abstract_path = "{0}/{1}.gf".format(tmpdir, name)
        with open(abstract_path, "w") as f:
            logger.debug("Wrote tmp file: {}".format(abstract_path))
            f.write(abstract["content"])

        concrete_grammars = list(
            compile_concrete_grammar(tmpdir, name, instances))

        logger.info("Compiling")
        cmd = "gf -i /opt/gf/lang-utils/ -i /opt/gf/concept-net/ --output-dir={path} -make {abstract} {other}".format(
            abstract=abstract_path,
            path=tmpdir,
            other=" ".join(concrete_grammars))
        logger.debug("Compile command: {}".format(cmd))
        proc = subprocess.Popen(cmd,
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        (result, error) = proc.communicate()

        if proc.returncode != 0:
            logger.error(error)
            return None
        else:
            logger.debug("Compiled successfuly! Message: {}".format(result))
            grammar = pgf.readPGF("{0}/{1}.pgf".format(tmpdir, name))
            logger.debug("Languages: {}".format(grammar.languages))
            return grammar
Example #6
def main():
    gr = pgf.readPGF('Nobel.pgf')

    factsys = FactSystem('winner sex prize year born city country dead', gr,
                         'NobelEng')

    factsys.run('../data/nobel.tsv', nobel_texts)
Example #7
def compile_grammar(name, content):
    with TemporaryDirectory() as tmpdir:
        logger.debug("Created temp dir: {}".format(tmpdir))
        files = [
            "{0}/{1}.gf".format(tmpdir, k) for k in content.keys() if k != name
        ]
        for k, v in content.items():
            with open("{0}/{1}.gf".format(tmpdir, k), "w",
                      encoding="UTF-8") as f:
                # v may arrive as bytes or str; decode only when necessary
                if isinstance(v, bytes):
                    v = v.decode('utf-8')
                f.write(v)

        logger.info("Compiling")
        cmd = "gf -i /opt/gf/lang-utils/ -i /opt/gf/concept-net/ --output-dir={path} -make {files} {main}".format(
            path=tmpdir,
            main="{0}/{1}.gf".format(tmpdir, name),
            files=" ".join(files))
        logger.debug("Compile command: {}".format(cmd))
        proc = subprocess.Popen(cmd,
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        (result, error) = proc.communicate()

        if proc.returncode != 0:
            logger.error(error)
            return None, error
        else:
            logger.debug("Compiled successfuly! Message: {}".format(result))
            grammar = pgf.readPGF("{0}/{1}.pgf".format(tmpdir, name))
            logger.debug("Languages: {}".format(grammar.languages))
            return grammar, None
Example #8
def main():
  dictionary = readDictTable(sys.argv[1])
  if len(sys.argv) > 2:
    import pgf
    gfGrammar, langname = pgf.readPGF(sys.argv[2]), sys.argv[3]
    dictEntries_GFmap = getGFAbstractNames(dictionary, gfGrammar.languages[langname])
  else:
    gfGrammar = None

  gf_entries = readGFEntries(sys.stdin)
  pool = multiprocessing.Pool(3)
  for line in gf_entries:
    #entry = findExactEntryforAbstractFunction(line.strip(), dictionary, gfGrammar);
    entry = findClosestEntrytoAbstractFunction(line.strip(), dictionary, pool=pool)
    if entry is None:
      print('%s\tNone' % line.strip())
    else:
      matchedLemma = entry[0]
      lemmaCategory = '%s_%s' % (entry[1][0], entry[1][1])
      lemmaCategory = lemmaCategory.strip('_')
      for idx, sense in enumerate(entry[2]):
        lemma, category = line.strip().rsplit('_', 1)
        newname = '%s_%d_%s' % (lemma, idx + 1, category) if len(entry[2]) > 1 else line
        print('%s\t%s\t%s\t%s\t%s\t%s' % (line, newname, matchedLemma, lemmaCategory, sense[0], sense[1]))
        gf_entries.send('finished')
  return
Example #9
def readPGF(pgfPath, verbose=False):
    if verbose:
        print("loading " + pgfPath + "...", end="")
        sys.stdout.flush()
    p = pgf.readPGF(pgfPath)
    if verbose:
        print("done")
    return p
Example #11
def main():
    gr = pgf.readPGF(pgf_file)
    countries = get_countries(country_file)
    langs = list(gr.languages.values())
    for lang in langs:
        text = []
        for c in countries:
            for t in country_facts(c):
                text.append(lang.linearize(t))
        print('\n'.join(text))
Example #12
def main():
  "execute one line of input, quit by second Enter"
  win = GraphWin("GF Draw", 1000, 1000)
  gr  = pgf.readPGF(absmodule + ".pgf")
  eng = gr.languages[langname]
  line = input()
  px = eng.parse(line.lower())
  p, tree = next(px)
  execute(tree,win)
  input()
Example #13
def main():
    # read in the grammar, set up to and from languages
    grammar = pgf.readPGF(absmodule + ".pgf")
    fromlang = grammar.languages[fromname]
    tolang = grammar.languages[toname]

    # read a line of input, parse in "from" language, linearize in "to" language
    line = input("")
    parseresult = fromlang.parse(line)
    prob, tree = next(parseresult)
    print(tolang.linearize(tree))
Example #14
def _preprocess(grammar,
                prefix=DEFAULT_PREFIX,
                name=DEFAULT_NAME,
                override=False):
    name_ = export(grammar, prefix, name, override)
    return_code = compile_gf_grammar(prefix, name_)
    if return_code != 0:
        print("Grammar could not be compiled! (return code", return_code, ")")
    gf_grammar = pgf.readPGF(os.path.join(
        prefix, name_ + COMPILED_SUFFIX)).languages[name_ + LANGUAGE]
    return gf_grammar
Example #15
def pgf_klinearize(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    outputPrinter = printMosesNbestFormat
    inputSet = [(sentid, parsesBlock) for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)]
    # materialize the maps: sentIdsList is reused on every loop iteration
    sentIdsList = list(map(itemgetter(0), inputSet))
    parsesBlocks = list(map(itemgetter(1), inputSet))

    for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
        strTrans = str(outputPrinter(transBlock, sentIdsList))
        if strTrans:
            print(strTrans, file=args.outputstream)
    return
Example #17
def main():
    "main function to be called from stdio"

    # read in the grammar, set up the input language
    grammar = pgf.readPGF(absmodule + ".pgf")
    lang = grammar.languages[langname]

    # read a line of input, parse in lang, return answer
    line = input("")
    parseresult = lang.parse(line)
    prob, tree = next(parseresult)
    print(answer(tree))
Example #18
def parse_text(name, content, text):
    error = None
    if content:
        (grammar, error) = compile_grammar(name, content)
    else:
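        # no grammar sources supplied: fall back to a prebuilt grammar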
        grammar = pgf.readPGF("/grammars/LangEng.pgf")
    logger.debug("Grammar: {}".format(grammar))
    if grammar:
        logger.info("Parsing")
        return [(k, [str(e) for p, e in concrete.parse(text)])
                for k, concrete in grammar.languages.items()]
    elif error:
        raise GFError(error)
Example #19
def main():
    gr = pgf.readPGF("Numeral.pgf")
    eng = list(gr.languages.values())[0]
    n = input()
    if n:
        ns = [int(n)]
    else:
        ns = [
            1, 2, 8, 10, 11, 20, 21, 40, 95, 100, 101, 234, 1000, 1001, 2021,
            630511
        ]
    for n in ns:
        print(n, eng.linearize(int2numeral_tree(pgf.readExpr(str(n)))))
Example #20
def pgf_parse(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline

    preprocessor = lexer()
    inputSet = translation_pipeline.web_lexer(grammar, args.srclang, map(preprocessor, args.inputstream))
    outputPrinter = lambda X: "%f\t%s" % (X[0], str(X[1]))  # operator.itemgetter(1)
    parser = getKBestParses(grammar, args.srclang, 1)

    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        print("%d\t%f\t%s" % (sentidx, time,
              str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''),
              file=args.outputstream)
    return
Example #21
def pgf_klinearize(args):
  grammar = pgf.readPGF(args.pgfgrammar)
  #if sys.version_info < (3, 0):
  #  args.inputstream = codecs.getreader('utf-8')(args.inputstream);
  inputSet = [(sentid, parsesBlock)
      for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)]
  outputPrinter = printMosesNbestFormat
  # materialize the maps: sentIdsList is reused on every loop iteration
  sentIdsList  = list(map(itemgetter(0), inputSet))
  parsesBlocks = list(map(itemgetter(1), inputSet))

  for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks, args.K):
    strTrans = str(outputPrinter(transBlock, sentIdsList))
    if strTrans:
      print(strTrans, file=args.outputstream)
  return
Example #22
def pgf_klinearize(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    #if sys.version_info < (3, 0):
    #  args.inputstream = codecs.getreader('utf-8')(args.inputstream);
    inputSet = [(sentid, parsesBlock)
                for sentid, parsesBlock in readJohnsonRerankerTrees(args.inputstream)]
    outputPrinter = printMosesNbestFormat
    # list() the maps: sentIdsList is reused on every loop iteration below
    sentIdsList = list(map(itemgetter(0), inputSet))
    parsesBlocks = list(map(itemgetter(1), inputSet))

    for transBlock in getKLinearizations(grammar, args.tgtlang, parsesBlocks,
                                         args.K):
        strTrans = str(outputPrinter(transBlock, sentIdsList))
        if strTrans:
            print(strTrans, file=args.outputstream)
    return
Example #23
def main():
    args = get_args()
    gr = pgf.readPGF(args.pgf)
    try:
        lang_source = gr.languages[args.source]
        lang_target = gr.languages[args.target]
    except KeyError as e:
        print_utf8('Error: no such language: {0}'.format(e), file=sys.stderr)
        exit(1)
    for raw_line in sys.stdin:
        line = raw_line.decode('utf8').strip()
        try:
            for utt_source, utt_target, prob, tree in gen_translations(
                    args, lang_source, lang_target, line):
                print_utf8("{0}\t{1}\t{2}\t{3}".format(utt_source, utt_target, prob, tree))
        except pgf.ParseError as e:
            print_utf8('{0}\t{1}\t{2}\t{3}'.format(line, '', -1, e))
Example #24
def pgf_kparse(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline

    preprocessor = lexer()
    inputSet = translation_pipeline.web_lexer(grammar, args.srclang, map(preprocessor, args.inputstream))
    outputPrinter = printJohnsonRerankerFormat
    parser = getKBestParses(grammar, args.srclang, args.K)

    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        strParses = str(outputPrinter(parsesBlock))
        if strParses != '\n':
            print(strParses, file=args.outputstream)
    return
Example #25
def main():
  "main function to be called from stdio, language code as optional argument"
  
  # read in the grammar, set up the input language
  grammar = pgf.readPGF(absmodule + ".pgf")
  langcode = langname
  if len(sys.argv) > 1:
    langcode = absmodule + sys.argv[1]
  lang = grammar.languages[langcode]

  # read a line of input, parse in lang, return answer
  line  = input("")
  parseresult = lang.parse(line)
  prob, tree = next(parseresult)
  # print(tree)  ## debugging
  print(answer(tree))
  for r in parseresult: print("WARNING: ambiguous")
Example #26
def pgf_kparse(args):
  grammar = pgf.readPGF(args.pgfgrammar)
  preprocessor = Lexer().tokenize
  #if sys.version_info < (3, 0):
  #  args.inputstream = codecs.getreader('utf-8')(args.inputstream);
  inputSet = map(preprocessor, args.inputstream)
  web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize
  inputSet = map(web_preprocessor, inputSet)
  outputPrinter = printJohnsonRerankerFormat
  parser = getKBestParses(grammar, args.srclang, args.K)

  sentidx = 0
  for time, parsesBlock in map(parser, inputSet):
    sentidx += 1
    strParses = str(outputPrinter(parsesBlock))
    if strParses != '\n':
      print(strParses, file=args.outputstream)
  return
Example #27
def pgf_parse(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    preprocessor = Lexer().tokenize
    #if sys.version_info < (3, 0):
    #  args.inputstream = codecs.getreader('utf-8')(args.inputstream);
    inputSet = map(preprocessor, args.inputstream)
    web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize
    inputSet = map(web_preprocessor, inputSet)
    outputPrinter = lambda X: "%f\t%s" % (X[0], str(X[1]))
    parser = getKBestParses(grammar, args.srclang, 1)

    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        print("%d\t%f\t%s" %(sentidx, time, \
            str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''), \
            file=args.outputstream)
    return
Example #28
def pgf_parse(args):
  grammar = pgf.readPGF(args.pgfgrammar)
  preprocessor = Lexer().tokenize
  #if sys.version_info < (3, 0):
  #  args.inputstream = codecs.getreader('utf-8')(args.inputstream);
  inputSet = map(preprocessor, args.inputstream)
  web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize
  inputSet = map(web_preprocessor, inputSet)
  outputPrinter = lambda X: "%f\t%s" % (X[0], str(X[1]))
  parser = getKBestParses(grammar, args.srclang, 1)

  sentidx = 0
  for time, parsesBlock in map(parser, inputSet):
    sentidx += 1
    print("%d\t%f\t%s" % (sentidx, time,
          str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''),
          file=args.outputstream)
  return
Example #29
def pgf_kparse(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    preprocessor = Lexer().tokenize
    #if sys.version_info < (3, 0):
    #  args.inputstream = codecs.getreader('utf-8')(args.inputstream);
    inputSet = map(preprocessor, args.inputstream)
    web_preprocessor = Lexer('Web', grammar, args.srclang).tokenize
    inputSet = map(web_preprocessor, inputSet)
    outputPrinter = printJohnsonRerankerFormat
    parser = getKBestParses(grammar, args.srclang, args.K)

    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        strParses = str(outputPrinter(parsesBlock))
        if strParses != '\n':
            print(strParses, file=args.outputstream)
    return
Example #30
def pgf_linearize(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    outputPrinter = postprocessor
    inputSet = []
    for line in args.inputstream:
        try:
            sentid, parsetime, parserepr = line.strip('\n').split('\t', 2)
        except ValueError:
            print(line.strip())
            continue  # skip malformed lines; parserepr would be unbound below
        parseprob, abstree = parserepr.split('\t') if parserepr.strip() else (0, '')
        inputSet.append((int(sentid), float(parsetime), float(parseprob),
                         pgf.readExpr(abstree) if abstree else None))
    linearizer = grammar.languages[args.tgtlang].linearize
    for sentid, _, _, abstree in inputSet:
        if abstree:
            print(str(outputPrinter(linearizer(abstree))), file=args.outputstream)
        else:
            print("", file=args.outputstream)
    return
Example #32
def main():
    "initialize with a window, process Command input line by line; optional language argument"
    win = GraphWin("GF Draw", hsize, vsize)
    refs = []
    latest = "it"
    gr = pgf.readPGF(absmodule + ".pgf")
    lang = langname
    if len(sys.argv) > 1:
        lang = absmodule + sys.argv[1]
    eng = gr.languages[lang]
    while True:
        try:
            line = input("")
        except EOFError:
            break
        if not (line):
            pass
        else:
            try:
                px = eng.parse(line.lower())
                p, tree = next(px)
                key, obj, co = execute(tree, win)
                if co == "drawCommand":
                    refs = addRef(key, obj, refs)
                elif co == "removeCommand":
                    ref, sh, i = lookupRef(key, refs)
                    if sh is None:
                        print("no such object")
                    else:
                        sh.undraw()
                        refs = removeRef(key, refs)
                elif co == "moveCommand":
                    x, y = obj
                    ref, sh, i = lookupRef(key, refs)
                    if sh is None:
                        print("no such object")
                    else:
                        sh.move(x, y)
                        refs = [(ref, sh)] + refs[:i] + refs[i + 1:]
                else:
                    print("nothing else can be done")
                print(refs)  ## debugging
            except pgf.ParseError:
                print("# NO PARSE", line)
Example #33
def main():
    if len(sys.argv) < 4:
        print(
            "  usage: python3 translator.py <pgf-file-prefix> <from-lang-suffix> <to-lang-suffix> -debug?"
        )
        print("  e.g. python3 translator.py CSETranslator Eng Swe")
        print("  The program reads and writes stdio line by line, e.g")
        print(
            "    cat ../course_plans/TIN214Eng.txt | python3 translator.py CSEShallow Eng Swe"
        )
        return
    pgfm = sys.argv[1]
    pgff = pgfm + ".pgf"
    ceng = pgfm + sys.argv[2]
    cswe = pgfm + sys.argv[3]
    debug = len(sys.argv) == 5 and sys.argv[4] == "-debug"
    gr = pgf.readPGF(pgff)
    eng = gr.languages[ceng]
    swe = gr.languages[cswe]
    print("#", "translating with", pgff, "from", ceng, "to", cswe)
    while True:
        try:
            line = input("")
        except EOFError:
            break
        tr = trim(line)
        (ms, ts) = markMissing(eng, tr)
        t = ' '.join(ts)
        if not (t):
            pass
        else:
            try:
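                # parse with a heuristic search factor; the Symb callback handles
                # tokens not covered by the grammar (see recognizeSymb above)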
                px = eng.parse(t,
                               heuristics=0.2,
                               callbacks=[("Symb", lambda w: recognizeSymb(w))
                                          ])
                p, e = next(px)
                if debug: print("# TREE", e)
                print(swe.linearize(e))
            except pgf.ParseError:
                print("# NO PARSE", t)
                if debug: print("# MISSING", ms)
Example #34
def pgf_kparse(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline

    preprocessor = lexer()
    inputSet = translation_pipeline.web_lexer(
        grammar, args.srclang, map(preprocessor, args.inputstream))
    outputPrinter = printJohnsonRerankerFormat
    callbacks = [('PN', translation_pipeline.parseNames(grammar,
                                                        args.srclang)),
                 ('Symb',
                  translation_pipeline.parseUnknown(grammar, args.srclang))]
    parser = getKBestParses(grammar, args.srclang, args.K, callbacks=callbacks)

    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        strParses = str(outputPrinter(parsesBlock))
        if strParses != '\n':
            print(strParses, file=args.outputstream)
    return
Example #35
def pgf_parse(args):
    grammar = pgf.readPGF(args.pgfgrammar)
    import translation_pipeline

    preprocessor = lexer()
    inputSet = translation_pipeline.web_lexer(
        grammar, args.srclang, map(preprocessor, args.inputstream))
    outputPrinter = lambda X: "%f\t%s" % (X[0], str(X[1]))
    #operator.itemgetter(1);
    callbacks = [('PN', translation_pipeline.parseNames(grammar,
                                                        args.srclang)),
                 ('Symb',
                  translation_pipeline.parseUnknown(grammar, args.srclang))]
    parser = getKBestParses(grammar, args.srclang, 1, callbacks)

    sentidx = 0
    for time, parsesBlock in map(parser, inputSet):
        sentidx += 1
        print("%d\t%f\t%s" % (
            sentidx, time,
            str(outputPrinter(parsesBlock[0])) if len(parsesBlock) else ''),
            file=args.outputstream)
    return
Example #36
def __init__(self,
             grammar,
             input=None,
             save_preprocessing=None,
             load_preprocessing=None,
             heuristics=-1.0):
    self.grammar = grammar
    self._goal = None
    self._best = None
    self._heuristics = heuristics
    if input is not None:
        if grammar.tmp_gf is not None:
            self.gf_grammar = grammar.tmp_gf
        else:
            self.preprocess_grammar(grammar)
            self.gf_grammar = grammar.tmp_gf
        self.input = input
        self.parse()
    else:
        if load_preprocessing is not None:
            self.gf_grammar = pgf.readPGF(
                self.resolve_path(load_preprocessing)).languages[
                    load_preprocessing[1] + LANGUAGE]
        else:
            if save_preprocessing is not None:
                prefix = save_preprocessing[0]
                name = save_preprocessing[1]
                override = True
            else:
                prefix = DEFAULT_PREFIX
                name = DEFAULT_NAME
                override = False
            self.gf_grammar = self._preprocess(grammar,
                                               prefix=prefix,
                                               name=name,
                                               override=override)
Example #37
def pgf_linearize(args):
  grammar = pgf.readPGF(args.pgfgrammar)
  def parse_line(line):
    try:
      sentid, parsetime, parserepr = line.strip('\n').split('\t', 2)
    except ValueError:
      print("Line not in proper format: %s" % line, file=stderr)
      raise  # re-raise: sentid/parserepr would be unbound below
    parseprob, abstree = parserepr.split('\t') if parserepr.strip() else (0, '')
    return (int(sentid), float(parsetime), float(parseprob),
            pgf.readExpr(abstree) if abstree else None)

  #if sys.version_info < (3, 0):
  #  args.inputstream = codecs.getreader('utf-8')(args.inputstream);
  inputSet = map(parse_line, args.inputstream)
  outputPrinter = postprocessor
  linearizer = grammar.languages[args.tgtlang].linearize
  for sentid, _, _, abstree in inputSet:
    if abstree:
      print(str(outputPrinter(linearizer(abstree))), file=args.outputstream)
    else:
      print("", file=args.outputstream)
  return
Example #38
def translation_pipeline(props):
    if props.propsfile:
        props = readTranslationPipelineOptions(props.propsfile, props)

    # UGLY HACK for K-best translation: K-best output is only supported in txt format
    if props.bestK != 1:
        props.format = 'txt'

    if not os.path.isdir(props.exp_directory):
        logging.info("Creating output directory: %s" % props.exp_directory)
        os.makedirs(props.exp_directory)

    if not props.srclang:
        logging.critical("Mandatory option source-lang missing. Cannot determine source language.")
        sys.exit(1)

    grammar = pgf.readPGF(props.pgffile)

    sourceLanguage = [lang for lang in grammar.languages.keys()
                      if lang[-3:] == props.srclang][0]
    logging.info("Translating from %s" % sourceLanguage)

    if len(props.tgtlangs):
        target_langs = props.tgtlangs
    else:
        target_langs = [lang[-3:] for lang in grammar.languages.keys()
                        if lang != sourceLanguage]
    targetLanguages = [lang for lang in grammar.languages.keys()
                       if lang[-3:] in target_langs]
    logging.info("Translating into the following languages: %s" % ','.join(targetLanguages))

    K = props.bestK if props.bestK != 1 else 20  # by default we look for the 20 best parses
    bestK = props.bestK

    if not props.input:
        logging.info("Input file name missing. Reading input from stdin.")
        inputStream = sys.stdin
        outputPrefix = os.getpid()
    else:
        inputStream = codecs.open(props.input, 'r')
        outputPrefix = os.path.splitext(os.path.split(props.input)[1])[0]

    if props.format == 'sgm':
        inputDoc    = etree.parse(inputStream)
        reader      = sgmReader
        skeletonDoc = getXMLSkeleton
        addItem     = addToSgm
        writer      = sgmWriter
    elif props.format == 'txt':
        logging.info("Input format is txt. Assuming one-sentence-per-line format.")
        inputDoc    = inputStream
        reader      = lambda X: X
        skeletonDoc = lambda X, lang: list()
        addItem     = lambda X, y: list.append(X, y)
        writer      = lambda X: ('\n'.join(X) if bestK == 1
                                 else '\n'.join(map(gf_utils.printMosesNbestFormat, X)))

    translationBlocks = {}
    for tgtlang in targetLanguages + ['abstract']:
        translationBlocks[tgtlang] = skeletonDoc(inputDoc, tgtlang)

    preprocessor  = pipeline_lexer
    postprocessor = clean_gfstrings

    logging.info("Parsing text in %s" % sourceLanguage)
    # 1. Get abstract trees for sentences in the source language.
    tokenized_sentences = map(preprocessor, reader(inputDoc))
    absParses = [parsesBlock for parsesBlock in pipelineParsing(
        grammar, sourceLanguage, web_lexer(grammar, sourceLanguage, tokenized_sentences), K)]

    logging.info("Linearizing into %s" % ','.join(targetLanguages))
    # 2. Linearize in all target languages.
    for idx, parsesBlock in enumerate(map(operator.itemgetter(1), absParses)):
        translationBuffer = {}
        if not len(parsesBlock):
            # failed to parse; translate using lookup
            for tgtlang, translation in translationByLookup(grammar, sourceLanguage, targetLanguages, absParses[idx][0]):
                if bestK == 1:
                    addItem(translationBlocks[tgtlang], postprocessor(translation))
                else:
                    addItem(translationBlocks[tgtlang], [((0,), postprocessor(translation))])
            addItem(translationBlocks['abstract'], '')
        else:
            bestTranslationIdx = 0
            for tgtlang in targetLanguages:
                translationBuffer[tgtlang] = next(gf_utils.getKLinearizations(grammar, tgtlang, [parsesBlock], K=bestK))
                if bestK == 1:
                    for tidx, translation in enumerate(translationBuffer[tgtlang]):
                        if postprocessor(translation[1]).strip():
                            if tidx > bestTranslationIdx:
                                bestTranslationIdx = tidx
                                break
            for tgtlang in targetLanguages:
                if bestK == 1:
                    translation = postprocessor(translationBuffer[tgtlang][bestTranslationIdx][1]) if len(translationBuffer[tgtlang]) > bestTranslationIdx else ((None,), '')
                    abstract = str(parsesBlock[bestTranslationIdx][1])
                else:
                    translation = translationBuffer[tgtlang] if len(translationBuffer[tgtlang]) else []
                    abstract = parsesBlock
                addItem(translationBlocks[tgtlang], translation)
            addItem(translationBlocks['abstract'], abstract)

    for tgtlang in targetLanguages + ['abstract']:
        outputFile = os.path.join(props.exp_directory, '%s-%s.%s' % (
            outputPrefix, tgtlang[-3:] if tgtlang != 'abstract' else 'abstract', props.format))
        logging.info("Writing translations for %s to %s" % (tgtlang, outputFile))
        with codecs.open(outputFile, 'w') as outputStream:
            print(writer(translationBlocks[tgtlang]), file=outputStream)
    return
Example #39
	f.write("concrete DictSlv of DictSlvAbs = CatSlv ** open ParadigmsSlv, Prelude in {\n");
	f.write("lin\n");
	for key in sloleks:
		(pos,pos2,lemma,forms,version) = sloleks[key]
		lin = mkLin(pos,pos2,lemma,forms,version)
		if lin != None:
			f.write("  "+quote(key)+" = "+lin+" ;\n")
	f.write("}");
	f.close()
	sys.stdout.write("\n")

	sys.stdout.write("Compiling DictSlv.gf ("+str(stage)+") ...")
	sys.stdout.flush()
	try:
		os.remove("DictSlvAbs.pgf")
	except OSError:
		pass
	subprocess.call(["gf","--make","-s","DictSlv.gf","+RTS","-K128M"])
	sys.stdout.write("\n")

	sys.stdout.write("Checking  DictSlv.gf ("+str(stage)+") ...")
	sys.stdout.flush()
	slv = pgf.readPGF("DictSlvAbs.pgf").languages["DictSlv"]
	count = updateVersions(sloleks,slv)
	sys.stdout.write(" "+str(count)+"\n")

	if count == 0:
		break

	stage = stage + 1
Example #40
def read_gf(filepath):
    return pgf.readPGF(filepath)
Example #41
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('treebank', help='input file: json file with treebank')
parser.add_argument('grammar', help='input file: PGF grammar file for parsing treebank')
parser.add_argument('concrete', help='input: grammar module name for linearising treebank')
parser.add_argument('result', help="output file: text file with results")
args = parser.parse_args()

import codecs
import json
treebank_str = codecs.open(args.treebank, 'r', 'utf8').read()
treebank = json.loads(treebank_str)

import pgf
gr = pgf.readPGF(args.grammar)
conc = gr.languages[args.concrete]

results = []
times = []

import time
for o in treebank:
    tree = o["Abs"]
    gold = o["Afr"]

    try:
        e = pgf.readExpr(tree)
        start = time.clock()
        l = conc.linearize(e)
        end = time.clock()
Example #42
    parser.add_argument('--source_pgf', required=True)
    parser.add_argument('--target_pgf', required=True)
    parser.add_argument('--with_bilingual_phrases', action='store_true')
    parser.add_argument('--create_bilingual_dictionary')
    parser.add_argument('--only_count_parsed_words', action='store_true')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args(sys.argv[1:])

    set_debug(args.debug)
    DEBUG = Debugger.is_debug_enabled()

    bilingualDictionary = GFProbabilisticBilingualDictionary()
    bilingualDictionaryInv = GFProbabilisticBilingualDictionary()

    # read PGFs
    sourcePGF = pgf.readPGF(args.source_pgf)
    sourceLanguage = list(sourcePGF.languages.keys())[0]
    targetPGF = pgf.readPGF(args.target_pgf)
    targetLanguage = list(targetPGF.languages.keys())[0]

    for line in sys.stdin:
        parts = line.split("~")
        sourcePart = parts[0]
        targetPart = parts[1]

        bilingualPhrases = BilingualPhraseSet()
        if args.with_bilingual_phrases:
            bilingualPhraseList = parts[2]
            for bil in bilingualPhraseList.split("\t"):
                bilingualPhrases.add(bil.strip())
        
Example #43
import pgf
import sys
import readline
import locale

sys.stdout.write("loading...")
sys.stdout.flush()
gr = pgf.readPGF("../../../treebanks/PennTreebank/ParseEngAbs.pgf")
sys.stdout.write("\n")

source_lang = gr.languages["ParseEng"]
target_lang = gr.languages["ParseBul"]

we = pgf.readExpr("UttImpSg PPos (ImpVP (UseV try_V))")
print(source_lang.linearize(we))

sys.stdout.write("start cat: " + gr.startCat + "\n\n")

class Completer():
    def __init__(self, lang):
        self.gr = lang

    def complete(self, prefix, state):
        if state == 0:
            line = readline.get_line_buffer()
            line = line[0:readline.get_begidx()]
            self.i = source_lang.complete(line, prefix=prefix)
            self.tokens = set()

        if len(self.tokens) > 50: