Code example #1
File: api.py Project: vthorsteinsson/Reynir
def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa
        for sent in pgs:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)
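A hypothetical client call against this endpoint may help illustrate the response shape. The route path ("/postag.api/v1") and the form field name ("text") are assumptions, since the snippet shows neither the URL rule nor what text_from_request() reads:

import requests

resp = requests.post(
    "http://localhost:5000/postag.api/v1",  # hypothetical route and host
    data={"text": "Hér er prófun á þáttun texta."},  # assumed field name
)
payload = resp.json()
if payload.get("valid"):
    # result is a single flat list of sentences (paragraphs amalgamated)
    for sent in payload["result"]:
        for token in sent:
            print(token.get("x"))
else:
    print("Request failed:", payload.get("reason"))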
Code example #2
def parse_api(version=1):
    """ API to parse text and return POS tagged tokens in JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.parse_text(session,
                                                      text,
                                                      all_names=True)
        # In this case, we should always get a single paragraph back
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True,
                          result=pgs,
                          stats=stats,
                          register=register)
Code example #3
def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        pa: List[List[TokenDict]] = []
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pa = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                for pg in pgs:
                    pa.extend(pg)
        for sent in pa:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pa, stats=stats, register=register)
Code example #4
File: api.py Project: vthorsteinsson/Reynir
def parse_api(version=1):
    """ API to parse text and return POS tagged tokens in JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.parse_text(session, text, all_names=True)
        # In this case, we should always get a single paragraph back
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)
Code example #5
File: test_processors.py Project: thorunna/Greynir
    def make_tree(text: str) -> Tree:
        toklist = tokenize(text)
        fp = Fast_Parser(verbose=False)
        ip = IncrementalParser(fp, toklist, verbose=False)
        # Dict of parse trees in string dump format,
        # stored by sentence index (1-based)
        trees = OrderedDict()
        num_sent = 0
        for p in ip.paragraphs():
            for sent in p.sentences():
                num_sent += 1
                num_tokens = len(sent)
                assert sent.parse(), "Sentence does not parse: " + sent.text
                # Obtain a text representation of the parse tree
                token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
                # Create a verbose text representation of
                # the highest scoring parse tree
                tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
                # Add information about the sentence tree's score
                # and the number of tokens
                trees[num_sent] = "\n".join(
                    ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
                )
        # Create a tree representation string out of
        # all the accumulated parse trees
        tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())

        tree = Tree()
        tree.load(tree_string)
        return tree
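The format strings above fully determine the shape of the accumulated tree_string: an "S{index}" header per sentence, followed by the score line ("C..."), the token count ("L...") and the forest dump. A tiny stand-alone illustration with placeholder dumps:

from collections import OrderedDict

# Placeholder dumps; real ones come from ParseForestDumper.dump_forest()
trees = OrderedDict([(1, "C42\nL5\n(forest dump 1)"), (2, "C17\nL3\n(forest dump 2)")])
tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
print(tree_string)
# S1
# C42
# L5
# (forest dump 1)
# S2
# C17
# L3
# (forest dump 2)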
Code example #6
File: api.py Project: vthorsteinsson/Reynir
def analyze_api(version=1):
    """ Analyze text manually entered by the user, i.e. not coming from an article.
        This is a lower level API used by the Greynir web front-end. """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")
    # try:
    text = text_from_request(request)
    # except:
    #     return better_jsonify(valid=False, reason="Invalid request")
    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)
Code example #7
File: postagger.py Project: vthorsteinsson/Reynir
def gen_tokens():
    """ Generate a Greynir token sequence from a tagging result """
    ix = 0
    for t in toklist:
        if not t.txt:
            continue
        # The code below should correspond to TreeUtility._describe_token()
        d = dict(x = t.txt)
        if t.kind == TOK.WORD:
            # set d["m"] to the meaning
            pass
        else:
            d["k"] = t.kind
        if t.val is not None and t.kind not in { TOK.WORD, TOK.ENTITY, TOK.PUNCTUATION }:
            # For tokens except words, entities and punctuation, include the val field
            if t.kind == TOK.PERSON:
                d["v"], d["g"] = TreeUtility.choose_full_name(t.val, case = None, gender = None)
            else:
                d["v"] = t.val
        if t.kind in { TOK.WORD, TOK.ENTITY, TOK.PERSON, TOK.NUMBER, TOK.YEAR, TOK.ORDINAL, TOK.PERCENT }:
            d["i"] = tags[ix]
            ix += 1
        if t.kind == TOK.WORD and " " in d["x"]:
            # Some kind of phrase: split it
            xlist = d["x"].split()
            for x in xlist:
                d["x"] = x
                if x == "og":
                    # Probably intermediate word: fjármála- og efnahagsráðherra
                    yield dict(x = "og", i = "c")
                else:
                    yield d.copy()
        elif t.kind == TOK.PERSON:
            # Split person tokens into subtokens for each name component
            xlist = d["x"].split() # Name as it originally appeared
            slist = d["v"].split() # Stem (nominal) form of name
            # xlist may be shorter than slist, but that is OK
            for x, s in zip(xlist, slist):
                d["x"] = x
                d["v"] = s
                yield d.copy()
        elif t.kind == TOK.ENTITY:
            # Split entity tokens into subtokens for each name component
            xlist = d["x"].split() # Name as it originally appeared
            for x in xlist:
                d["x"] = x
                yield d.copy()
        # !!! TBD: Tokens such as dates, amounts and currencies
        # !!! should be split here into multiple subtokens
        else:
            yield d
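One subtlety in the person-token branch above is that zip() stops at the shorter of the two sequences, which is exactly why the "xlist may be shorter than slist" case is safe. A stand-alone illustration with a made-up name:

# Hypothetical person token: the text shows only part of the name, while
# the stem (nominal) form carries the full name; zip() drops the remainder
xlist = "Katrínar".split()              # name as it appeared in the text
slist = "Katrín Jakobsdóttir".split()   # stem form, one component longer
for x, s in zip(xlist, slist):
    print({"x": x, "v": s})
# -> {'x': 'Katrínar', 'v': 'Katrín'} (the surname yields no subtoken)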
Code example #8
File: builtin.py Project: Loknar/Greynir
def launch_search(query, session, qkey):
    """ Launch a search with the given search terms """
    pgs, _ = TreeUtility.raw_tag_toklist(
        session,
        query.token_list,  # root=_QUERY_ROOT
    )

    # Collect the list of search terms
    terms = []
    tweights = []
    fixups = []
    for pg in pgs:
        for sent in pg:
            for t in sent:
                # Obtain search stems for the tokens.
                d = dict(x=t["x"], w=0.0)
                tweights.append(d)
                # The terms are represented as (stem, category) tuples.
                stems = stems_of_token(t)
                if stems:
                    terms.extend(stems)
                    fixups.append((d, len(stems)))

    assert sum(n for _, n in fixups) == len(terms)

    if Settings.DEBUG:
        print("Terms are:\n   {0}".format(terms))

    # Launch the search and return the answers, as well as the
    # search terms augmented with information about
    # whether and how they were used
    result = Search.list_similar_to_terms(session, terms, _MAXLEN_SEARCH)

    if "weights" not in result or not result["weights"]:
        # Probably unable to connect to the similarity server
        raise RuntimeError("Unable to connect to the similarity server")

    weights = result["weights"]
    assert len(weights) == len(terms)
    # Insert the weights at the proper places in the
    # token weight list
    index = 0
    for d, n in fixups:
        d["w"] = sum(weights[index:index + n]) / n
        index += n
    return dict(answers=result["articles"], weights=tweights)
Code example #9
File: query.py Project: vthorsteinsson/Reynir
def launch_search(query, session, qkey):
    """ Launch a search with the given search terms """
    pgs, stats = TreeUtility.raw_tag_toklist(
        session, query.token_list(), root=_QUERY_ROOT
    )

    # Collect the list of search terms
    terms = []
    tweights = []
    fixups = []
    for pg in pgs:
        for sent in pg:
            for t in sent:
                # Obtain search stems for the tokens.
                d = dict(x=t["x"], w=0.0)
                tweights.append(d)
                # The terms are represented as (stem, category) tuples.
                stems = stems_of_token(t)
                if stems:
                    terms.extend(stems)
                    fixups.append((d, len(stems)))

    assert sum(n for _, n in fixups) == len(terms)

    if Settings.DEBUG:
        print("Terms are:\n   {0}".format(terms))

    # Launch the search and return the answers, as well as the
    # search terms augmented with information about
    # whether and how they were used
    result = Search.list_similar_to_terms(session, terms, _MAXLEN_SEARCH)
    weights = result["weights"]
    assert len(weights) == len(terms)
    # Insert the weights at the proper places in the
    # token weight list
    index = 0
    for d, n in fixups:
        d["w"] = sum(weights[index : index + n]) / n
        index += n
    return dict(answers=result["articles"], weights=tweights)
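The fix-up loop at the end is worth a worked example: each token contributed len(stems) consecutive entries to the flat weights list returned by the similarity server, and its final weight is the mean of exactly those entries. A stand-alone sketch with made-up numbers:

# Token "a" produced two (stem, category) terms, token "b" one, so the
# server returns three weights in the same order they were submitted
weights = [0.2, 0.4, 0.9]
fixups = [({"x": "a", "w": 0.0}, 2), ({"x": "b", "w": 0.0}, 1)]
index = 0
for d, n in fixups:
    d["w"] = sum(weights[index:index + n]) / n
    index += n
# "a" ends up with (0.2 + 0.4) / 2 = 0.3 and "b" with 0.9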
Code example #10
File: cmp_parse.py Project: vthorsteinsson/Reynir
def fá_þáttun(self, setningar):
    þáttun = []
    for setning in setningar:
        with SessionContext(read_only = True) as session:
            pgs, stats = tu.parse_text_to_bracket_form(session, setning)
        if len(pgs[0]) > 1: # Parsed as multiple sentences; we want to merge them
            allar = ""
            for pg in pgs:
                for þáttuð_setning in pg:
                    allar = allar + þáttuð_setning
            hrein_þáttun = self.forvinnsla(allar)
            þáttun.append(hrein_þáttun)
            continue
        for pg in pgs:
            if not pg[0]: # Empty sentence
                þáttun.append("(M (S x))") # Default baseline analysis of the sentence -- changed so that Evalb can handle it!
                continue
            for þáttuð_setning in pg:
                # Clean the sentence
                hrein_þáttun = self.forvinnsla(þáttuð_setning)
                þáttun.append(hrein_þáttun)
    return þáttun
Code example #11
def _make_tree(text: str) -> Tuple[Tree, str]:
    """Tokenize and parse text, create tree representation string
    from all the parse trees, return Tree object and token JSON."""
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)

    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)

    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json
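As a small aside, the separators=(",", ":") argument produces the most compact JSON encoding (no spaces after delimiters), and ensure_ascii=False keeps Icelandic characters unescaped:

import json

pgs = [[[{"x": "Ég"}]]]  # a minimal made-up paragraph/sentence/token nesting
print(json.dumps(pgs, separators=(",", ":"), ensure_ascii=False))
# -> [[[{"x":"Ég"}]]]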
Code example #12
File: cmp_parse.py Project: thorunna/Greynir
def fá_þáttun(self, setningar):
    þáttun = []
    for setning in setningar:
        with SessionContext(read_only=True) as session:
            pgs, stats = tu.parse_text_to_bracket_form(session, setning)
        if len(pgs[0]) > 1:  # Parsed as multiple sentences; we want to merge them
            allar = ""
            for pg in pgs:
                for þáttuð_setning in pg:
                    allar = allar + þáttuð_setning
            hrein_þáttun = self.forvinnsla(allar)
            þáttun.append(hrein_þáttun)
            continue
        for pg in pgs:
            if not pg[0]:  # Empty sentence
                þáttun.append(
                    "(M (S x))"
                )  # Default baseline analysis of the sentence -- changed so that Evalb can handle it!
                continue
            for þáttuð_setning in pg:
                # Clean the sentence
                hrein_þáttun = self.forvinnsla(þáttuð_setning)
                þáttun.append(hrein_þáttun)
    return þáttun
Code example #13
File: bracket.py Project: vthorsteinsson/Reynir
#!/usr/bin/env python

import os
import sys

# Hack to make this Python program executable from the utils subdirectory
basepath, _ = os.path.split(os.path.realpath(__file__))
_UTILS = os.sep + "utils"
if basepath.endswith(_UTILS):
    basepath = basepath[0:-len(_UTILS)]
    sys.path.append(basepath)

from settings import Settings
from db import SessionContext
from treeutil import TreeUtility as tu

Settings.read(os.path.join(basepath, "config", "Reynir.conf"))
Settings.DEBUG = False

TEXT = "Ég bý í Baugatanga 6. Hér er prófun á þáttun texta."

with SessionContext(read_only = True) as session:
    pgs, stats = tu.parse_text_to_bracket_form(session, TEXT)

for pg in pgs:
    for sent in pg:
        print(sent)

Code example #14
File: main.py Project: Loknar/Greynir
def tree_grid():
    """ Show a simplified parse tree for a single sentence """

    txt = request.args.get("txt", "")
    with SessionContext(commit=True) as session:
        # Obtain simplified tree, full tree and stats
        tree, full_tree, stats = TreeUtility.parse_text_with_full_tree(
            session, txt)
        if full_tree is not None:
            # Create a more manageable, flatter tree from the binarized raw parse tree
            full_tree = ParseForestFlattener.flatten(full_tree)

    # Preprocess the trees for display, projecting them to a 2d table structure
    def _wrap_build_tbl(tbl, root, is_nt_func, children_func, nt_info_func,
                        t_info_func):
        def _build_tbl(level, offset, nodelist):
            """ Add the tree node data to be displayed at a particular
                level (row) in the result table """
            while len(tbl) <= level:
                tbl.append([])
            tlevel = tbl[level]
            left = sum(t[0] for t in tlevel)
            while left < offset:
                # Insert a left margin if required
                # (necessary if we've already inserted a terminal at a
                # level above this one)
                tlevel.append((1, None))
                left += 1
            index = offset
            if nodelist:
                for n in nodelist:
                    if is_nt_func(n):
                        # Nonterminal: display the child nodes in deeper levels
                        # and add a header on top of them, spanning their total width
                        cnt = _build_tbl(level + 1, index, children_func(n))
                        tlevel.append((cnt, nt_info_func(n)))
                        index += cnt
                    else:
                        # Terminal: display it in a single column
                        tlevel.append((1, t_info_func(n)))
                        index += 1
            return index - offset

        return _build_tbl(0, 0, [root])

    def _normalize_tbl(tbl, width):
        """ Fill out the table with blanks so that it is square """
        for row in tbl:
            rw = sum(t[0] for t in row)
            # Right-pad as required
            while rw < width:
                row.append((1, None))
                rw += 1

    tbl = []
    full_tbl = []
    if tree is None:
        full_tree = None
        width = 0
        full_width = 0
        height = 0  # Height of simplified table
        full_height = 0  # Height of full table
    else:

        # Build a table structure for a simplified tree
        width = _wrap_build_tbl(
            tbl,
            tree,
            is_nt_func=lambda n: n["k"] == "NONTERMINAL",
            children_func=lambda n: n["p"],
            nt_info_func=lambda n: dict(n=n["n"], error=False),
            t_info_func=lambda n: n,
        )
        height = len(tbl)
        if width and height:
            _normalize_tbl(tbl, width)

        # Build a table structure for a full tree
        full_width = _wrap_build_tbl(
            full_tbl,
            full_tree,
            is_nt_func=lambda n: n.is_nonterminal,
            children_func=lambda n: n.children,
            nt_info_func=lambda n: dict(
                n=n.p.name, sc=n.score, error=n.p.has_tag("error")),
            t_info_func=lambda n: dict(t=n.p[0].name, sc=n.score, x=n.p[1].t1),
        )
        assert full_width == width
        full_height = len(full_tbl)
        if full_width and full_height:
            _normalize_tbl(full_tbl, full_width)

    return render_template(
        "treegrid.html",
        txt=txt,
        tree=tree,
        stats=stats,
        tbl=tbl,
        height=height,
        full_tbl=full_tbl,
        full_height=full_height,
    )
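Both helpers operate on rows of (column span, node info) cells: _build_tbl appends a spanning header cell for each nonterminal and a width-1 cell for each terminal, and _normalize_tbl pads every row with (1, None) blanks to the full table width. A stand-alone sketch of that convention, with made-up labels:

# Hypothetical 3-column projection of a tiny tree: row 0 is the root
# spanning all columns, and deeper rows subdivide that span
tbl = [[(3, "S")], [(2, "NP"), (1, "VP")], [(1, "Det"), (1, "N")]]

def _normalize_tbl(tbl, width):
    """ Fill out the table with blanks so that it is square """
    for row in tbl:
        rw = sum(t[0] for t in row)
        while rw < width:
            row.append((1, None))
            rw += 1

_normalize_tbl(tbl, 3)
print(tbl[2])  # -> [(1, 'Det'), (1, 'N'), (1, None)]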
Code example #15
File: test_processors.py Project: Loknar/Greynir
def test_entities():
    text = """

       Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota.

       Danska byggingavörukeðjan Bygma hefur keypt íslenska
       verslunarfyrirtækið Húsasmiðjuna.

       Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs
       eru hluthafar í Arion banka.

       Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra.
       Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær.
       Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær.

       Primera Air var íslenskt flugfélag.
       Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag.

       Villeneuve-Loubet er franskt þorp.

       Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda.
       
       Í miðbæ Reykjavíkur er herrafataverslunin Geysir.

       Mér er sagt að Geysir sé hættur að gjósa.
       
       Geysir er hættur að gjósa.
       
       Geysir er gamall goshver.
       
       Fyrirtækið Apple-búðin selur Apple Mac tölvur.
       Fyrirtækið Origo selur IBM tölvur.
       
       Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
       
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse"
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree,
                                                 token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree])
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val)
                          for key, val in trees.items())

    tree = Tree()
    tree.load(tree_string)

    session = SessionShim()
    tree.process(session, entities)

    session.check(("Bygma", "er", "dönsk byggingavörukeðja"))
    session.check(("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"))
    session.check(("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"))
    session.check(
        ("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Primera Air", "var", "íslenskt flugfélag"))
    session.check(("Villeneuve-Loubet", "er", "franskt þorp"))
    session.check(("Valdís", "er", "ísbúð"))
    session.check(("Fosshótel", "var", "rekin með tapi"))
    session.check(("Fosshótel", "er", "stór hótelkeðja"))
    session.check(("Norðurál", "er", "álverksmiðjan í Hvalfirði"))
    session.check(("Lax", "er", "stór fiskur af ætt laxfiska"))
    session.check(("Geysir", "er", "gamall goshver"))
    session.check(("Eimskipafélag Íslands hf", "er", "skipafélag"))
    session.check(("Origo", "er", "fyrirtæki"))
    session.check(("AirBerlin", "er", "flugfélag"))

    assert session.is_empty()
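SessionShim itself is not shown in this snippet. A minimal stand-in consistent with how it is used here (collecting the facts the entities processor adds, then ticking them off with check()) might look like the sketch below; this is an assumption for illustration, not the actual class from the Greynir test suite:

class SessionShim:
    """ Hypothetical stand-in for the test helper used above """
    def __init__(self):
        self.triples = set()
    def add(self, entity):
        # The processor adds ORM-style rows; keep only the fields
        # that the check() calls compare against (an assumption)
        self.triples.add((entity.name, entity.verb, entity.definition))
    def check(self, triple):
        # Assert the fact was registered, then tick it off
        assert triple in self.triples, "Missing: {0}".format(triple)
        self.triples.remove(triple)
    def is_empty(self):
        # True when every registered fact has been checked off
        return not self.triples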
Code example #16
File: main.py Project: vthorsteinsson/Reynir
def tree_grid():
    """ Show a simplified parse tree for a single sentence """

    txt = request.args.get("txt", "")
    with SessionContext(commit=True) as session:
        # Obtain simplified tree, full tree and stats
        tree, full_tree, stats = TreeUtility.parse_text_with_full_tree(session, txt)
        if full_tree is not None:
            # Create a more manageable, flatter tree from the binarized raw parse tree
            full_tree = ParseForestFlattener.flatten(full_tree)

    # Preprocess the trees for display, projecting them to a 2d table structure
    def _wrap_build_tbl(
        tbl, root, is_nt_func, children_func, nt_info_func, t_info_func
    ):
        def _build_tbl(level, offset, nodelist):
            """ Add the tree node data to be displayed at a particular
                level (row) in the result table """
            while len(tbl) <= level:
                tbl.append([])
            tlevel = tbl[level]
            left = sum(t[0] for t in tlevel)
            while left < offset:
                # Insert a left margin if required
                # (necessary if we've already inserted a terminal at a
                # level above this one)
                tlevel.append((1, None))
                left += 1
            index = offset
            if nodelist is not None:
                for n in nodelist:
                    if is_nt_func(n):
                        # Nonterminal: display the child nodes in deeper levels
                        # and add a header on top of them, spanning their total width
                        cnt = _build_tbl(level + 1, index, children_func(n))
                        tlevel.append((cnt, nt_info_func(n)))
                        index += cnt
                    else:
                        # Terminal: display it in a single column
                        tlevel.append((1, t_info_func(n)))
                        index += 1
            return index - offset

        return _build_tbl(0, 0, [root])

    def _normalize_tbl(tbl, width):
        """ Fill out the table with blanks so that it is square """
        for row in tbl:
            rw = sum(t[0] for t in row)
            # Right-pad as required
            while rw < width:
                row.append((1, None))
                rw += 1

    tbl = []
    full_tbl = []
    if tree is None:
        full_tree = None
        width = 0
        full_width = 0
        height = 0  # Height of simplified table
        full_height = 0  # Height of full table
    else:

        # Build a table structure for a simplified tree
        width = _wrap_build_tbl(
            tbl,
            tree,
            is_nt_func=lambda n: n["k"] == "NONTERMINAL",
            children_func=lambda n: n["p"],
            nt_info_func=lambda n: dict(n=n["n"], error=False),
            t_info_func=lambda n: n,
        )
        height = len(tbl)
        if width and height:
            _normalize_tbl(tbl, width)

        # Build a table structure for a full tree
        full_width = _wrap_build_tbl(
            full_tbl,
            full_tree,
            is_nt_func=lambda n: n.is_nonterminal,
            children_func=lambda n: n.children,
            nt_info_func=lambda n: dict(n=n.p.name, error=n.p.has_tag("error")),
            t_info_func=lambda n: dict(t=n.p[0].name, x=n.p[1].t1),
        )
        assert full_width == width
        full_height = len(full_tbl)
        if full_width and full_height:
            _normalize_tbl(full_tbl, full_width)

    return render_template(
        "treegrid.html",
        txt=txt,
        tree=tree,
        stats=stats,
        tbl=tbl,
        height=height,
        full_tbl=full_tbl,
        full_height=full_height,
    )
Code example #17
File: bracket.py Project: reynirf/Greynir
#!/usr/bin/env python

import os
import sys

# Hack to make this Python program executable from the utils subdirectory
basepath, _ = os.path.split(os.path.realpath(__file__))
_UTILS = os.sep + "utils"
if basepath.endswith(_UTILS):
    basepath = basepath[0:-len(_UTILS)]
    sys.path.append(basepath)

from settings import Settings
from db import SessionContext
from treeutil import TreeUtility as tu

Settings.read(os.path.join(basepath, "config", "Greynir.conf"))
Settings.DEBUG = False

TEXT = "Ég bý í Baugatanga 6. Hér er prófun á þáttun texta."

with SessionContext(read_only=True) as session:
    pgs, stats = tu.parse_text_to_bracket_form(session, TEXT)

for pg in pgs:
    for sent in pg:
        print(sent)
Code example #18
File: article.py Project: haukurb/Reynir
    def _parse(self, enclosing_session=None, verbose=False):
        """ Parse the article content to yield parse trees and annotated token list """
        with SessionContext(enclosing_session) as session:

            # Convert the content soup to a token iterable (generator)
            toklist = Fetcher.tokenize_html(self._url, self._html, session)

            bp = self.get_parser()
            ip = IncrementalParser(bp, toklist, verbose=verbose)

            # List of paragraphs containing a list of sentences containing token lists
            # for sentences in string dump format (1-based paragraph and sentence indices)
            pgs = []

            # Dict of parse trees in string dump format,
            # stored by sentence index (1-based)
            trees = OrderedDict()

            # Word stem dictionary, indexed by (stem, cat)
            words = defaultdict(int)
            num_sent = 0

            for p in ip.paragraphs():

                pgs.append([])

                for sent in p.sentences():

                    num_sent += 1
                    num_tokens = len(sent)

                    # We don't attempt to parse very long sentences (>100 tokens)
                    # since they are memory intensive (>16 GB) and may take
                    # minutes to process
                    if num_tokens <= MAX_SENTENCE_TOKENS and sent.parse():
                        # Obtain a text representation of the parse tree
                        token_dicts = TreeUtility.dump_tokens(
                            sent.tokens, sent.tree, words)
                        # Create a verbose text representation of
                        # the highest scoring parse tree
                        tree = ParseForestDumper.dump_forest(
                            sent.tree, token_dicts=token_dicts)
                        # Add information about the sentence tree's score
                        # and the number of tokens
                        trees[num_sent] = "\n".join([
                            "C{0}".format(sent.score),
                            "L{0}".format(num_tokens), tree
                        ])
                    else:
                        # Error, sentence too long or no parse:
                        # add an error index entry for this sentence
                        if num_tokens > MAX_SENTENCE_TOKENS:
                            # Set the error index at the first
                            # token outside the maximum limit
                            eix = MAX_SENTENCE_TOKENS
                        else:
                            eix = sent.err_index
                        token_dicts = TreeUtility.dump_tokens(
                            sent.tokens, None, None, eix)
                        trees[num_sent] = "E{0}".format(eix)

                    pgs[-1].append(token_dicts)

            # parse_time = ip.parse_time

            self._parsed = datetime.utcnow()
            self._parser_version = bp.version
            self._num_tokens = ip.num_tokens
            self._num_sentences = ip.num_sentences
            self._num_parsed = ip.num_parsed
            self._ambiguity = ip.ambiguity

            # Make one big JSON string for the paragraphs, sentences and tokens
            self._raw_tokens = pgs
            self._tokens = json.dumps(pgs,
                                      separators=(",", ":"),
                                      ensure_ascii=False)

            # Keep the bag of words (stem, category, count for each word)
            self._words = words

            # Create a tree representation string out of all the accumulated parse trees
            self._tree = "".join("S{0}\n{1}\n".format(key, val)
                                 for key, val in trees.items())
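For reference, the per-sentence entries accumulated above have two shapes: a parsed sentence stores "C<score>", "L<token count>" and the forest dump, while a failed or over-long sentence stores just "E<error token index>". A mixed dump therefore looks like this sketch (scores and indices made up):

from collections import OrderedDict

trees = OrderedDict([(1, "C42\nL5\n(forest dump)"), (2, "E3")])
tree_str = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
print(tree_str)
# S1
# C42
# L5
# (forest dump)
# S2
# E3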