コード例 #1
0
ファイル: test_processors.py プロジェクト: thorunna/Greynir
    def make_tree(text: str) -> Tree:
        """Tokenize and parse `text` and return a Tree loaded with the result.

        Every sentence of every paragraph is parsed; the dump of each
        sentence's parse tree is prefixed with its score ("C" line) and
        token count ("L" line), and all dumps are concatenated, each under
        an "S<sentence index>" header, into the string given to Tree.load().

        Raises:
            AssertionError: if any sentence fails to parse.
        """
        toklist = tokenize(text)
        fp = Fast_Parser(verbose=False)
        ip = IncrementalParser(fp, toklist, verbose=False)
        # Dict of parse trees in string dump format,
        # stored by sentence index (1-based)
        trees = OrderedDict()
        num_sent = 0
        for p in ip.paragraphs():
            for sent in p.sentences():
                num_sent += 1
                num_tokens = len(sent)
                # Call parse() outside of an `assert` statement: asserts are
                # removed under `python -O`, which would silently skip the
                # parse itself and leave the sentence without a tree.
                if not sent.parse():
                    raise AssertionError("Sentence does not parse: " + sent.text)
                # Obtain a text representation of the parse tree
                token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
                # Create a verbose text representation of
                # the highest scoring parse tree
                tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
                # Add information about the sentence tree's score
                # and the number of tokens
                trees[num_sent] = "\n".join(
                    ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
                )
        # Create a tree representation string out of
        # all the accumulated parse trees
        tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())

        tree = Tree()
        tree.load(tree_string)
        return tree
コード例 #2
0
# The return annotation is quoted so that it is never evaluated at runtime.
def _make_tree(text: str) -> "tuple[Tree, str]":
    """Tokenize and parse text, create tree representation string
    from all the parse trees, return Tree object and token JSON.

    Returns:
        A (tree, tokens_json) pair: the Tree loaded from the concatenated
        parse tree dumps, and a compact JSON string holding, for each
        paragraph, the list of per-sentence token dumps.

    Raises:
        AssertionError: if any sentence fails to parse.
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)

    # One list per paragraph, each holding the token dumps of its sentences
    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            # Call parse() outside of an `assert` statement: asserts are
            # removed under `python -O`, which would silently skip the
            # parse itself and leave the sentence without a tree.
            if not sent.parse():
                raise AssertionError("Sentence does not parse: " + sent.text)
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)

    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json
コード例 #3
0
ファイル: test_processors.py プロジェクト: Loknar/Greynir
def test_entities():
    """Parse a fixed Icelandic text, run the `entities` processor over the
    resulting tree and verify the exact set of entity definitions that the
    session shim received.

    Raises:
        AssertionError: if a sentence fails to parse, or if the session
            still holds unchecked entries after all expected ones have
            been checked.
    """
    text = """

       Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota.

       Danska byggingavörukeðjan Bygma hefur keypt íslenska
       verslunarfyrirtækið Húsasmiðjuna.

       Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs
       eru hluthafar í Arion banka.

       Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra.
       Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær.
       Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær.

       Primera Air var íslenskt flugfélag.
       Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag.

       Villeneuve-Loubet er franskt þorp.

       Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda.
       
       Í miðbæ Reykjavíkur er herrafataverslunin Geysir.

       Mér er sagt að Geysir sé hættur að gjósa.
       
       Geysir er hættur að gjósa.
       
       Geysir er gamall goshver.
       
       Fyrirtækið Apple-búðin selur Apple Mac tölvur.
       Fyrirtækið Origo selur IBM tölvur.
       
       Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
       
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            # Call parse() outside of an `assert` statement: asserts are
            # removed under `python -O`, which would silently skip the
            # parse itself and leave the sentence without a tree.
            if not sent.parse():
                raise AssertionError("Sentence does not parse")
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree,
                                                 token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree])
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val)
                          for key, val in trees.items())

    tree = Tree()
    tree.load(tree_string)

    # Run the entities processor against a shim session that records
    # what the processor produces instead of using a real session
    session = SessionShim()
    tree.process(session, entities)

    # Each tuple is (entity name, verb, definition)
    session.check(("Bygma", "er", "dönsk byggingavörukeðja"))
    session.check(("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"))
    session.check(("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"))
    session.check(
        ("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Primera Air", "var", "íslenskt flugfélag"))
    session.check(("Villeneuve-Loubet", "er", "franskt þorp"))
    session.check(("Valdís", "er", "ísbúð"))
    session.check(("Fosshótel", "var", "rekin með tapi"))
    session.check(("Fosshótel", "er", "stór hótelkeðja"))
    session.check(("Norðurál", "er", "álverksmiðjan í Hvalfirði"))
    session.check(("Lax", "er", "stór fiskur af ætt laxfiska"))
    session.check(("Geysir", "er", "gamall goshver"))
    session.check(("Eimskipafélag Íslands hf", "er", "skipafélag"))
    session.check(("Origo", "er", "fyrirtæki"))
    session.check(("AirBerlin", "er", "flugfélag"))

    # Nothing beyond the expected entries may remain
    assert session.is_empty()
コード例 #4
0
ファイル: article.py プロジェクト: haukurb/Reynir
    def _parse(self, enclosing_session=None, verbose=False):
        """ Tokenize and parse the article's HTML, then store the token
            lists, parse tree dumps, word counts and parse statistics
            on this article instance """
        with SessionContext(enclosing_session) as session:

            # Turn the article HTML into a token iterable
            token_stream = Fetcher.tokenize_html(self._url, self._html, session)

            parser = self.get_parser()
            inc_parser = IncrementalParser(parser, token_stream, verbose=verbose)

            # Paragraphs, each a list of per-sentence token dumps
            paragraphs = []
            # Parse tree dumps (or error markers), keyed by the
            # 1-based running sentence index
            tree_dumps = OrderedDict()
            # Word counter, passed to the token dumper for parsed sentences
            word_counts = defaultdict(int)
            sent_index = 0

            for paragraph in inc_parser.paragraphs():

                sentence_dumps = []
                paragraphs.append(sentence_dumps)

                for sent in paragraph.sentences():

                    sent_index += 1
                    token_count = len(sent)
                    # Sentences above the token limit are never handed to
                    # the parser; they are recorded as errors instead
                    too_long = token_count > MAX_SENTENCE_TOKENS

                    if not too_long and sent.parse():
                        # Successful parse: dump the tokens and the tree
                        token_dicts = TreeUtility.dump_tokens(
                            sent.tokens, sent.tree, word_counts
                        )
                        forest_dump = ParseForestDumper.dump_forest(
                            sent.tree, token_dicts=token_dicts
                        )
                        # Prefix the dump with the score and token count
                        header = [
                            "C{0}".format(sent.score),
                            "L{0}".format(token_count),
                        ]
                        tree_dumps[sent_index] = "\n".join(header + [forest_dump])
                    else:
                        # Too long or no parse: for an over-long sentence the
                        # error index is the first token beyond the limit,
                        # otherwise it is the index reported by the sentence
                        err_index = (
                            MAX_SENTENCE_TOKENS if too_long else sent.err_index
                        )
                        token_dicts = TreeUtility.dump_tokens(
                            sent.tokens, None, None, err_index
                        )
                        tree_dumps[sent_index] = "E{0}".format(err_index)

                    sentence_dumps.append(token_dicts)

            # Record when and with which parser version the parse was done,
            # along with the statistics gathered by the incremental parser
            self._parsed = datetime.utcnow()
            self._parser_version = parser.version
            self._num_tokens = inc_parser.num_tokens
            self._num_sentences = inc_parser.num_sentences
            self._num_parsed = inc_parser.num_parsed
            self._ambiguity = inc_parser.ambiguity

            # Keep the nested token structure, and also a compact
            # JSON serialization of it
            self._raw_tokens = paragraphs
            self._tokens = json.dumps(
                paragraphs, separators=(",", ":"), ensure_ascii=False
            )

            # Keep the accumulated word counts
            self._words = word_counts

            # Concatenate all tree dumps, each under its sentence header
            self._tree = "".join(
                "S{0}\n{1}\n".format(index, dump)
                for index, dump in tree_dumps.items()
            )