Example #1
    def _process_text(parser, session, text, all_names, xform):
        """ Low-level utility function to parse text and return the result of
            a transformation function (xform) for each sentence.
            Set all_names = True to get a comprehensive name register.
            Set all_names = False to get a simple name register.
            Set all_names = None to get no name register. """
        t0 = time.time()
        # Demarcate paragraphs in the input
        text = mark_paragraphs(text)
        # Tokenize the result
        token_stream = tokenize(text)
        toklist = list(recognize_entities(token_stream, enclosing_session=session))
        t1 = time.time()
        pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)

        if all_names is None:
            register = None
        else:
            from query import create_name_register
            register = create_name_register(toklist, session, all_names=all_names)

        t2 = time.time()
        stats["tok_time"] = t1 - t0
        stats["parse_time"] = t2 - t1
        stats["total_time"] = t2 - t0
        return (pgs, stats, register)
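
A minimal usage sketch for the function above, assuming it is a static method on TreeUtility and that a Fast_Parser instance and a database session already exist; the two-argument callback signature shown for xform is an assumption and the callback itself is purely hypothetical:

    def identity_xform(tokens, tree):
        # Hypothetical transformation callback; the (tokens, tree) signature is assumed
        return tokens

    # `parser` is assumed to be a previously constructed Fast_Parser instance
    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility._process_text(
            parser, session, "Einar keypti bókina í gær.",
            all_names=True, xform=identity_xform,
        )
        print(stats["total_time"], register)
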
Example #2
    def _process_text(parser, session, text, all_names, xform):
        """ Low-level utility function to parse text and return the result of
            a transformation function (xform) for each sentence.
            Set all_names = True to get a comprehensive name register.
            Set all_names = False to get a simple name register.
            Set all_names = None to get no name register. """
        t0 = time.time()
        # Demarcate paragraphs in the input
        text = mark_paragraphs(text)
        # Tokenize the result
        token_stream = tokenize(text)
        toklist = list(
            recognize_entities(token_stream, enclosing_session=session))
        t1 = time.time()
        pgs, stats = TreeUtility._process_toklist(parser, session, toklist,
                                                  xform)

        if all_names is None:
            register = None
        else:
            from queries.builtin import create_name_register

            register = create_name_register(toklist,
                                            session,
                                            all_names=all_names)

        t2 = time.time()
        stats["tok_time"] = t1 - t0
        stats["parse_time"] = t2 - t1
        stats["total_time"] = t2 - t0
        return (pgs, stats, register)
Example #3
    def make_tree(text: str) -> Tree:
        toklist = tokenize(text)
        fp = Fast_Parser(verbose=False)
        ip = IncrementalParser(fp, toklist, verbose=False)
        # Dict of parse trees in string dump format,
        # stored by sentence index (1-based)
        trees = OrderedDict()
        num_sent = 0
        for p in ip.paragraphs():
            for sent in p.sentences():
                num_sent += 1
                num_tokens = len(sent)
                assert sent.parse(), "Sentence does not parse: " + sent.text
                # Obtain a text representation of the parse tree
                token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
                # Create a verbose text representation of
                # the highest scoring parse tree
                tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
                # Add information about the sentence tree's score
                # and the number of tokens
                trees[num_sent] = "\n".join(
                    ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
                )
        # Create a tree representation string out of
        # all the accumulated parse trees
        tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())

        tree = Tree()
        tree.load(tree_string)
        return tree
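
For orientation, a hedged sketch of calling make_tree and of the tree_string layout the loop above assembles: each sentence contributes an S marker with its 1-based index, a C line with the parse score, an L line with the token count, and then the dumped parse forest (the values below are illustrative only):

    tree = make_tree("Einar keypti bókina í gær.")

    # Illustrative shape of the intermediate tree_string:
    # S1
    # C<score of sentence 1>
    # L<token count of sentence 1>
    # <parse forest dump of sentence 1>
    # S2
    # ...
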
Example #4
    def parse(self, result):
        """ Parse the query from its string, returning True if valid """
        self._tree = None  # Erase previous tree, if any
        self._error = None  # Erase previous error, if any
        self._qtype = None  # Erase previous query type, if any
        self._key = None
        self._toklist = None

        q = self._query.strip()
        if not q:
            self.set_error("E_EMPTY_QUERY")
            return False

        toklist = tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())
        toklist = list(recognize_entities(toklist, enclosing_session=self._session))

        actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
        if actual_q:
            actual_q = actual_q[0].upper() + actual_q[1:]
            if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
                actual_q += "?"

        # Update the beautified query string, as the actual_q string
        # probably has more correct capitalization
        self.set_beautified_query(actual_q)

        if Settings.DEBUG:
            # Log the query string as seen by the parser
            print("Query is: '{0}'".format(actual_q))

        parse_result, trees = Query._parse(toklist)

        if not trees:
            # No parse at all
            self.set_error("E_NO_PARSE_TREES")
            return False

        result.update(parse_result)

        if result["num_sent"] != 1:
            # Queries must be one sentence
            self.set_error("E_MULTIPLE_SENTENCES")
            return False
        if result["num_parsed_sent"] != 1:
            # Unable to parse the single sentence
            self.set_error("E_NO_PARSE")
            return False
        if 1 not in trees:
            # No sentence number 1
            self.set_error("E_NO_FIRST_SENTENCE")
            return False
        # Looks good
        # Store the resulting parsed query as a tree
        tree_string = "S1\n" + trees[1]
        if Settings.DEBUG:
            print(tree_string)
        self._tree = Tree()
        self._tree.load(tree_string)
        # Store the token list
        self._toklist = toklist
        return True
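
A hedged sketch of driving parse() from the caller's side; q is assumed to be an already constructed Query instance, and the result dict is populated by parse() with the fields checked above (num_sent, num_parsed_sent):

    result = dict()
    # `q` is assumed to be an already constructed Query instance
    if q.parse(result):
        # On success, exactly one sentence was parsed and the query tree is ready
        print("Parsed", result["num_sent"], "sentence")
    else:
        print("Query did not parse")
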
Example #5
 def gen_tuples(self) -> Iterable[LemmaTuple]:
     """ Generate (lemma, cat) tuples from the document by
         tokenizing it into paragraphs and sentences, then
         lemmatizing each sentence """
     tokens = tokenize(self._text)
     for pg in paragraphs(tokens):
         for _, sent in pg:
             yield from self.lemmatize(sent)
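
A minimal sketch of consuming the generator above; doc is assumed to be an instance of the lemmatizing document class that defines gen_tuples(), and Counter simply tallies the (lemma, cat) pairs:

    from collections import Counter

    # `doc` is assumed to be an instance of the class defining gen_tuples()
    freqs = Counter(doc.gen_tuples())
    for (lemma, cat), count in freqs.most_common(10):
        print(lemma, cat, count)
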
Example #6
    def to_tokens(soup, enclosing_session=None):
        """ Convert an HTML soup root into a parsable token stream """

        # Extract the text content of the HTML into a list
        tlist = Fetcher.TextList()
        Fetcher.extract_text(soup, tlist)
        text = tlist.result()

        # Tokenize the resulting text, returning a generator
        token_stream = tokenize(text)
        return recognize_entities(token_stream, enclosing_session=enclosing_session)
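
A hedged sketch of feeding an HTML document through the function above, assuming to_tokens is a static method on Fetcher, that BeautifulSoup produces the soup root, and that the SessionContext wrapper from Example #8 is available:

    from bs4 import BeautifulSoup

    html = "<html><body><p>Einar keypti bókina í gær.</p></body></html>"
    soup = BeautifulSoup(html, "html.parser")
    with SessionContext(commit=True) as session:
        for token in Fetcher.to_tokens(soup, enclosing_session=session):
            print(token.kind, token.txt)
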
Example #7
    def to_tokens(soup, enclosing_session=None):
        """ Convert an HTML soup root into a parsable token stream """

        # Extract the text content of the HTML into a list
        tlist = Fetcher.TextList()
        Fetcher.extract_text(soup, tlist)
        text = tlist.result()

        # Tokenize the resulting text, returning a generator
        token_stream = tokenize(text)
        return recognize_entities(token_stream,
                                  enclosing_session=enclosing_session)
Example #8
def query_api(version=1):
    """ Respond to a query string """

    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    if request.method == "GET":
        q = request.args.get("q", "")
    else:
        q = request.form.get("q", "")
    q = q.strip()[0:_MAX_QUERY_LENGTH]

    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = bool_from_request(request, "autouppercase", True)
    result = dict()
    ql = q.lower()

    if ql in _SPECIAL_QUERIES or (ql + "?") in _SPECIAL_QUERIES:
        result["valid"] = True
        result["qtype"] = "Special"
        result["q"] = q
        if ql in _SPECIAL_QUERIES:
            result["response"] = _SPECIAL_QUERIES[ql]
        else:
            result["response"] = _SPECIAL_QUERIES[ql + "?"]
    else:
        with SessionContext(commit=True) as session:

            toklist = tokenize(
                q, auto_uppercase=q.islower() if auto_uppercase else False
            )
            toklist = list(recognize_entities(toklist, enclosing_session=session))
            actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))

            # if Settings.DEBUG:
            #     # Log the query string as seen by the parser
            #     print("Query is: '{0}'".format(actual_q))

            # Try to parse and process as a query
            try:
                is_query = process_query(session, toklist, result)
            except Exception:
                is_query = False

        result["valid"] = is_query
        result["q"] = actual_q

    return better_jsonify(**result)
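
A hedged way to exercise the endpoint with Flask's test client; the route path and the app object are assumptions, since the snippet above only shows the view function itself:

    # `app` is assumed to be the Flask application that routes to query_api;
    # the "/query.api" path is likewise an assumption
    with app.test_client() as client:
        resp = client.get("/query.api", query_string={"q": "Hver er forseti Íslands?"})
        data = resp.get_json()
        print(data["valid"], data.get("qtype"), data.get("q"))
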
Example #9
def _make_tree(text: str) -> Tuple[Tree, str]:
    """Tokenize and parse the text, create a tree representation string
    from all the parse trees, and return the Tree object together with
    the token list serialized as JSON."""
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)

    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)

    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json
Example #10
 def correct_text(self,
                  text: StringIterable,
                  *,
                  only_rare: bool = False) -> str:
     """Attempt to correct all words within a text, returning the corrected text.
     If only_rare is True, correction is only attempted on rare words."""
     result: List[str] = []
     look_back = -MAX_ORDER + 1
     for token in tokenize(text):
         if token.kind == TOK.WORD:
             if only_rare and not self.is_rare(token.txt):
                 # The word is not rare, so we don't attempt correction
                 result.append(token.txt)
             else:
                 # Correct the word and return the result
                 result.append(
                     self.correct(token.txt,
                                  context=tuple(result[look_back:])))
         elif token.txt:
             result.append(token.txt)
         elif token.kind in {TOK.S_BEGIN, TOK.S_END}:
             result.append("")
     return correct_spaces(" ".join(result))
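
A minimal usage sketch, assuming corrector is an instance of the spelling-corrector class that defines correct_text(); the sample sentence contains a deliberate misspelling:

    # `corrector` is assumed to be an instance of the corrector class above
    fixed = corrector.correct_text("Þeta er prufa", only_rare=False)
    print(fixed)  # expected: the misspelled word corrected, spacing normalized
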
Example #11
def test_entities():
    text = """

       Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota.

       Danska byggingavörukeðjan Bygma hefur keypt íslenska
       verslunarfyrirtækið Húsasmiðjuna.

       Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs
       eru hluthafar í Arion banka.

       Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra.
       Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær.
       Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær.

       Primera Air var íslenskt flugfélag.
       Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag.

       Villeneuve-Loubet er franskt þorp.

       Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda.
       
       Í miðbæ Reykjavíkur er herrafataverslunin Geysir.

       Mér er sagt að Geysir sé hættur að gjósa.
       
       Geysir er hættur að gjósa.
       
       Geysir er gamall goshver.
       
       Fyrirtækið Apple-búðin selur Apple Mac tölvur.
       Fyrirtækið Origo selur IBM tölvur.
       
       Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
       
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse"
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree,
                                                 token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree])
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val)
                          for key, val in trees.items())

    tree = Tree()
    tree.load(tree_string)

    session = SessionShim()
    tree.process(session, entities)

    session.check(("Bygma", "er", "dönsk byggingavörukeðja"))
    session.check(("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"))
    session.check(("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"))
    session.check(
        ("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Primera Air", "var", "íslenskt flugfélag"))
    session.check(("Villeneuve-Loubet", "er", "franskt þorp"))
    session.check(("Valdís", "er", "ísbúð"))
    session.check(("Fosshótel", "var", "rekin með tapi"))
    session.check(("Fosshótel", "er", "stór hótelkeðja"))
    session.check(("Norðurál", "er", "álverksmiðjan í Hvalfirði"))
    session.check(("Lax", "er", "stór fiskur af ætt laxfiska"))
    session.check(("Geysir", "er", "gamall goshver"))
    session.check(("Eimskipafélag Íslands hf", "er", "skipafélag"))
    session.check(("Origo", "er", "fyrirtæki"))
    session.check(("AirBerlin", "er", "flugfélag"))

    assert session.is_empty()
Example #12
    def tag(self, toklist_or_text):
        """ Assign IFD tags to the given toklist, putting the tag in the
            "i" field of each non-punctuation token. If a string is passed,
            tokenize it first. Return the toklist so modified. """
        if isinstance(toklist_or_text, str):
            toklist = list(tokenize(toklist_or_text))
        else:
            toklist = list(toklist_or_text)

        tagsets = []
        for t in toklist:
            if not t.txt:
                continue
            taglist = self.tag_single_token(t)
            if taglist:
                #    display = " | ".join("{0} {1:.2f}".format(w, p) for w, p in taglist)
                #    print("{0:20}: {1}".format(t.txt, display))
                tagsets.append(taglist)

        _, tags = self._most_likely(tagsets)

        if not tags:
            return []

        def gen_tokens():
            """ Generate a Greynir token sequence from a tagging result """
            ix = 0
            for t in toklist:
                if not t.txt:
                    continue
                # The code below should correspond to TreeUtility._describe_token()
                d = dict(x = t.txt)
                if t.kind == TOK.WORD:
                    # set d["m"] to the meaning
                    pass
                else:
                    d["k"] = t.kind
                if t.val is not None and t.kind not in { TOK.WORD, TOK.ENTITY, TOK.PUNCTUATION }:
                    # For tokens except words, entities and punctuation, include the val field
                    if t.kind == TOK.PERSON:
                        d["v"], d["g"] = TreeUtility.choose_full_name(t.val, case = None, gender = None)
                    else:
                        d["v"] = t.val
                if t.kind in { TOK.WORD, TOK.ENTITY, TOK.PERSON, TOK.NUMBER, TOK.YEAR, TOK.ORDINAL, TOK.PERCENT }:
                    d["i"] = tags[ix]
                    ix += 1
                if t.kind == TOK.WORD and " " in d["x"]:
                    # Some kind of phrase: split it
                    xlist = d["x"].split()
                    for x in xlist:
                        d["x"] = x
                        if x == "og":
                            # Probably intermediate word: fjármála- og efnahagsráðherra
                            yield dict(x = "og", i = "c")
                        else:
                            yield d.copy()
                elif t.kind == TOK.PERSON:
                    # Split person tokens into subtokens for each name component
                    xlist = d["x"].split() # Name as it originally appeared
                    slist = d["v"].split() # Stem (nominal) form of name
                    # xlist may be shorter than slist, but that is OK
                    for x, s in zip(xlist, slist):
                        d["x"] = x
                        d["v"] = s
                        yield d.copy()
                elif t.kind == TOK.ENTITY:
                    # Split entity tokens into subtokens for each name component
                    xlist = d["x"].split() # Name as it originally appeared
                    for x in xlist:
                        d["x"] = x
                        yield d.copy()
                # !!! TBD: Tokens such as dates, amounts and currencies
                # !!! should be split here into multiple subtokens
                else:
                    yield d

        return [ d for d in gen_tokens() ]
Example #13
    def tag(self, toklist_or_text):
        """ Assign IFD tags to the given toklist, putting the tag in the
            "i" field of each non-punctuation token. If a string is passed,
            tokenize it first. Return the toklist so modified. """
        if isinstance(toklist_or_text, str):
            toklist = list(tokenize(toklist_or_text))
        else:
            toklist = list(toklist_or_text)

        tagsets = []
        for t in toklist:
            if not t.txt:
                continue
            taglist = self.tag_single_token(t)
            if taglist:
                #    display = " | ".join("{0} {1:.2f}".format(w, p) for w, p in taglist)
                #    print("{0:20}: {1}".format(t.txt, display))
                tagsets.append(taglist)

        _, tags = self._most_likely(tagsets)

        if not tags:
            return []

        def gen_tokens():
            """ Generate a Greynir token sequence from a tagging result """
            ix = 0
            for t in toklist:
                if not t.txt:
                    continue
                # The code below should correspond to TreeUtility._describe_token()
                d = dict(x=t.txt)
                if t.kind == TOK.WORD:
                    # set d["m"] to the meaning
                    pass
                else:
                    d["k"] = t.kind
                if t.val is not None and t.kind not in {
                        TOK.WORD,
                        TOK.ENTITY,
                        TOK.PUNCTUATION,
                }:
                    # For tokens except words, entities and punctuation, include the val field
                    if t.kind == TOK.PERSON:
                        d["v"], d["g"] = TreeUtility.choose_full_name(
                            t.val, case=None, gender=None)
                    else:
                        d["v"] = t.val
                if t.kind in {
                        TOK.WORD,
                        TOK.ENTITY,
                        TOK.PERSON,
                        TOK.NUMBER,
                        TOK.YEAR,
                        TOK.ORDINAL,
                        TOK.PERCENT,
                }:
                    d["i"] = tags[ix]
                    ix += 1
                if t.kind == TOK.WORD and " " in d["x"]:
                    # Some kind of phrase: split it
                    xlist = d["x"].split()
                    for x in xlist:
                        d["x"] = x
                        if x == "og":
                            # Probably intermediate word: fjármála- og efnahagsráðherra
                            yield dict(x="og", i="c")
                        else:
                            yield d.copy()
                elif t.kind == TOK.PERSON:
                    # Split person tokens into subtokens for each name component
                    xlist = d["x"].split()  # Name as it originally appeared
                    slist = d["v"].split()  # Stem (nominal) form of name
                    # xlist may be shorter than slist, but that is OK
                    for x, s in zip(xlist, slist):
                        d["x"] = x
                        d["v"] = s
                        yield d.copy()
                elif t.kind == TOK.ENTITY:
                    # Split entity tokens into subtokens for each name component
                    xlist = d["x"].split()  # Name as it originally appeared
                    for x in xlist:
                        d["x"] = x
                        yield d.copy()
                # !!! TBD: Tokens such as dates, amounts and currencies
                # !!! should be split here into multiple subtokens
                else:
                    yield d

        return [d for d in gen_tokens()]
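
A short sketch of using the tagger, assuming tagger is an instance of the class that defines tag(); each returned dict carries the token text in "x" and, for taggable tokens, the IFD tag in "i":

    # `tagger` is assumed to be an instance of the IFD tagger class above
    for d in tagger.tag("Einar keypti bókina í gær"):
        print(d["x"], d.get("i", ""))
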
Example #14
    def parse(self, result: ResponseDict) -> bool:
        """ Parse the query from its string, returning True if valid """
        self._tree = None  # Erase previous tree, if any
        self._error = None  # Erase previous error, if any
        self._qtype = None  # Erase previous query type, if any
        self._key = None
        self._toklist = None

        q = self._query
        if not q:
            self.set_error("E_EMPTY_QUERY")
            return False

        # Tokenize and auto-capitalize the query string
        toklist = list(
            tokenize(q, auto_uppercase=self._auto_uppercase and q.islower()))

        actual_q = self._query_string_from_toklist(toklist)

        # Update the beautified query string, as the actual_q string
        # probably has more correct capitalization
        self.set_beautified_query(actual_q)

        # TODO: We might want to re-tokenize the actual_q string with
        # auto_uppercase=False, since we may have fixed capitalization
        # errors in _query_string_from_toklist()

        if Settings.DEBUG:
            # Log the query string as seen by the parser
            print("Query is: '{0}'".format(actual_q))

        try:
            parse_result, trees = Query._parse(toklist)
        except ParseError:
            self.set_error("E_PARSE_ERROR")
            return False

        if not trees:
            # No parse at all
            self.set_error("E_NO_PARSE_TREES")
            return False

        result.update(parse_result)

        if result["num_sent"] != 1:
            # Queries must be one sentence
            self.set_error("E_MULTIPLE_SENTENCES")
            return False
        if result["num_parsed_sent"] != 1:
            # Unable to parse the single sentence
            self.set_error("E_NO_PARSE")
            return False
        if 1 not in trees:
            # No sentence number 1
            self.set_error("E_NO_FIRST_SENTENCE")
            return False
        # Looks good
        # Store the resulting parsed query as a tree
        tree_string = "S1\n" + trees[1]
        if Settings.DEBUG:
            print(tree_string)
        self._tree = QueryTree()
        self._tree.load(tree_string)
        # Store the token list
        self._toklist = toklist
        return True