Code example #1
    def recognize_entities(self, stream):
        """ Recognize named entities using the nertokenizer module,
            but construct tokens using the Correct_TOK class from
            reynir_correct """
        return nertokenizer.recognize_entities(
            stream, token_ctor=reynir_correct.Correct_TOK
        )
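A minimal driver sketch, not taken from the source: tokenize raw text with the standard tokenizer package and pass the stream through this wrapper. The checker variable is a placeholder for an instance of whatever class defines the method above.

    from tokenizer import tokenize

    # checker is assumed to be an instance of the class that defines
    # recognize_entities() above
    stream = tokenize("Halldór Laxness fékk Nóbelsverðlaunin 1955.")
    for tok in checker.recognize_entities(stream):
        print(tok.kind, tok.txt)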
Code example #2
File: treeutil.py Project: thorunna/Greynir
    def _process_text(parser, session, text, all_names, xform):
        """ Low-level utility function to parse text and return the result of
            a transformation function (xform) for each sentence.
            Set all_names = True to get a comprehensive name register.
            Set all_names = False to get a simple name register.
            Set all_names = None to get no name register. """
        t0 = time.time()
        # Demarcate paragraphs in the input
        text = mark_paragraphs(text)
        # Tokenize the result
        token_stream = tokenize(text)
        toklist = list(
            recognize_entities(token_stream, enclosing_session=session))
        t1 = time.time()
        pgs, stats = TreeUtility._process_toklist(parser, session, toklist,
                                                  xform)

        if all_names is None:
            register = None
        else:
            from queries.builtin import create_name_register

            register = create_name_register(toklist,
                                            session,
                                            all_names=all_names)

        t2 = time.time()
        stats["tok_time"] = t1 - t0
        stats["parse_time"] = t2 - t1
        stats["total_time"] = t2 - t0
        return (pgs, stats, register)
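The docstring lists three name-register modes. A hedged illustration of the corresponding calls follows; parser, session, text and xform are placeholders rather than names taken from the repository.

    # all_names=True: comprehensive name register
    pgs, stats, register = TreeUtility._process_text(
        parser, session, text, all_names=True, xform=xform
    )

    # all_names=None: skip the register entirely; register comes back as None
    pgs, stats, register = TreeUtility._process_text(
        parser, session, text, all_names=None, xform=xform
    )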
Code example #3
File: treeutil.py Project: vthorsteinsson/Reynir
    def _process_text(parser, session, text, all_names, xform):
        """ Low-level utility function to parse text and return the result of
            a transformation function (xform) for each sentence.
            Set all_names = True to get a comprehensive name register.
            Set all_names = False to get a simple name register.
            Set all_names = None to get no name register. """
        t0 = time.time()
        # Demarcate paragraphs in the input
        text = mark_paragraphs(text)
        # Tokenize the result
        token_stream = tokenize(text)
        toklist = list(recognize_entities(token_stream, enclosing_session=session))
        t1 = time.time()
        pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)

        if all_names is None:
            register = None
        else:
            from query import create_name_register
            register = create_name_register(toklist, session, all_names=all_names)

        t2 = time.time()
        stats["tok_time"] = t1 - t0
        stats["parse_time"] = t2 - t1
        stats["total_time"] = t2 - t0
        return (pgs, stats, register)
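This is the same function as code example #2; the only substantive difference is that the older vthorsteinsson/Reynir fork imports create_name_register from the query module instead of queries.builtin.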
Code example #4
    def parse(self, result):
        """ Parse the query from its string, returning True if valid """
        self._tree = None  # Erase previous tree, if any
        self._error = None  # Erase previous error, if any
        self._qtype = None  # Erase previous query type, if any
        self._key = None
        self._toklist = None

        q = self._query.strip()
        if not q:
            self.set_error("E_EMPTY_QUERY")
            return False

        toklist = tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())
        toklist = list(recognize_entities(toklist, enclosing_session=self._session))

        actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
        if actual_q:
            actual_q = actual_q[0].upper() + actual_q[1:]
            if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
                actual_q += "?"

        # Update the beautified query string, as the actual_q string
        # probably has more correct capitalization
        self.set_beautified_query(actual_q)

        if Settings.DEBUG:
            # Log the query string as seen by the parser
            print("Query is: '{0}'".format(actual_q))

        parse_result, trees = Query._parse(toklist)

        if not trees:
            # No parse at all
            self.set_error("E_NO_PARSE_TREES")
            return False

        result.update(parse_result)

        if result["num_sent"] != 1:
            # Queries must be one sentence
            self.set_error("E_MULTIPLE_SENTENCES")
            return False
        if result["num_parsed_sent"] != 1:
            # Unable to parse the single sentence
            self.set_error("E_NO_PARSE")
            return False
        if 1 not in trees:
            # No sentence number 1
            self.set_error("E_NO_FIRST_SENTENCE")
            return False
        # Looks good
        # Store the resulting parsed query as a tree
        tree_string = "S1\n" + trees[1]
        print(tree_string)
        self._tree = Tree()
        self._tree.load(tree_string)
        # Store the token list
        self._toklist = toklist
        return True
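A hedged sketch of calling parse(); construction of the Query instance is outside this excerpt, so query below is simply an assumed instance.

    result = dict()
    if query.parse(result):
        # The parse tree and token list are now stored on the instance,
        # and result carries the parse statistics
        print("parsed", result["num_sent"], "sentence")
    else:
        # An error code such as "E_EMPTY_QUERY", "E_MULTIPLE_SENTENCES"
        # or "E_NO_PARSE" was recorded via set_error()
        pass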
Code example #5
File: correct.py Project: mideind/Yfirlestur
    def recognize_entities(self, stream: Iterator[Tok]) -> Iterator[Tok]:
        """ Recognize named entities using the nertokenizer module,
            but construct tokens using the Correct_TOK class from
            reynir_correct """
        if CI_RUN:
            # Skip the recognize_entities pass if we are running in a
            # continuous integration environment, where we have no database
            return stream
        else:
            return nertokenizer.recognize_entities(
                stream, token_ctor=reynir_correct.Correct_TOK)
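CI_RUN is defined elsewhere in correct.py; one plausible definition, shown purely as an assumption, derives it from the environment.

    import os

    # Assumption: detect a continuous-integration run via the
    # conventional CI environment variable
    CI_RUN = os.environ.get("CI", "").lower() in ("1", "true", "yes")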
Code example #6
File: fetcher.py Project: vthorsteinsson/Reynir
    def to_tokens(soup, enclosing_session=None):
        """ Convert an HTML soup root into a parsable token stream """

        # Extract the text content of the HTML into a list
        tlist = Fetcher.TextList()
        Fetcher.extract_text(soup, tlist)
        text = tlist.result()

        # Tokenize the resulting text, returning a generator
        token_stream = tokenize(text)
        return recognize_entities(token_stream, enclosing_session=enclosing_session)
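A usage sketch that is not part of the repository: the soup parameter name suggests a BeautifulSoup document root, and the returned token stream is a generator that can be consumed lazily.

    from bs4 import BeautifulSoup

    soup = BeautifulSoup(html_text, "html.parser")  # html_text is assumed
    for tok in Fetcher.to_tokens(soup):
        print(tok.txt)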
Code example #7
File: fetcher.py Project: hinriksnaer/Greynir
    def to_tokens(soup, enclosing_session=None):
        """ Convert an HTML soup root into a parsable token stream """

        # Extract the text content of the HTML into a list
        tlist = Fetcher.TextList()
        Fetcher.extract_text(soup, tlist)
        text = tlist.result()

        # Tokenize the resulting text, returning a generator
        token_stream = tokenize(text)
        return recognize_entities(token_stream,
                                  enclosing_session=enclosing_session)
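Apart from the wrapping of the final call, this is identical to code example #6; the hinriksnaer/Greynir fork has not diverged here.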
Code example #8
File: api.py Project: vthorsteinsson/Reynir
def query_api(version=1):
    """ Respond to a query string """

    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    if request.method == "GET":
        q = request.args.get("q", "")
    else:
        q = request.form.get("q", "")
    q = q.strip()[0:_MAX_QUERY_LENGTH]

    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = bool_from_request(request, "autouppercase", True)
    result = dict()
    ql = q.lower()

    if ql in _SPECIAL_QUERIES or (ql + "?") in _SPECIAL_QUERIES:
        result["valid"] = True
        result["qtype"] = "Special"
        result["q"] = q
        if ql in _SPECIAL_QUERIES:
            result["response"] = _SPECIAL_QUERIES[ql]
        else:
            result["response"] = _SPECIAL_QUERIES[ql + "?"]
    else:
        with SessionContext(commit=True) as session:

            toklist = tokenize(
                q, auto_uppercase=q.islower() if auto_uppercase else False
            )
            toklist = list(recognize_entities(toklist, enclosing_session=session))
            actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))

            # if Settings.DEBUG:
            #     # Log the query string as seen by the parser
            #     print("Query is: '{0}'".format(actual_q))

            # Try to parse and process as a query
            try:
                is_query = process_query(session, toklist, result)
            except:
                is_query = False

        result["valid"] = is_query
        result["q"] = actual_q

    return better_jsonify(**result)
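A hedged client-side sketch of exercising this Flask handler; the route path is not visible in the excerpt, so /query.api below is an assumption.

    import requests

    # The handler accepts GET or POST with a "q" parameter;
    # auto-uppercasing can be disabled with autouppercase=false
    r = requests.get(
        "https://example.com/query.api",  # route path is an assumption
        params={"q": "hver er forseti íslands", "autouppercase": "false"},
    )
    data = r.json()
    print(data.get("valid"), data.get("q"))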