def recognize_entities(self, stream):
    """ Recognize named entities in the token stream via the
        nertokenizer module, constructing the resulting tokens
        with the Correct_TOK class from reynir_correct """
    # Delegate to nertokenizer, overriding its token constructor
    token_ctor = reynir_correct.Correct_TOK
    return nertokenizer.recognize_entities(stream, token_ctor=token_ctor)
def _process_text(parser, session, text, all_names, xform):
    """ Low-level utility function to parse text and return the result of
        a transformation function (xform) for each sentence.
        Set all_names = True to get a comprehensive name register.
        Set all_names = False to get a simple name register.
        Set all_names = None to get no name register. """
    started = time.time()
    # Demarcate paragraphs, tokenize, and annotate named entities
    toklist = list(
        recognize_entities(
            tokenize(mark_paragraphs(text)), enclosing_session=session
        )
    )
    tokenized = time.time()
    pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
    register = None
    if all_names is not None:
        # A name register was requested (simple or comprehensive)
        from queries.builtin import create_name_register

        register = create_name_register(toklist, session, all_names=all_names)
    finished = time.time()
    # Record timing statistics alongside the parse stats
    stats["tok_time"] = tokenized - started
    stats["parse_time"] = finished - tokenized
    stats["total_time"] = finished - started
    return (pgs, stats, register)
def _process_text(parser, session, text, all_names, xform):
    """ Low-level utility function to parse text and return the result of
        a transformation function (xform) for each sentence.
        Set all_names = True to get a comprehensive name register.
        Set all_names = False to get a simple name register.
        Set all_names = None to get no name register. """
    t_start = time.time()
    # Demarcate paragraphs in the input, then tokenize the result
    stream = tokenize(mark_paragraphs(text))
    # Annotate named entities and materialize the token list
    toklist = list(recognize_entities(stream, enclosing_session=session))
    t_tokenized = time.time()
    pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
    if all_names is None:
        # No name register requested
        register = None
    else:
        from query import create_name_register

        register = create_name_register(toklist, session, all_names=all_names)
    t_end = time.time()
    # Attach timing information to the statistics dictionary
    stats["tok_time"] = t_tokenized - t_start
    stats["parse_time"] = t_end - t_tokenized
    stats["total_time"] = t_end - t_start
    return (pgs, stats, register)
def parse(self, result):
    """ Parse the query from its string, returning True if valid.
        On success, stores the parse tree in self._tree and the token
        list in self._toklist, and updates the result dict with the
        parser's output. On failure, records an error code via
        self.set_error() and returns False. """
    self._tree = None  # Erase previous tree, if any
    self._error = None  # Erase previous error, if any
    self._qtype = None  # Erase previous query type, if any
    self._key = None
    self._toklist = None
    q = self._query.strip()
    if not q:
        self.set_error("E_EMPTY_QUERY")
        return False
    # Only auto-uppercase if the query is all-lowercase to begin with
    toklist = tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())
    toklist = list(recognize_entities(toklist, enclosing_session=self._session))
    actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
    if actual_q:
        # Capitalize the first letter and make sure the query
        # ends with a sentence-final punctuation mark
        actual_q = actual_q[0].upper() + actual_q[1:]
        if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
            actual_q += "?"
    # Update the beautified query string, as the actual_q string
    # probably has more correct capitalization
    self.set_beautified_query(actual_q)
    if Settings.DEBUG:
        # Log the query string as seen by the parser
        print("Query is: '{0}'".format(actual_q))
    parse_result, trees = Query._parse(toklist)
    if not trees:
        # No parse at all
        self.set_error("E_NO_PARSE_TREES")
        return False
    result.update(parse_result)
    if result["num_sent"] != 1:
        # Queries must be one sentence
        self.set_error("E_MULTIPLE_SENTENCES")
        return False
    if result["num_parsed_sent"] != 1:
        # Unable to parse the single sentence
        self.set_error("E_NO_PARSE")
        return False
    if 1 not in trees:
        # No sentence number 1
        self.set_error("E_NO_FIRST_SENTENCE")
        return False
    # Looks good
    # Store the resulting parsed query as a tree
    tree_string = "S1\n" + trees[1]
    if Settings.DEBUG:
        # BUGFIX: this diagnostic print was previously unconditional,
        # spamming stdout on every query in production; it is now
        # guarded by the debug flag like the other diagnostic output
        print(tree_string)
    self._tree = Tree()
    self._tree.load(tree_string)
    # Store the token list
    self._toklist = toklist
    return True
def recognize_entities(self, stream: Iterator[Tok]) -> Iterator[Tok]:
    """ Recognize named entities using the nertokenizer module,
        but construct tokens using the Correct_TOK class from
        reynir_correct """
    if CI_RUN:
        # Running in a continuous integration environment, where
        # no database is available: pass the stream through untouched
        return stream
    return nertokenizer.recognize_entities(
        stream, token_ctor=reynir_correct.Correct_TOK
    )
def to_tokens(soup, enclosing_session=None):
    """ Convert an HTML soup root into a parsable token stream """
    # Pull the plain-text content out of the HTML tree
    tlist = Fetcher.TextList()
    Fetcher.extract_text(soup, tlist)
    # Tokenize the extracted text and annotate named entities,
    # returning a generator over the resulting tokens
    return recognize_entities(
        tokenize(tlist.result()), enclosing_session=enclosing_session
    )
def query_api(version=1):
    """ Respond to a query string submitted via GET or POST.
        Returns a JSON response with 'valid', 'q' and, for successful
        queries, the query type and response payload. """
    if not (1 <= version <= 1):
        # Only version 1 of the API is currently supported
        return better_jsonify(valid=False, reason="Unsupported version")
    if request.method == "GET":
        q = request.args.get("q", "")
    else:
        q = request.form.get("q", "")
    # Cap the query length to guard against oversized input
    q = q.strip()[0:_MAX_QUERY_LENGTH]
    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = bool_from_request(request, "autouppercase", True)
    result = dict()
    ql = q.lower()
    if ql in _SPECIAL_QUERIES or (ql + "?") in _SPECIAL_QUERIES:
        # Hard-wired special queries bypass tokenization and parsing
        result["valid"] = True
        result["qtype"] = "Special"
        result["q"] = q
        if ql in _SPECIAL_QUERIES:
            result["response"] = _SPECIAL_QUERIES[ql]
        else:
            result["response"] = _SPECIAL_QUERIES[ql + "?"]
    else:
        with SessionContext(commit=True) as session:
            toklist = tokenize(
                q, auto_uppercase=q.islower() if auto_uppercase else False
            )
            toklist = list(recognize_entities(toklist, enclosing_session=session))
            actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
            # if Settings.DEBUG:
            #     # Log the query string as seen by the parser
            #     print("Query is: '{0}'".format(actual_q))
            # Try to parse and process as a query
            try:
                is_query = process_query(session, toklist, result)
            except Exception:
                # BUGFIX: was a bare 'except:', which also swallowed
                # SystemExit/KeyboardInterrupt. A failed query handler is
                # still treated as "not a query" (best-effort behavior kept).
                is_query = False
    result["valid"] = is_query
    result["q"] = actual_q
    return better_jsonify(**result)