def _process_text(parser, session, text, all_names, xform):
    """ Low-level utility function to parse text and return the result of
        a transformation function (xform) for each sentence.
        Set all_names = True to get a comprehensive name register.
        Set all_names = False to get a simple name register.
        Set all_names = None to get no name register. """
    t0 = time.time()
    # Demarcate paragraphs in the input
    text = mark_paragraphs(text)
    # Tokenize the result
    token_stream = tokenize(text)
    toklist = list(recognize_entities(token_stream, enclosing_session=session))
    t1 = time.time()
    pgs, stats = TreeUtility._process_toklist(parser, session, toklist, xform)
    if all_names is None:
        register = None
    else:
        from queries.builtin import create_name_register

        register = create_name_register(toklist, session, all_names=all_names)
    t2 = time.time()
    stats["tok_time"] = t1 - t0
    stats["parse_time"] = t2 - t1
    stats["total_time"] = t2 - t0
    return (pgs, stats, register)

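# A minimal, self-contained sketch of the tokenization step used above.
# It relies only on the public 'tokenizer' package, which exports the
# mark_paragraphs() and tokenize() functions called in _process_text();
# the parser, database session and entity recognition are Greynir
# internals and are left out here.
from tokenizer import mark_paragraphs, tokenize, TOK

def count_sentences(text):
    """ Count the sentences that the tokenizer finds in the given text """
    text = mark_paragraphs(text)  # Demarcate paragraphs, as above
    return sum(1 for t in tokenize(text) if t.kind == TOK.S_BEGIN)
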
def make_tree(text: str) -> Tree:
    """ Tokenize and parse text, returning a Tree object built from
        the parse trees of all sentences """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tree = Tree()
    tree.load(tree_string)
    return tree

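# Illustrative only: the shape of the tree_string assembled above, shown
# for a single made-up sentence entry. "S" carries the 1-based sentence
# index, "C" the tree score and "L" the token count; the tree dump itself
# (abbreviated here) comes from ParseForestDumper.
_entry = "\n".join(["C24", "L7", "<parse forest dump>"])
assert "S{0}\n{1}\n".format(1, _entry) == "S1\nC24\nL7\n<parse forest dump>\n"
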
def parse(self, result):
    """ Parse the query from its string, returning True if valid """
    self._tree = None  # Erase previous tree, if any
    self._error = None  # Erase previous error, if any
    self._qtype = None  # Erase previous query type, if any
    self._key = None
    self._toklist = None
    q = self._query.strip()
    if not q:
        self.set_error("E_EMPTY_QUERY")
        return False
    toklist = tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())
    toklist = list(recognize_entities(toklist, enclosing_session=self._session))
    actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
    if actual_q:
        # Upper-case the first character of the query
        actual_q = actual_q[0].upper() + actual_q[1:]
        # Terminate the query with a question mark if not already terminated
        if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
            actual_q += "?"
    # Update the beautified query string, as the actual_q string
    # probably has more correct capitalization
    self.set_beautified_query(actual_q)
    if Settings.DEBUG:
        # Log the query string as seen by the parser
        print("Query is: '{0}'".format(actual_q))
    parse_result, trees = Query._parse(toklist)
    if not trees:
        # No parse at all
        self.set_error("E_NO_PARSE_TREES")
        return False
    result.update(parse_result)
    if result["num_sent"] != 1:
        # Queries must be one sentence
        self.set_error("E_MULTIPLE_SENTENCES")
        return False
    if result["num_parsed_sent"] != 1:
        # Unable to parse the single sentence
        self.set_error("E_NO_PARSE")
        return False
    if 1 not in trees:
        # No sentence number 1
        self.set_error("E_NO_FIRST_SENTENCE")
        return False
    # Looks good
    # Store the resulting parsed query as a tree
    tree_string = "S1\n" + trees[1]
    if Settings.DEBUG:
        print(tree_string)
    self._tree = Tree()
    self._tree.load(tree_string)
    # Store the token list
    self._toklist = toklist
    return True

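# The query normalization step above, in isolation: upper-case the first
# character and make sure the string ends with a terminator. Pure Python,
# no Greynir dependencies.
def normalize_query(q):
    if q:
        q = q[0].upper() + q[1:]
        if not any(q.endswith(s) for s in ("?", ".", "!")):
            q += "?"
    return q

assert normalize_query("hver er forseti Íslands") == "Hver er forseti Íslands?"
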
def gen_tuples(self) -> Iterable[LemmaTuple]:
    """ Generate (lemma, cat) tuples from the document by tokenizing it
        into paragraphs and sentences, then lemmatizing each sentence """
    tokens = tokenize(self._text)
    for pg in paragraphs(tokens):
        for _, sent in pg:
            yield from self.lemmatize(sent)

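# Self-contained sketch of the paragraph/sentence iteration pattern used
# above, with a trivial word counter standing in for self.lemmatize().
# Both paragraphs() and tokenize() come from the public 'tokenizer'
# package; each paragraph yields (index, sentence-token-list) pairs.
from tokenizer import paragraphs, tokenize, TOK

def word_counts(text):
    """ Yield the number of word tokens in each sentence of the text """
    tokens = tokenize(text)
    for pg in paragraphs(tokens):
        for _, sent in pg:
            yield sum(1 for t in sent if t.kind == TOK.WORD)
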
def to_tokens(soup, enclosing_session=None):
    """ Convert an HTML soup root into a parsable token stream """
    # Extract the text content of the HTML into a list
    tlist = Fetcher.TextList()
    Fetcher.extract_text(soup, tlist)
    text = tlist.result()
    # Tokenize the resulting text, returning a generator
    token_stream = tokenize(text)
    return recognize_entities(token_stream, enclosing_session=enclosing_session)

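# A rough standalone approximation of the same idea, assuming
# BeautifulSoup is installed. Fetcher.extract_text() performs smarter
# block-level extraction than get_text(), and entity recognition is
# omitted here, so this is only a sketch:
from bs4 import BeautifulSoup
from tokenizer import tokenize

def html_to_tokens(html):
    """ Tokenize the visible text content of an HTML document """
    text = BeautifulSoup(html, "html.parser").get_text(separator=" ")
    return tokenize(text)  # A lazy token generator
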
def query_api(version=1):
    """ Respond to a query string """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")
    if request.method == "GET":
        q = request.args.get("q", "")
    else:
        q = request.form.get("q", "")
    q = q.strip()[0:_MAX_QUERY_LENGTH]
    # Auto-uppercasing can be turned off by sending
    # autouppercase: false in the query JSON
    auto_uppercase = bool_from_request(request, "autouppercase", True)
    result = dict()
    ql = q.lower()
    if ql in _SPECIAL_QUERIES or (ql + "?") in _SPECIAL_QUERIES:
        result["valid"] = True
        result["qtype"] = "Special"
        result["q"] = q
        if ql in _SPECIAL_QUERIES:
            result["response"] = _SPECIAL_QUERIES[ql]
        else:
            result["response"] = _SPECIAL_QUERIES[ql + "?"]
    else:
        with SessionContext(commit=True) as session:
            toklist = tokenize(
                q, auto_uppercase=q.islower() if auto_uppercase else False
            )
            toklist = list(recognize_entities(toklist, enclosing_session=session))
            actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
            # if Settings.DEBUG:
            #     # Log the query string as seen by the parser
            #     print("Query is: '{0}'".format(actual_q))
            # Try to parse and process as a query
            try:
                is_query = process_query(session, toklist, result)
            except:
                is_query = False
        result["valid"] = is_query
        result["q"] = actual_q
    return better_jsonify(**result)

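# Hypothetical exercise of this endpoint via Flask's test client. The
# route path and the 'app' object are assumptions; adjust them to the
# actual URL map of the application:
#
#   with app.test_client() as client:
#       resp = client.get("/query.api?q=hver er forseti Íslands")
#       data = resp.get_json()
#       assert "valid" in data and "q" in data
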
def _make_tree(text: str) -> Tuple[Tree, str]:
    """ Tokenize and parse text, create a tree representation string
        from all the parse trees, and return a Tree object along with
        a JSON dump of the tokens """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)
    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json

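# Why separators=(",", ":") above: it strips the spaces that json.dumps
# inserts by default, yielding the most compact encoding of the token
# dicts.
import json
assert json.dumps({"a": [1, 2]}) == '{"a": [1, 2]}'
assert json.dumps({"a": [1, 2]}, separators=(",", ":")) == '{"a":[1,2]}'
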
def correct_text(self, text: StringIterable, *, only_rare: bool = False) -> str:
    """ Attempt to correct all words within a text, returning the
        corrected text. If only_rare is True, correction is only
        attempted on rare words. """
    result: List[str] = []
    look_back = -MAX_ORDER + 1
    for token in tokenize(text):
        if token.kind == TOK.WORD:
            if only_rare and not self.is_rare(token.txt):
                # The word is not rare, so we don't attempt correction
                result.append(token.txt)
            else:
                # Correct the word and append the result
                result.append(
                    self.correct(token.txt, context=tuple(result[look_back:]))
                )
        elif token.txt:
            result.append(token.txt)
        elif token.kind in {TOK.S_BEGIN, TOK.S_END}:
            result.append("")
    return correct_spaces(" ".join(result))

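# Self-contained sketch of the join/correct_spaces round-trip used above,
# with the spelling-correction model left out (tokenizer package only):
from tokenizer import correct_spaces, tokenize

def normalize_whitespace(text):
    """ Tokenize and re-join a text, normalizing its spacing """
    return correct_spaces(" ".join(t.txt for t in tokenize(text) if t.txt))
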
def test_entities():
    text = """
       Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota.
       Danska byggingavörukeðjan Bygma hefur keypt íslenska verslunarfyrirtækið Húsasmiðjuna.
       Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs eru hluthafar í Arion banka.
       Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra.
       Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær.
       Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær.
       Primera Air var íslenskt flugfélag.
       Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag.
       Villeneuve-Loubet er franskt þorp.
       Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda.
       Í miðbæ Reykjavíkur er herrafataverslunin Geysir.
       Mér er sagt að Geysir sé hættur að gjósa.
       Geysir er hættur að gjósa.
       Geysir er gamall goshver.
       Fyrirtækið Apple-búðin selur Apple Mac tölvur.
       Fyrirtækið Origo selur IBM tölvur.
       Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse"
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tree = Tree()
    tree.load(tree_string)
    session = SessionShim()
    tree.process(session, entities)
    session.check(("Bygma", "er", "dönsk byggingavörukeðja"))
    session.check(("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"))
    session.check(("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Primera Air", "var", "íslenskt flugfélag"))
    session.check(("Villeneuve-Loubet", "er", "franskt þorp"))
    session.check(("Valdís", "er", "ísbúð"))
    session.check(("Fosshótel", "var", "rekin með tapi"))
    session.check(("Fosshótel", "er", "stór hótelkeðja"))
    session.check(("Norðurál", "er", "álverksmiðjan í Hvalfirði"))
    session.check(("Lax", "er", "stór fiskur af ætt laxfiska"))
    session.check(("Geysir", "er", "gamall goshver"))
    session.check(("Eimskipafélag Íslands hf", "er", "skipafélag"))
    session.check(("Origo", "er", "fyrirtæki"))
    session.check(("AirBerlin", "er", "flugfélag"))
    assert session.is_empty()

def tag(self, toklist_or_text):
    """ Assign IFD tags to the given toklist, putting the tag in the "i"
        field of each non-punctuation token. If a string is passed,
        tokenize it first. Return the toklist so modified. """
    if isinstance(toklist_or_text, str):
        toklist = list(tokenize(toklist_or_text))
    else:
        toklist = list(toklist_or_text)
    tagsets = []
    for t in toklist:
        if not t.txt:
            continue
        taglist = self.tag_single_token(t)
        if taglist:
            # display = " | ".join("{0} {1:.2f}".format(w, p) for w, p in taglist)
            # print("{0:20}: {1}".format(t.txt, display))
            tagsets.append(taglist)
    _, tags = self._most_likely(tagsets)
    if not tags:
        return []

    def gen_tokens():
        """ Generate a Greynir token sequence from a tagging result """
        ix = 0
        for t in toklist:
            if not t.txt:
                continue
            # The code below should correspond to TreeUtility._describe_token()
            d = dict(x=t.txt)
            if t.kind == TOK.WORD:
                # set d["m"] to the meaning
                pass
            else:
                d["k"] = t.kind
            if t.val is not None and t.kind not in {
                TOK.WORD,
                TOK.ENTITY,
                TOK.PUNCTUATION,
            }:
                # For tokens except words, entities and punctuation,
                # include the val field
                if t.kind == TOK.PERSON:
                    d["v"], d["g"] = TreeUtility.choose_full_name(
                        t.val, case=None, gender=None
                    )
                else:
                    d["v"] = t.val
            if t.kind in {
                TOK.WORD,
                TOK.ENTITY,
                TOK.PERSON,
                TOK.NUMBER,
                TOK.YEAR,
                TOK.ORDINAL,
                TOK.PERCENT,
            }:
                d["i"] = tags[ix]
                ix += 1
            if t.kind == TOK.WORD and " " in d["x"]:
                # Some kind of phrase: split it
                xlist = d["x"].split()
                for x in xlist:
                    d["x"] = x
                    if x == "og":
                        # Probably an intermediate conjunction:
                        # fjármála- og efnahagsráðherra
                        yield dict(x="og", i="c")
                    else:
                        yield d.copy()
            elif t.kind == TOK.PERSON:
                # Split person tokens into subtokens for each name component
                xlist = d["x"].split()  # Name as it originally appeared
                slist = d["v"].split()  # Stem (nominal) form of name
                # xlist may be shorter than slist, but that is OK
                for x, s in zip(xlist, slist):
                    d["x"] = x
                    d["v"] = s
                    yield d.copy()
            elif t.kind == TOK.ENTITY:
                # Split entity tokens into subtokens for each name component
                xlist = d["x"].split()  # Name as it originally appeared
                for x in xlist:
                    d["x"] = x
                    yield d.copy()
            # !!! TBD: Tokens such as dates, amounts and currencies
            # !!! should be split here into multiple subtokens
            else:
                yield d

    return list(gen_tokens())

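# Illustrative shape of the dicts produced by tag(). The "i" field holds
# the IFD tag, while "k", "v" and "g" appear only for some token kinds;
# the concrete values below are made up for illustration:
#
#   {"x": "Jón", "k": TOK.PERSON, "v": "Jón", "g": "kk", "i": "nken-m"}
#   {"x": "borðar", "i": "sfg3en"}
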
def parse(self, result: ResponseDict) -> bool:
    """ Parse the query from its string, returning True if valid """
    self._tree = None  # Erase previous tree, if any
    self._error = None  # Erase previous error, if any
    self._qtype = None  # Erase previous query type, if any
    self._key = None
    self._toklist = None
    q = self._query
    if not q:
        self.set_error("E_EMPTY_QUERY")
        return False
    # Tokenize and auto-capitalize the query string
    toklist = list(
        tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())
    )
    actual_q = self._query_string_from_toklist(toklist)
    # Update the beautified query string, as the actual_q string
    # probably has more correct capitalization
    self.set_beautified_query(actual_q)
    # TODO: We might want to re-tokenize the actual_q string with
    # auto_uppercase=False, since we may have fixed capitalization
    # errors in _query_string_from_toklist()
    if Settings.DEBUG:
        # Log the query string as seen by the parser
        print("Query is: '{0}'".format(actual_q))
    try:
        parse_result, trees = Query._parse(toklist)
    except ParseError:
        self.set_error("E_PARSE_ERROR")
        return False
    if not trees:
        # No parse at all
        self.set_error("E_NO_PARSE_TREES")
        return False
    result.update(parse_result)
    if result["num_sent"] != 1:
        # Queries must be one sentence
        self.set_error("E_MULTIPLE_SENTENCES")
        return False
    if result["num_parsed_sent"] != 1:
        # Unable to parse the single sentence
        self.set_error("E_NO_PARSE")
        return False
    if 1 not in trees:
        # No sentence number 1
        self.set_error("E_NO_FIRST_SENTENCE")
        return False
    # Looks good: store the resulting parsed query as a tree
    tree_string = "S1\n" + trees[1]
    if Settings.DEBUG:
        print(tree_string)
    self._tree = QueryTree()
    self._tree.load(tree_string)
    # Store the token list
    self._toklist = toklist
    return True
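
# Hypothetical driver, assuming a Query instance constructed elsewhere;
# parse() reports success through its return value and failure reasons
# via set_error():
#
#   result: ResponseDict = dict()
#   if query.parse(result):
#       ...  # query._tree now holds the parsed query tree
#   else:
#       ...  # an error code such as "E_MULTIPLE_SENTENCES" was set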