def query_which_route(query, session, result):
    """ Which routes stop at a given bus stop """
    stop_name = result.stop_name  # E.g. 'Einarsnes', 'Fiskislóð'...
    if stop_name in {"þar", "þangað"}:
        # The query refers to a bus stop mentioned earlier in the dialogue
        ctx = query.fetch_context()
        if not ctx or "bus_stop" not in ctx:
            # No usable context: we can't resolve the reference
            answer = voice_answer = "Ég veit ekki við hvaða stað þú átt."
            return dict(answer=answer), answer, voice_answer
        stop_name = ctx["bus_stop"]
        result.qkey = stop_name
    bus_noun = result.bus_noun  # E.g. 'strætó', 'vagn', 'leið'...
    stops = straeto.BusStop.named(stop_name, fuzzy=True)
    if not stops:
        # Unknown bus stop name
        plain = [stop_name, "þekkist ekki."]
        spoken = ["Ég", "þekki", "ekki", "biðstöðina", stop_name.capitalize()]
    else:
        if query.location:
            # Prefer the stop closest to the user's location
            straeto.BusStop.sort_by_proximity(stops, query.location)
        stop = stops[0]
        # Collect the route numbers that visit this stop
        route_numbers = {
            straeto.BusRoute.lookup(route_id).number
            for route_id in stop.visits.keys()
        }
        spoken = [bus_noun, "númer"]
        plain = spoken[:]
        ordered = sorted(route_numbers, key=lambda rn: int(rn))
        last = len(ordered) - 1
        for pos, rn in enumerate(ordered):
            if pos:
                # Comma-separate the numbers, with 'og' before the last one
                sep = "og" if pos == last else ","
                spoken.append(sep)
                plain.append(sep)
            # We convert inflectable numbers to their text equivalents
            # since the speech engine can't be relied upon to get the
            # inflection of numbers right
            spoken.append(numbers_to_neutral(rn))
            plain.append(rn)
        tail = ["stoppar á", to_dative(stop.name)]
        spoken += tail
        plain += tail
        # Store a location coordinate and a bus stop name in the context
        query.set_context({"location": stop.location, "bus_stop": stop.name})
    voice_answer = correct_spaces(" ".join(spoken) + ".")
    answer = correct_spaces(" ".join(plain))
    answer = answer[0].upper() + answer[1:]
    return dict(answer=answer), answer, voice_answer
def VillaEndingANA(self, txt: str, variants: str, node: Node) -> AnnotationDict:
    # E.g. 'þingflokkana' should probably be 'þingflokkanna':
    # we need the genitive form of the token in self._tokens[node.start]
    tnode = self._terminal_nodes[node.start]
    suggestion = tnode.genitive_np
    correct_np = correct_spaces(suggestion)
    canonical_np = tnode.canonical_np
    # The two explanations share everything after the first clause
    shared_tail = ("eins og '{0}', eru rituð "
                   "'{1}' með tveimur n-um í eignarfalli fleirtölu, "
                   "ekki '{2}' með einu n-i.")
    if canonical_np.endswith("ar"):
        # This might be something like 'landsteinar' which is only plural
        head = "Karlkyns orð sem enda á '-ar' í nefnifalli fleirtölu, "
        detail = (head + shared_tail).format(canonical_np, correct_np, txt)
    else:
        head = "Karlkyns orð sem enda á '-{3}' í nefnifalli eintölu, "
        detail = (head + shared_tail).format(
            canonical_np, correct_np, txt, canonical_np[-2:])
    return dict(
        text="Á sennilega að vera '{0}'".format(correct_np),
        detail=detail,
        suggestion=suggestion,
    )
def parse_tsv_file(file_handle, reorder=True):
    """ Parse a .tsv file of the format:
            flag, uuid, sentence_index, text, url [, datetime]
        If the number of sentences in text is not 1 (according to the
        tokenizer/parser) then they will be merged naively. """
    parser = Reynir()
    entries = []
    for line in file_handle:
        flags, uuid, idx, text, url, *_ = line.strip().split("\t")[:6]
        # Only export entries whose flag field contains '1'
        if flags and "1" in flags:
            entries.append(
                CorpusEntry(flags=flags, uuid=uuid, text=text, index=idx, url=url)
            )
    if reorder:
        # Process the shortest texts (by word count) first
        entries.sort(key=lambda e: len(e.text.split(" ")))
    for entry in entries:
        res = parser.parse(correct_spaces(entry.text))
        annotrees = [reynir_sentence_to_annotree(s) for s in res["sentences"]]
        # Merge multiple sentences naively by appending the rest
        # of the trees to the first one
        root, *others = annotrees
        for extra in others:
            root.insert(len(root), extra)
        yield CorpusTree(
            id_corpus="{0}.{1}".format(entry.uuid, entry.index),
            tree=root,
            url=entry.url,
        )
def VillaFsMeðFallstjórn(self, txt: str, variants: str, node: Node) -> AnnotationDict:
    # The preposition z should govern case x, not case y
    tnode = self._terminal_nodes[node.start]
    pp = tnode.enclosing_tag("PP")
    # Locate the noun phrase within the prepositional phrase, if any
    subject = None if pp is None else getattr(pp, "NP", None)
    if not subject:
        # No noun phrase found: there's no suggested correction
        return dict(
            text="Forsetningin '{0}' stýrir {1}falli.".format(
                txt.split()[0].lower(), CASE_NAMES[variants]),
        )
    preposition = pp.P.text
    suggestion = preposition + " " + self.cast_to_case(variants, subject)
    correct_np = correct_spaces(suggestion)
    return dict(
        text="Á sennilega að vera '{0}'".format(correct_np),
        detail=("Forsetningin '{0}' stýrir {1}falli.".format(
            preposition.lower(),
            CASE_NAMES[variants],
        )),
        suggestion=suggestion,
    )
def parse(self, result):
    """ Parse the query from its string, returning True if valid.
        On success, stores the parse tree and token list on the
        instance; on failure, records an error code via set_error(). """
    self._tree = None  # Erase previous tree, if any
    self._error = None  # Erase previous error, if any
    self._qtype = None  # Erase previous query type, if any
    self._key = None
    self._toklist = None
    q = self._query.strip()
    if not q:
        self.set_error("E_EMPTY_QUERY")
        return False
    toklist = tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())
    toklist = list(recognize_entities(toklist, enclosing_session=self._session))
    actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
    if actual_q:
        # Capitalize the first letter and terminate with a question mark
        # unless already terminated with punctuation
        actual_q = actual_q[0].upper() + actual_q[1:]
        if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
            actual_q += "?"
    # Update the beautified query string, as the actual_q string
    # probably has more correct capitalization
    self.set_beautified_query(actual_q)
    if Settings.DEBUG:
        # Log the query string as seen by the parser
        print("Query is: '{0}'".format(actual_q))
    parse_result, trees = Query._parse(toklist)
    if not trees:
        # No parse at all
        self.set_error("E_NO_PARSE_TREES")
        return False
    result.update(parse_result)
    if result["num_sent"] != 1:
        # Queries must be one sentence
        self.set_error("E_MULTIPLE_SENTENCES")
        return False
    if result["num_parsed_sent"] != 1:
        # Unable to parse the single sentence
        self.set_error("E_NO_PARSE")
        return False
    if 1 not in trees:
        # No sentence number 1
        self.set_error("E_NO_FIRST_SENTENCE")
        return False
    # Looks good
    # Store the resulting parsed query as a tree
    tree_string = "S1\n" + trees[1]
    if Settings.DEBUG:
        # Fix: this print() was previously unconditional — a debug
        # leftover that polluted stdout in production; now guarded
        # like the query-string log above
        print(tree_string)
    self._tree = Tree()
    self._tree.load(tree_string)
    # Store the token list
    self._toklist = toklist
    return True
def query_which_route(query, session, result):
    """ Which routes stop at a given bus stop """
    stop_name = result.stop_name  # E.g. 'Einarsnes', 'Fiskislóð'...
    bus_noun = result.bus_noun  # E.g. 'strætó', 'vagn', 'leið'...
    stops = straeto.BusStop.named(stop_name, fuzzy=True)
    if not stops:
        # Unknown bus stop name
        plain = [stop_name, "þekkist ekki."]
        spoken = ["Ég", "þekki", "ekki", "biðstöðina", stop_name.capitalize()]
    else:
        if query.location:
            # Prefer the stop closest to the user's location
            straeto.BusStop.sort_by_proximity(stops, query.location)
        stop = stops[0]
        # Collect the route numbers that visit this stop
        route_numbers = {
            straeto.BusRoute.lookup(route_id).number
            for route_id in stop.visits.keys()
        }
        spoken = [bus_noun, "númer"]
        plain = spoken[:]
        ordered = sorted(route_numbers, key=lambda rn: int(rn))
        last = len(ordered) - 1
        for pos, rn in enumerate(ordered):
            if pos:
                # Comma-separate the numbers, with 'og' before the last one
                sep = "og" if pos == last else ","
                spoken.append(sep)
                plain.append(sep)
            # Spell out inflectable numbers for the speech synthesizer,
            # which can't be relied upon to inflect digits correctly
            spoken.append(NUMBERS_NEUTRAL.get(rn, rn))
            plain.append(rn)
        tail = ["stoppar á", to_dative(stop.name)]
        spoken += tail
        plain += tail
    voice_answer = correct_spaces(" ".join(spoken) + ".")
    answer = correct_spaces(" ".join(plain))
    return dict(answer=answer), answer, voice_answer
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    best = dict()  # Maps name -> (title, article_url, article_id, gender)

    def prefer(new_title, old_title):
        """ Return True if new_title should replace old_title """
        len_new, len_old = len(new_title), len(old_title)
        if len_old >= _MAX_TITLE_LENGTH:
            # Too long: we want a shorter one
            return len_new < len_old
        if len_new >= _MAX_TITLE_LENGTH:
            # This one is too long: we don't want it
            return False
        # Otherwise, longer is better
        return len_new > len_old

    with SessionContext(read_only=True) as session:
        # Go through up to 2 * N most recent records
        recents = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[0 : limit * 2]
        )
        with BIN_Db.get_db() as bindb:
            for person in recents:
                # Insert the name into the list if it's not already there,
                # or if the new title is preferable to the previous one
                if person.name in best and not prefer(person.title, best[person.name][0]):
                    continue
                best[person.name] = (
                    correct_spaces(person.title),
                    person.article_url,
                    person.id,
                    bindb.lookup_name_gender(person.name),
                )
                if len(best) >= limit:
                    # We now have as many names as we initially wanted:
                    # terminate the loop
                    break
    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=info[0], gender=info[3], url=info[1], uuid=info[2])
                for name, info in best.items()
            ],
            key=lambda d: strxfrm(d["name"]),
        )
def append_answers(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd """
    for item in q:
        key = correct_spaces(prop_func(item))
        # Add to a dict of UUIDs under the normalized key
        rd[key][item.id] = dict(
            domain=item.domain,
            uuid=item.id,
            heading=item.heading,
            timestamp=item.timestamp,
            ts=item.timestamp.isoformat()[0:16],
            url=item.url,
        )
def annotate_wrong_subject_case(subj_case_abbr, correct_case_abbr):
    """ Create an annotation that describes a verb having a
        subject in the wrong case """
    wrong_case = CASE_NAMES[subj_case_abbr]
    # Retrieve the correct case
    correct_case = CASE_NAMES[correct_case_abbr]
    # Try to recover the verb's subject
    subj = self.find_verb_subject(tnode)
    code = "P_WRONG_CASE_" + subj_case_abbr + "_" + correct_case_abbr
    personal = "persónuleg" if correct_case_abbr == "nf" else "ópersónuleg"
    # Both annotation variants share the same detail text
    detail = ("Sögnin 'að {0}' er {3}. "
              "Frumlag hennar á að vera "
              "í {1}falli í stað {2}falls.".format(
                  verb, correct_case, wrong_case, personal))
    if subj is None:
        # We don't seem to find the subject, so just annotate the verb.
        # In this case, there's no suggested correction.
        index = node.token.index
        self._ann.append(
            Annotation(
                start=index,
                end=index,
                code=code,
                text="Frumlag sagnarinnar 'að {0}' "
                "á að vera í {1}falli".format(verb, correct_case),
                detail=detail,
            ))
        return
    # We know what the subject is: annotate it
    start, end = subj.span
    subj_text = subj.tidy_text
    suggestion = self.cast_to_case(correct_case_abbr, subj)
    correct_np = emulate_case(correct_spaces(suggestion), subj_text)
    # Skip the annotation if it suggests the same text as the
    # original one; this can happen if the word forms for two
    # cases are identical
    if subj_text != correct_np:
        self._ann.append(
            Annotation(
                start=start,
                end=end,
                code=code,
                text="Á líklega að vera '{0}'".format(correct_np),
                detail=detail,
                suggest=suggestion,
            ))
def _query_string_from_toklist(toklist: Iterable[Tok]) -> str:
    """ Re-create a query string from an auto-capitalized token list """
    q = correct_spaces(" ".join(filter(None, (t.txt for t in toklist))))
    if q:
        # Fix stuff that the auto-capitalization tends to get wrong,
        # such as 'í Dag'
        for wrong, right in _CAPITALIZATION_REPLACEMENTS:
            q = q.replace(wrong, right)
        # Capitalize the first letter of the query
        q = q[0].upper() + q[1:]
        # Terminate the query with a question mark,
        # if not otherwise terminated
        if not q.endswith(("?", ".", "!")):
            q += "?"
    return q
def query_api(version=1):
    """ Respond to a query string. Returns a JSON response with the
        query result, or an error indication for unsupported versions. """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")
    if request.method == "GET":
        q = request.args.get("q", "")
    else:
        q = request.form.get("q", "")
    # Cap the query length to guard against oversized input
    q = q.strip()[0:_MAX_QUERY_LENGTH]
    # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON
    auto_uppercase = bool_from_request(request, "autouppercase", True)
    result = dict()
    ql = q.lower()
    if ql in _SPECIAL_QUERIES or (ql + "?") in _SPECIAL_QUERIES:
        # Canned response for special/easter-egg queries
        result["valid"] = True
        result["qtype"] = "Special"
        result["q"] = q
        if ql in _SPECIAL_QUERIES:
            result["response"] = _SPECIAL_QUERIES[ql]
        else:
            result["response"] = _SPECIAL_QUERIES[ql + "?"]
    else:
        with SessionContext(commit=True) as session:
            toklist = tokenize(
                q, auto_uppercase=q.islower() if auto_uppercase else False
            )
            toklist = list(recognize_entities(toklist, enclosing_session=session))
            actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
            # Try to parse and process as a query
            try:
                is_query = process_query(session, toklist, result)
            except Exception:
                # Fix: was a bare 'except:', which would also swallow
                # SystemExit/KeyboardInterrupt; narrowed to Exception
                is_query = False
            result["valid"] = is_query
            result["q"] = actual_q
    return better_jsonify(**result)
def VillaEndingIR(self, txt: str, variants: str, node: Node) -> AnnotationDict:
    # E.g. 'læknirinn' should probably be 'lækninn':
    # we need the accusative form of the token in self._tokens[node.start]
    tnode = self._terminal_nodes[node.start]
    suggestion = tnode.accusative_np
    correct_np = correct_spaces(suggestion)
    # Mention the definite article in the explanation when applicable
    if "gr" in tnode.all_variants:
        article = " með greini"
    else:
        article = ""
    detail = ("Karlkyns orð sem enda á '-ir' í nefnifalli eintölu, "
              "eins og '{0}', eru rituð "
              "'{1}' í þolfalli{2}.".format(tnode.canonical_np, correct_np, article))
    return dict(
        text="Á sennilega að vera '{0}'".format(correct_np),
        detail=detail,
        suggestion=suggestion,
    )
def append_names(rd, q, prop_func):
    """ Iterate over query results and add them to the result dictionary rd,
        assuming that the key is a person name """
    for item in q:
        raw_key = correct_spaces(prop_func(item))
        info = dict(
            domain=item.domain,
            uuid=item.id,
            heading=item.heading,
            timestamp=item.timestamp,
            ts=item.timestamp.isoformat()[0:16],
            url=item.url,
        )
        # Obtain the key within rd that should be updated with new
        # data. This may be an existing key, a new key or None if no
        # update is to be performed.
        key = name_key_to_update(rd, raw_key)
        if key is not None:
            rd[key][item.id] = info  # Add to a dict of UUIDs
def query_person_title(session, name):
    """ Return the most likely title for a person """

    def we_dont_like(answer):
        """ Return False if we don't like this title and would
            prefer another one """
        # Skip titles that simply say that somebody is the husband or
        # wife of somebody else
        return answer.startswith(_DONT_LIKE_TITLE)

    titles = _query_person_titles(session, name)
    if not titles:
        # No titles at all for this person
        return "", None
    # Pick the first title we like; if we don't like any of them,
    # fall back on the topmost one
    pick = next(
        (ix for ix, t in enumerate(titles) if not we_dont_like(t["answer"])),
        0,
    )
    return (
        correct_spaces(titles[pick]["answer"]),
        titles[pick]["sources"][0]["domain"],
    )
def correct_text(self, text: StringIterable, *, only_rare: bool = False) -> str:
    """Attempt to correct all words within a text, returning the corrected
    text. If only_rare is True, correction is only attempted on rare
    words."""
    corrected: List[str] = []
    # Negative slice offset giving the trailing context window
    context_start = 1 - MAX_ORDER
    for tok in tokenize(text):
        if tok.kind == TOK.WORD:
            if only_rare and not self.is_rare(tok.txt):
                # Common word: leave it untouched
                corrected.append(tok.txt)
            else:
                # Correct the word, using the preceding output as context
                corrected.append(
                    self.correct(tok.txt, context=tuple(corrected[context_start:]))
                )
        elif tok.txt:
            corrected.append(tok.txt)
        elif tok.kind in {TOK.S_BEGIN, TOK.S_END}:
            # Mark sentence boundaries with an empty string
            corrected.append("")
    return correct_spaces(" ".join(corrected))
def _node_text(self, node: Node, original_case: bool = False) -> str:
    """ Return the text within the span of the node """

    def lowercased(tok):
        """ If the token is a word token, return a lower case version of
            its text, unless we have a reason to keep the original case,
            i.e. if it is a lemma that is upper case in BÍN """
        if tok.kind != TOK.WORD:
            # Not a word token: keep the original text
            return tok.txt
        if len(tok.txt) > 1 and tok.txt.isupper():
            # All uppercase: keep it that way
            return tok.txt
        if tok.val and any(m.stofn[0].isupper() for m in tok.val):
            # There is an uppercase lemma for this word in BÍN:
            # keep the original form
            return tok.txt
        # No uppercase lemma in BÍN: return a lower case copy
        return tok.txt.lower()

    first, last = self._node_span(node)
    span = self._tokens[first:last + 1]
    if original_case:
        parts = [t.txt for t in span if t.txt]
    else:
        parts = [lowercased(t) for t in span if t.txt]
    return correct_spaces(" ".join(parts))
def query_entity_def(session, name):
    """ Return a single (best) definition of an entity """
    defs = _query_entity_definitions(session, name)
    if not defs:
        # No definition found
        return ""
    return correct_spaces(defs[0]["answer"])
def query_person_title(session, name):
    """ Return the most likely title for a person """
    titles = _query_person_titles(session, name)
    if not titles:
        # No title found for this person
        return ""
    return correct_spaces(titles[0]["answer"])
def annotate(self, sent: Sentence) -> List[Annotation]:
    """Returns a list of annotations for a sentence object, containing
    spelling and grammar annotations of that sentence"""
    ann: List[Annotation] = []
    # True if a deep parse of the sentence succeeded
    parsed = sent.deep_tree is not None
    # Create a mapping from token indices to terminal indices.
    # This is necessary because not all tokens are included in
    # the token list that is passed to the parser, and therefore
    # the terminal-token matches can be fewer than the original tokens.
    token_to_terminal: Dict[int, int] = {}
    if parsed:
        token_to_terminal = {
            tnode.index: ix
            for ix, tnode in enumerate(sent.terminal_nodes)
            if tnode.index is not None
        }
    grammar = self.parser.grammar
    # First, add token-level annotations and count words that occur in BÍN
    words_in_bin = 0
    words_not_in_bin = 0
    for ix, t in enumerate(sent.tokens):
        if t.kind == TOK.WORD:
            if t.has_meanings:
                # The word has at least one meaning
                words_in_bin += 1
            else:
                # The word has no recognized meaning
                words_not_in_bin += 1
        elif t.kind == TOK.PERSON:
            # Person names count as recognized words
            words_in_bin += 1
        elif t.kind == TOK.ENTITY:
            # Entity names do not count as recognized words;
            # we count each enclosed word in the entity name
            words_not_in_bin += t.txt.count(" ") + 1
        # Note: these tokens and indices are the original tokens from
        # the submitted text, including ones that are not understood
        # by the parser, such as quotation marks and exotic punctuation
        annotate = False
        if getattr(t, "error_code", None):
            # This is a CorrectToken instance (or a duck typing equivalent)
            assert isinstance(t, CorrectToken)  # Satisfy Mypy
            annotate = True
            if parsed and ix in token_to_terminal:
                # For the call to suggestion_does_not_match(), we need a
                # BIN_Token instance, which we obtain in a bit of a hacky
                # way by creating it on the fly
                bin_token = BIN_Parser.wrap_token(t, ix)
                # Obtain the original BIN_Terminal instance from the grammar
                terminal_index = token_to_terminal[ix]
                terminal_node = sent.terminal_nodes[terminal_index]
                original_terminal = terminal_node.original_terminal
                if original_terminal not in grammar.terminals:
                    # At least one case, finna→Finna, gets the terminal "person_kvk"
                    # which isn't found in grammar.terminals!
                    # Skip the token entirely in that case
                    annotate = False
                    continue
                assert original_terminal is not None
                terminal = grammar.terminals[original_terminal]
                assert isinstance(terminal, VariantHandler)
                if t.suggestion_does_not_match(terminal, bin_token):
                    # If this token is annotated with a spelling suggestion,
                    # do not add it unless it works grammatically
                    annotate = False
        if annotate:
            # Emit the token-level annotation carried by the CorrectToken
            a = Annotation(
                start=ix,
                end=ix + t.error_span - 1,
                code=t.error_code,
                text=t.error_description,
                detail=t.error_detail,
                references=t.error_references,
                original=t.error_original,
                suggest=t.error_suggest,
            )
            ann.append(a)
    # Then, look at the whole sentence
    num_words = words_in_bin + words_not_in_bin
    if (num_words > 2
            and words_in_bin / num_words < ICELANDIC_RATIO
            and "E004" not in self._ignore_rules):
        # The sentence contains less than 50% Icelandic
        # words: assume it's in a foreign language and discard the
        # token level annotations
        ann = [
            # E004: The sentence is probably not in Icelandic
            Annotation(
                start=0,
                end=len(sent.tokens) - 1,
                code="E004",
                text="Málsgreinin er sennilega ekki á íslensku",
                detail=
                "{0:.0f}% orða í henni finnast ekki í íslenskri orðabók".
                format(words_not_in_bin / num_words * 100.0),
            )
        ]
    elif not parsed:
        if self._annotate_unparsed_sentences and "E001" not in self._ignore_rules:
            # If the sentence couldn't be parsed,
            # put an annotation on it as a whole.
            # In this case, we keep the token-level annotations.
            err_index = sent.err_index or 0
            # Quote a small window of tokens around the error location
            start = max(0, err_index - 1)
            end = min(len(sent.tokens), err_index + 2)
            toktext = correct_spaces(" ".join(
                t.txt for t in sent.tokens[start:end] if t.txt))
            ann.append(
                # E001: Unable to parse sentence
                Annotation(
                    start=0,
                    end=len(sent.tokens) - 1,
                    code="E001",
                    text="Málsgreinin fellur ekki að reglum",
                    detail="Þáttun brást í kringum {0}. tóka ('{1}')".
                    format(err_index + 1, toktext),
                ))
    else:
        # Successfully parsed:
        # Add annotations for error-marked nonterminals from the grammar
        # found in the parse tree
        ErrorFinder(ann, sent).run()
        # Run the pattern matcher on the sentence,
        # annotating questionable patterns
        PatternMatcher(ann, sent).run()
    # Sort the annotations by their start token index,
    # and then by decreasing span length
    ann.sort(key=lambda a: (a.start, -a.end))
    # Eliminate duplicates, i.e. identical annotation
    # codes for identical spans
    i = 1
    while i < len(ann):
        a, prev = ann[i], ann[i - 1]
        if a.code == prev.code and a.start == prev.start and a.end == prev.end:
            # Identical annotation: remove it from the list
            del ann[i]
        else:
            # Check the next pair
            i += 1
    # Remove ignored annotations
    ann = [a for a in ann if a.code not in self._ignore_rules]
    return ann
def annotate(sent: _Sentence) -> List[Annotation]:
    """ Returns a list of annotations for a sentence object, containing
        spelling and grammar annotations of that sentence """
    ann: List[Annotation] = []
    # Counters used to estimate whether the sentence is in Icelandic
    words_in_bin = 0
    words_not_in_bin = 0
    # First, add token-level annotations
    for ix, t in enumerate(sent.tokens):
        if t.kind == TOK.WORD:
            if t.val:
                # The word has at least one meaning
                words_in_bin += 1
            else:
                # The word has no recognized meaning
                words_not_in_bin += 1
        elif t.kind == TOK.PERSON:
            # Person names count as recognized words
            words_in_bin += 1
        # Note: these tokens and indices are the original tokens from
        # the submitted text, including ones that are not understood
        # by the parser, such as quotation marks and exotic punctuation
        if hasattr(t, "error_code"):
            # This is a CorrectToken instance (or a duck typing equivalent)
            assert isinstance(t, CorrectToken)
            if t.error_code:
                ann.append(
                    Annotation(
                        start=ix,
                        end=ix + t.error_span - 1,
                        code=t.error_code,
                        text=t.error_description,
                    ))
    # Then, look at the whole sentence
    num_words = words_in_bin + words_not_in_bin
    if num_words > 2 and words_in_bin / num_words < ICELANDIC_RATIO:
        # The sentence contains less than 50% Icelandic
        # words: assume it's in a foreign language and discard the
        # token level annotations
        ann = [
            # E004: The sentence is probably not in Icelandic
            Annotation(
                start=0,
                end=len(sent.tokens) - 1,
                code="E004",
                text="Málsgreinin er sennilega ekki á íslensku",
                detail=
                "{0:.0f}% orða í henni finnast ekki í íslenskri orðabók".
                format(words_not_in_bin / num_words * 100.0))
        ]
    elif sent.deep_tree is None:
        # If the sentence couldn't be parsed,
        # put an annotation on it as a whole.
        # In this case, we keep the token-level annotations.
        err_index = sent.err_index or 0
        # Quote a small window of tokens around the error location
        start = max(0, err_index - 1)
        end = min(len(sent.tokens), err_index + 2)
        toktext = correct_spaces(" ".join(t.txt
                                          for t in sent.tokens[start:end]
                                          if t.txt))
        ann.append(
            # E001: Unable to parse sentence
            Annotation(
                start=0,
                end=len(sent.tokens) - 1,
                code="E001",
                text="Málsgreinin fellur ekki að reglum",
                detail="Þáttun brást í kring um {0}. tóka ('{1}')".format(
                    err_index + 1, toktext)))
    else:
        # Successfully parsed:
        # Add annotations for error-marked nonterminals from the grammar
        # found in the parse tree
        ErrorFinder(ann, sent).go()
        # Run the pattern matcher on the sentence,
        # annotating questionable patterns
        PatternMatcher(ann, sent).go()
    # Sort the annotations by their start token index,
    # and then by decreasing span length
    ann.sort(key=lambda a: (a.start, -a.end))
    return ann