def try_to_help(cls, query, result):
    """ Attempt to help the user in the case of a failed query,
        based on lemmas in the query string """
    # Gather the set of lemmas (word stems) occurring in the query string
    found_lemmas = set()
    with BIN_Db.get_db() as db:
        for word in query.lower().split():
            if not word.isalpha():
                continue
            meanings = db.meanings(word)
            if not meanings:
                # Try an uppercase version, just in case (pun intended)
                meanings = db.meanings(word.capitalize())
            if meanings:
                found_lemmas.update(m.stofn.lower() for m in meanings)
    # Assemble (lemma, function) pairs for every registered help text
    # function that matches one of the lemmas in the query
    candidates = [
        (lemma, func)
        for lemma in found_lemmas
        for func in cls._help_texts.get(lemma, [])
    ]
    if candidates:
        # Found at least one help text func matching a lemma in the query:
        # pick one at random and invoke it with the matched lemma
        lemma, func = random.choice(candidates)
        result["answer"] = result["voice"] = func(lemma)
        result["valid"] = True
def handle_plain_text(q):
    """ Handle queries of the form "ég heiti X": respond with a
        gender-tailored greeting. Returns True if the query was
        handled, False otherwise. """
    ql = q.query_lower.rstrip("?")
    # Find the first name-introduction regex that matches the query.
    # Fix: initialize m so it is bound even if the regex list is empty
    # (previously a NameError would have been raised in that case).
    m = None
    for rx in _MY_NAME_IS_REGEXES:
        m = re.search(rx, ql)
        if m:
            break
    if not m:
        return False
    name = m.group(1).strip()
    if not name:
        return False
    # "ég heiti ekki X" is not an introduction
    if name.startswith("ekki "):
        return False
    with BIN_Db.get_db() as bdb:
        # Use the first given name, title-cased, to look up grammatical gender
        fn = name.split()[0].title()
        gender = bdb.lookup_name_gender(fn)
    a = _RESPONSES[gender].format(fn)
    # The voice version of the answer has commas stripped out
    voice = a.replace(",", "")
    q.set_answer(dict(answer=a), a, voice)
    q.set_qtype(_INTRO_QTYPE)
    return True
def _capital_query(country, q):
    """ Generate answer to question concerning a country capital. """
    # Resolve the country name to an ISO country code
    cc = isocode_for_country_name(country)
    if not cc:
        logging.warning("No CC for country {0}".format(country))
        return False
    # Resolve the country code to the capital city
    capital = capital_for_cc(cc)
    if not capital:
        return False
    # Use the Icelandic name for the city
    ice_cname = icelandic_city_name(capital["name_ascii"])
    # Look up genitive country name for the voice description
    gen_meanings = BIN_Db().lookup_genitive(country, cat="no")
    country_gen = gen_meanings[0].ordmynd if gen_meanings else country
    answer = ice_cname
    voice = "Höfuðborg {0} er {1}".format(country_gen, answer)
    q.set_answer(dict(answer=answer), answer, voice)
    q.set_key("Höfuðborg {0}".format(country_gen))
    q.set_context(dict(subject=ice_cname))
    return True
def dump_tokens(limit):
    """ Iterate through parsed articles and print a list of
        tokens and their matched terminals.

        limit: maximum number of articles to process, or None for all. """
    # Cache of TerminalDescriptor instances, keyed by terminal name
    dtd = dict()
    with BIN_Db.get_db() as db, SessionContext(commit=True) as session:
        # Iterate through the articles
        q = (session.query(Article).filter(Article.tree != None).order_by(
            Article.timestamp))
        if limit is None:
            q = q.all()
        else:
            q = q[0:limit]
        for a in q:
            print(
                "\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}"
                .format(a))
            tree = TreeTokenList()
            tree.load(a.tree)
            for ix, toklist in tree.token_lists():
                print("\nSentence {0}:".format(ix))
                # True only for the first word token of each sentence
                at_start = True
                for t in toklist:
                    if t.tokentype == "WORD":
                        # Strip the first and last character of the token
                        # (presumably enclosing quote characters — TODO confirm)
                        wrd = t.token[1:-1]
                        td = dtd.get(t.terminal)
                        if td is None:
                            # Not seen before: create and cache a descriptor
                            td = TerminalDescriptor(t.terminal)
                            dtd[t.terminal] = td
                        stem = td.stem(db, wrd, at_start)
                        at_start = False
                        print("   {0} {1} {2}".format(wrd, stem, t.terminal))
                    else:
                        print("   {0.token} {0.cat} {0.terminal}".format(t))
def _addr2nom(address): """ Convert location name to nominative form """ # TODO: Implement more intelligently # This is a tad simplistic and mucks up some things, # e.g. "Ráðhús Reykjavíkur" becomes "Ráðhús Reykjavík" words = address.split() nf = [] for w in words: bin_res = BIN_Db().lookup_nominative(w) if not bin_res and not w.islower(): # Try lowercase form bin_res = BIN_Db().lookup_nominative(w.lower()) if bin_res: nf.append(bin_res[0].ordmynd) else: nf.append(w) return " ".join(nf)
def to_dative(np, *, meaning_filter_func=None):
    """ Return the noun phrase after casting it from nominative to dative case """
    with BIN_Db.get_db() as db:
        # Delegate to the generic case-casting helper, using the
        # database's dative caster
        result = _to_case(
            np,
            db.lookup_word,
            db.cast_to_dative,
            meaning_filter_func=meaning_filter_func,
        )
    return result
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news.

        Each entry is a dict with name, title, gender, url and uuid keys,
        sorted alphabetically by name using the current locale collation. """
    # Maps person name -> (title, article_url, article_id, gender)
    toplist = dict()
    with SessionContext(read_only=True) as session:
        q = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[
                0 : limit * 2
            ]  # Go through up to 2 * N records
        )

        def is_better_title(new_title, old_title):
            """ Return True if new_title should replace old_title:
                longer titles are preferred, but titles at or above
                _MAX_TITLE_LENGTH are avoided """
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= _MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= _MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is longer than the previous one
                if p.name not in toplist or is_better_title(
                    p.title, toplist[p.name][0]
                ):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted:
                        # terminate the loop
                        break
    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
def to_accusative(np: str, *, meaning_filter_func: MeaningFilterFunc = None) -> str:
    """ Return the noun phrase after casting it from nominative to accusative case """
    with BIN_Db.get_db() as db:
        # Delegate to the generic case-casting helper, using the
        # database's accusative caster
        cast = _to_case(
            np,
            db.lookup_word,
            db.cast_to_accusative,
            meaning_filter_func=meaning_filter_func,
        )
    return cast
def nom2dat(w):
    """ Look up the dative form of a noun in BÍN. """
    if not w:
        return ""

    def sort_by_preference(m_list):
        """ Discourage rarer declension forms, i.e. ÞGF2 and ÞGF3 """
        return sorted(m_list, key=lambda m: "2" in m.beyging or "3" in m.beyging)

    # Fix: call get_db() on the class itself, as done consistently
    # elsewhere in the codebase, instead of instantiating a throwaway
    # BIN_Db object first
    with BIN_Db.get_db() as db:
        return db.cast_to_dative(w, meaning_filter_func=sort_by_preference)
def tagset(self, word, at_sentence_start=False):
    """ Return a list of (probability, tag) tuples for the given word.

        word: a sequence whose elements are joined into the text to
        tokenize; the first element is used as a fallback word form. """
    toklist = list(parse_tokens(" ".join(word)))
    token = toklist[0]
    w = word[0]
    if token.kind == TOK.WORD and token.val is None:
        # Word token without annotated meanings: look it up in BÍN
        try:
            with BIN_Db.get_db() as db:
                w, m = db.lookup_word(token.txt, at_sentence_start)
        except Exception:
            # Fix: dropped the unused 'as e' binding.
            # On lookup failure, fall back to the raw text with no meanings
            w, m = token.txt, []
        token = TOK.Word(w, m)
    return self._ngram_tagger.tag_single_token(token)
def tagset(self, word, at_sentence_start=False):
    """ Return a list of (probability, tag) tuples for the given word.

        word: a sequence whose elements are joined into the text to
        tokenize; the first element is used as a fallback word form. """
    toklist = list(parse_tokens(" ".join(word)))
    token = toklist[0]
    w = word[0]
    if token.kind == TOK.WORD and token.val is None:
        # Word token without annotated meanings: look it up in BÍN
        try:
            with BIN_Db.get_db() as db:
                w, m = db.lookup_word(token.txt, at_sentence_start)
        except Exception:
            # Fix: dropped the unused 'as e' binding.
            # On lookup failure, fall back to the raw text with no meanings
            w, m = token.txt, []
        token = TOK.Word(w, m)
    return self._ngram_tagger.tag_single_token(token)
def top_authors(days=_TOP_AUTHORS_PERIOD, session=None):
    """ Return up to 10 named authors from the given period,
        with their gender and percentage score """
    end = datetime.utcnow()
    start = end - timedelta(days=days)
    authors = BestAuthorsQuery.period(
        start, end, enclosing_session=session, min_articles=10
    )[:20]
    result = list()
    with BIN_Db.get_db() as bindb:
        for author in authors:
            name = author[0]
            gender = bindb.lookup_name_gender(name)
            if gender == "hk":
                # Skip unnamed authors (e.g. "Ritstjórn Vísis")
                continue
            perc = round(float(author[4]), 2)
            result.append({"name": name, "gender": gender, "perc": perc})
    return result[:10]
def lookup_best_word(word):
    """ Look up word in BÍN, pick right one acc. to a criterion.

        Returns a (nominative, accusative, dative, genitive) tuple,
        or None if the word is not found as a noun. """
    # Fix: use the class-level context manager, consistent with the
    # rest of the codebase, instead of instantiating BIN_Db first
    with BIN_Db.get_db() as db:

        def nouns_only(bin_meaning):
            """ Accept only noun meanings (masculine, feminine, neuter) """
            return bin_meaning.ordfl in ("kk", "kvk", "hk")

        res = list(filter(nouns_only, db.lookup_nominative(word)))
        if not res:
            # Try with uppercase first char
            capw = word.capitalize()
            res = list(filter(nouns_only, db.lookup_nominative(capw)))
            if not res:
                return None
        # OK, we have one or more matching nouns
        if len(res) == 1:
            m = res[0]
        else:
            # TODO: Pick best result
            m = res[0]  # For now
        wid = m.utg
        # TODO: If more than one declension form possible (e.g. gen. björns
        # vs. bjarnar) we should also list such variations

        def sort_by_preference(m_list):
            # Filter out words that don't have the same "utg" i.e. word ID as
            # the one we successfully looked up in BÍN
            mns = list(filter(lambda w: w.utg == wid, m_list))
            # Discourage rarer declension forms, i.e. ÞGF2 and ÞGF3.
            # Fix: sort the filtered list (mns) — the original sorted the
            # unfiltered m_list, leaving mns computed but unused
            return sorted(mns, key=lambda m: "2" in m.beyging or "3" in m.beyging)

        # Look up all cases of the word in BÍN
        nom = m.stofn
        acc = db.cast_to_accusative(nom, meaning_filter_func=sort_by_preference)
        dat = db.cast_to_dative(nom, meaning_filter_func=sort_by_preference)
        gen = db.cast_to_genitive(nom, meaning_filter_func=sort_by_preference)
        return nom, acc, dat, gen
def _mynameis_handler(q: Query, ql: str) -> bool:
    """ Handle queries of the form "Ég heiti X", store this information. """
    # Find the first name-introduction regex that matches.
    # Fix: initialize m so it is bound even if the regex list is empty
    # (previously a NameError would have been raised in that case).
    m = None
    for rx in _MY_NAME_IS_REGEXES:
        m = re.search(rx, ql)
        if m:
            break
    if not m:
        return False
    name = m.group(1).strip()
    if not name:
        return False
    # Clean up name string
    name = name.split(" og ")[0]  # "ég heiti X og blablabla"
    name = name.split(" hvað ")[0]  # "ég heiti X hvað heitir þú"
    # Handle "ég heiti ekki X"
    components = name.split()
    if components[0] == "ekki":
        q.set_answer(*gen_answer("Hvað heitirðu þá?"))
        return True
    # Get first name, look up gender for a gender-tailored response,
    # defaulting to neuter ("hk") if the gender is unknown
    with BIN_Db.get_db() as bdb:
        fn = components[0].title()
        gender = bdb.lookup_name_gender(fn) or "hk"
    answ = _MY_NAME_IS_RESPONSES[gender].format(fn)
    # Save this info about user to query data table
    if q.client_id:
        qdata = dict(full=name.title(), first=fn, gender=gender)
        q.set_client_data("name", qdata)
    # Generate answer; the voice version has commas stripped
    voice = answ.replace(",", "")
    q.set_answer(dict(answer=answ), answ, voice)
    q.query_is_command()
    return True
def test_casting():
    """ Test functions to cast words in nominative case to other cases """
    from reynir.bindb import BIN_Db

    db = BIN_Db()
    # (nominative, accusative, dative, genitive) test vectors
    cases = [
        ("", "", "", ""),
        ("xxx", "xxx", "xxx", "xxx"),
        ("maðurinn", "manninn", "manninum", "mannsins"),
        ("mennirnir", "mennina", "mönnunum", "mannanna"),
        ("framkvæma", "framkvæma", "framkvæma", "framkvæma"),
        ("stóru", "stóru", "stóru", "stóru"),
        ("stóri", "stóra", "stóra", "stóra"),
        ("kattarhestur", "kattarhest", "kattarhesti", "kattarhests"),
        ("Kattarhestur", "Kattarhest", "Kattarhesti", "Kattarhests"),
        ("Suður-Afríka", "Suður-Afríku", "Suður-Afríku", "Suður-Afríku"),
        ("Vestur-Þýskaland", "Vestur-Þýskaland", "Vestur-Þýskalandi", "Vestur-Þýskalands"),
    ]
    for nom, acc, dat, gen in cases:
        assert db.cast_to_accusative(nom) == acc
        assert db.cast_to_dative(nom) == dat
        assert db.cast_to_genitive(nom) == gen
    # A meaning filter that excludes the '2' declension variants
    f = lambda mm: [m for m in mm if "2" not in m.beyging]
    assert db.cast_to_accusative("fjórir", meaning_filter_func=f) == "fjóra"
    assert db.cast_to_dative("fjórir", meaning_filter_func=f) == "fjórum"
    assert db.cast_to_genitive("fjórir", meaning_filter_func=f) == "fjögurra"
    # A meaning filter that sorts the rarer 2/3 declension variants last
    f = lambda mm: sorted(mm, key=lambda m: "2" in m.beyging or "3" in m.beyging)
    assert db.cast_to_accusative("Kópavogur", meaning_filter_func=f) == "Kópavog"
    assert db.cast_to_dative("Kópavogur", meaning_filter_func=f) == "Kópavogi"
    assert db.cast_to_genitive("Kópavogur", meaning_filter_func=f) == "Kópavogs"
def test():
    """ Demo: run the spelling/grammar Corrector over a set of real-world
        Icelandic text samples and print original vs. corrected versions,
        along with total processing time. """
    with BIN_Db.get_db() as db:
        c = Corrector(db)
        # Sample texts (verbatim user comments in Icelandic, including
        # their original misspellings, which the corrector should fix)
        txts = [
            """
        FF er flokkur með rasisku ívafi og tilhneygjingu til að einkavinavæða alla
        fjölmiðla Íslands og færa þar með elítunni að geta ein haft áhrif á
        skoðanamyndandi áhri í fjölmiðlaheiminum, er ekki viðbúið að svona flokkur
        gamgi til samstarf við íhaldið eftir kosningar en ekki þessa vondu
        félagshyggjuflokka
        """,
            """
        fæ alveg hræðileg drauma vegna fyrri áfalla og það hjálpar mér að ná góðum
        svef og þar með betri andlegri lýðan og líka til að auka matarlist. Tek samt
        skýrt fram að ég hef bæði missnotað kannabis og ekki. Hef engan áhuga á að
        vera undir áhrifum kannabis alla dag. Mikil munur á að nota og missnota !
        """,
            """
        Bæði , lyf gegn áfengissyki (leiða) , mér hefur ekki leiðst mikið seinustu
        30 ár. Gegn Taugaveiklun, konan hamrar á mér alla daga , skærur hennar eru
        langar og strangar. En ef ég fæ eina pípu og gríp gitarinn má hún tuða í mér
        klukkutímum saman.Ég er bæði rólegur og læri hratt á gítarinn, eftir 10 ára
        hjónaband er ég bara ótrúlega heill og stefni hátt. Ég og gitarinn erum
        orðnir samvaxnir. Auðvitað stefnum við á skilnað og þá mun ég sakna
        skalaæfinganna.
        """,
            """
        biddu nu hæg - var Kvennalistinn eins malefnis hreyfing. Hvað attu við - ef
        þu telur malefnið hafa verið eitt hvert var það? Kannski leikskola fyrir öll
        börn? Sömu laun fyrir sömu störf? Að borgarskipulag tæki mið af þörfum
        beggja kynja? Að kynjagleraugu væru notuð við gerð fjarlaga? Að þjoðfelagið
        opnaði augun fyrir kynferðsofbeldinu og sifjaspellum? (hvorutveggja sagt
        aðeins viðgangast i utlöndum). Þetta eru aðeins örfa dæmi um malefni sem
        brunnu a okkur og við börðumst fyrir. Ekki ertu i alvöru að tala framlag
        okkur niður. Tæplega telurðu það EITT malefni þo að i grunninn hafi baratta
        okkar sem stoðum að Kvennaframboðinu og -listanum gengið ut a að ,,betri,,
        helmingur þjoðarinnar öðlast - ekki bara i orði heldur einnig a borði - sömu
        rettindi og raðandi helmingurinn
        """,
            """
        Salvör ekki standa i að reyna að klora yfir mistök þin. Reynsluheimur kvenna
        visar að sjalsögðu til þess að helmingur mannkynsins - -konur - er olikur
        hinum helmingnum bæði sökum lffræðilegs munar og þess að þær eru gerðar að
        konum (sb de Beauvoir) þe fra frumbernsku er drengjum hrosað fyrir annað en
        stulkum og væntingar foreldra eru aðrar til dætra en sona og auk þess er
        ætlast til að dætur læri af mæðrum en synir af feðrum. Það er þetta sem
        gerir konur - helming mannkynsins - frabrugðna körlum sem hafa fra örofi
        alda verið ,,raðandi,, kynið. Það var gegn þvi orettlæti að reynsluheimur
        kvenna speglaðist ekki i politiskum akvörðunum sem við sem stofnaði
        Kvennafranboðið og - listann börðumst gegn - a öllum vigstöðvum. Að
        skilgreina barattu okkar Kvennalistans - fyrir rettindum halfrar
        þjoðarinnar til að skapa ,,rettlatara samfelag,, - sem eins mals flokk er
        fjarstæða.
        """,
        ]

        def linebreak(txt, margin=80, left_margin=0):
            """ Return a nicely column-formatted string representation
                of the given text, where each line is not longer than
                the given margin (if possible). A left margin can be
                optionally added, as a sequence of spaces. The lines
                are joined by newlines ('\n') but there is no trailing
                newline. """
            result = []
            line: List[str] = []
            len_line = 0
            for wrd in txt.split():
                if len_line + 1 + len(wrd) > margin:
                    # Adding this word would overflow: start a new line
                    result.append(" ".join(line))
                    line = []
                    len_line = 0
                line.append(wrd)
                len_line += 1 + len(wrd)
            if line:
                result.append(" ".join(line))
            return "\n".join(" " * left_margin + line for line in result)

        t0 = time.time()
        for t in txts:
            print("\nOriginal:\n")
            print(linebreak(t, left_margin=8))
            print("\nCorrected:\n")
            print(linebreak(c.correct_text(t), left_margin=8))
        t1 = time.time()
        print("\nTotal time: {0:.2f} seconds".format(t1 - t0))
) except socket_error as e: if e.errno == errno.EADDRINUSE: # Address already in use logging.error( "Reynir is already running at host {0}:{1}".format( Settings.HOST, Settings.PORT ) ) sys.exit(1) else: raise finally: ArticleProxy.cleanup() BIN_Db.cleanup() else: # Suppress information log messages from Werkzeug werkzeug_log = logging.getLogger("werkzeug") if werkzeug_log: werkzeug_log.setLevel(logging.WARNING) # Log our startup log_str = "Reynir instance starting with host={0}:{1}, db_hostname={2} on Python {3}".format( Settings.HOST, Settings.PORT, Settings.DB_HOSTNAME, sys.version.replace("\n", " "), )
def handle_plain_text(q):
    """ Handle a plain text query, contained in the q parameter
        which is an instance of the query.Query class.
        Returns True if the query was handled, and in that case
        the appropriate properties on the Query instance have
        been set, such as the answer and the query type (qtype).
        If the query is not recognized, returns False. """
    ql = q.query_lower.rstrip("?")
    # Timezone being asked about
    tz = None
    # Whether user asked for the time in a particular location
    specific_desc = None
    if ql in _TIME_QUERIES:
        # Use location to determine time zone
        tz = timezone4loc(q.location, fallback="IS")
    elif ql.startswith("hvað er klukkan á ") or ql.startswith(
            "hvað er klukkan í "):
        # Query about the time in a particular location, i.e. country or city
        # Both prefixes above are exactly 18 characters long
        loc = ql[18:]  # Cut away question prefix, leaving only placename
        # Capitalize each word in country/city name
        loc = capitalize_placename(loc)
        # Look up nominative
        # This only works for single-word city/country names found
        # in BÍN and could be improved (e.g. fails for "Nýju Jórvík")
        bin_res = BIN_Db().lookup_nominative(loc)
        words = [m.stofn for m in bin_res]
        words.append(
            loc)  # In case it's not in BÍN (e.g. "New York", "San José")
        # Check if any word is a recognised country or city name
        for w in words:
            cc = isocode_for_country_name(w)
            if cc and cc in country_timezones:
                # Look up country timezone
                # Use the first timezone although some countries have more than one
                # The timezone list returned by pytz is ordered by "dominance"
                tz = country_timezones[cc][0]
            else:
                # It's not a country name, look up in city database
                info = lookup_city_info(w)
                if info:
                    top = info[0]
                    location = (top.get("lat_wgs84"), top.get("long_wgs84"))
                    tz = timezone4loc(location)
            if tz:
                # We have a timezone
                break
        # "Klukkan í Lundúnum er" - Used for voice answer
        # (ql[8:] drops the 8-character prefix "hvað er ")
        specific_desc = "{0} er".format(ql[8:])
        # Beautify query by capitalizing the country/city name
        q.set_beautified_query("{0}{1}?".format(q.beautified_query[:18], loc))
    # We have a timezone. Return formatted answer.
    if tz:
        now = datetime.now(timezone(tz))
        desc = specific_desc or "Klukkan er"
        # Create displayable answer
        answer = "{0:02}:{1:02}".format(now.hour, now.minute)
        # A detailed response object is usually a list or a dict
        response = dict(answer=answer)
        # A voice answer is a plain string that will be
        # passed as-is to a voice synthesizer
        voice = "{0} {1}:{2:02}.".format(desc, now.hour, now.minute)
        q.set_qtype(_TIME_QTYPE)
        q.set_key(tz)  # Query key is the timezone
        q.set_answer(response, answer, voice)
        return True
    return False
"\"Önnu kveið # fyrir skóladeginum.\"\n") print("\nUpphaflegur texti: '{0}'".format(txt)) for pg in rc.check(txt, split_paragraphs=True): for sent in pg: display_annotations(sent) print("---") sys.exit(0) import time from reynir_correct.spelling import Corrector from reynir.bindb import BIN_Db with BIN_Db.get_db() as db: c = Corrector(db) # type: Corrector def test(c, word): t0 = time.time() result = list(c.subs(word)) valid = [r for r in result if r in c] t1 = time.time() print("Word: {0}, combinations: {1}, time {2:.3f} secs".format( word, len(result), t1 - t0)) print(result) print(valid) test(c, "hæstarréttarlögmaður")
def QGeoSubject(node, params, result):
    """ Grammar nonterminal handler: store the nominative form
        of the matched place name on the result object """
    placename = capitalize_placename(result._text)
    meanings = BIN_Db().lookup_nominative(placename)
    # Prefer the BÍN stem if available, otherwise keep the name as-is
    result.subject = meanings[0].stofn if meanings else placename
def recognize_entities(token_stream, enclosing_session=None, token_ctor=TOK):
    """ Parse a stream of tokens looking for (capitalized) entity names.
        The algorithm implements N-token lookahead where N is the
        length of the longest entity name having a particular initial word.
        Adds a named entity recognition layer on top of the
        reynir.bintokenizer.tokenize() function.

        token_stream: iterator of tokens to annotate
        enclosing_session: optional database session to reuse
        token_ctor: token factory, defaults to TOK

        Yields tokens, with consecutive capitalized words that match
        known entities squashed into single Entity/Person tokens. """
    # Token queue (lookahead buffer of tokens not yet yielded)
    tq = []
    # Phrases we're considering. Note that an entry of None
    # indicates that the accumulated phrase so far is a complete
    # and valid known entity name.
    state = defaultdict(list)
    # Entitiy definition cache
    ecache = dict()
    # Last name to full name mapping ('Clinton' -> 'Hillary Clinton')
    lastnames = dict()
    with BIN_Db.get_db() as db, SessionContext(
        session=enclosing_session, commit=True, read_only=True
    ) as session:

        def fetch_entities(w, fuzzy=True):
            """ Return a list of entities matching the word(s) given,
                exactly if fuzzy = False, otherwise also as a starting
                word(s) """
            try:
                q = session.query(Entity.name, Entity.verb, Entity.definition)
                if fuzzy:
                    q = q.filter(Entity.name.like(w + " %") | (Entity.name == w))
                else:
                    q = q.filter(Entity.name == w)
                return q.all()
            except OperationalError as e:
                # Best-effort: log and continue with no entities
                logging.warning("SQL error in fetch_entities(): {0}".format(e))
                return []

        def query_entities(w):
            """ Return a list of entities matching the initial word given,
                using the ecache dict as a memoization cache """
            e = ecache.get(w)
            if e is None:
                ecache[w] = e = fetch_entities(w)
            return e

        def lookup_lastname(lastname):
            """ Look up a last name in the lastnames registry,
                eventually without a possessive 's' at the end,
                if present """
            fullname = lastnames.get(lastname)
            if fullname is not None:
                # Found it
                return fullname
            # Try without a possessive 's', if present
            if lastname.endswith("s"):
                return lastnames.get(lastname[0:-1])
            # Nope, no match
            return None

        def flush_match():
            """ Flush a match that has been accumulated in the token queue """
            if len(tq) == 1 and lookup_lastname(tq[0].txt) is not None:
                # If single token, it may be the last name of a
                # previously seen entity or person
                return token_or_entity(tq[0])
            # Reconstruct original text behind phrase
            ename = " ".join([t.txt for t in tq])
            # We don't include the definitions in the token - they should be
            # looked up on the fly when processing or displaying the parsed
            # article
            return token_ctor.Entity(ename)

        def token_or_entity(token):
            """ Return a token as-is or, if it is a last name of a person
                that has already been mentioned in the token stream by
                full name, refer to the full name """
            assert token.txt[0].isupper()
            tfull = lookup_lastname(token.txt)
            if tfull is None:
                # Not a last name of a previously seen full name
                return token
            if tfull.kind != TOK.PERSON:
                # Return an entity token with no definitions
                # (this will eventually need to be looked up by full name
                # when displaying or processing the article)
                return token_ctor.Entity(token.txt)
            # Return the full name meanings
            return token_ctor.Person(token.txt, tfull.val)

        try:
            while True:
                token = next(token_stream)
                if not token.txt:  # token.kind != TOK.WORD:
                    # Non-word token: any accumulated match ends here
                    if state:
                        if None in state:
                            yield flush_match()
                        else:
                            yield from tq
                        tq = []
                        state = defaultdict(list)
                    yield token
                    continue
                # Look for matches in the current state and build a new state
                newstate = defaultdict(list)
                w = token.txt  # Original word

                def add_to_state(slist, entity):
                    """ Add the list of subsequent words to the new parser state """
                    wrd = slist[0] if slist else None
                    rest = slist[1:]
                    newstate[wrd].append((rest, entity))

                if w in state:
                    # This matches an expected token
                    tq.append(token)  # Add to lookahead token queue
                    # Add the matching tails to the new state
                    for sl, entity in state[w]:
                        add_to_state(sl, entity)
                    # Update the lastnames mapping
                    fullname = " ".join([t.txt for t in tq])
                    parts = fullname.split()
                    # If we now have 'Hillary Rodham Clinton',
                    # make sure we delete the previous 'Rodham' entry
                    for p in parts[1:-1]:
                        if p in lastnames:
                            del lastnames[p]
                    if parts[-1][0].isupper():
                        # 'Clinton' -> 'Hillary Rodham Clinton'
                        lastnames[parts[-1]] = token_ctor.Entity(fullname)
                else:
                    # Not a match for an expected token
                    if state:
                        if None in state:
                            # We have an accumulated match, but if the next
                            # token is an uppercase word without a BÍN meaning,
                            # we append it to the current entity regardless.
                            # This means that 'Charley Lucknow' is handled as
                            # a single new entity name even if 'Charley'
                            # already exists as an entity.
                            while w and w[0].isupper() and not token.val:
                                # Append to the accumulated token queue, which
                                # will be squashed to a single token in
                                # flush_match()
                                tq.append(token)
                                token = next(token_stream)
                                w = token.txt
                            # Flush the already accumulated match
                            yield flush_match()
                        else:
                            yield from tq
                        tq = []
                    # Add all possible new states for entity names
                    # that could be starting
                    weak = True
                    cnt = 1
                    upper = w and w[0].isupper()
                    parts = None
                    if upper and " " in w:
                        # For all uppercase phrases (words, entities, persons),
                        # maintain a map of last names to full names
                        parts = w.split()
                        lastname = parts[-1]
                        # Clinton -> Hillary [Rodham] Clinton
                        if lastname[0].isupper():
                            # Look for Icelandic patronyms/matronyms
                            _, m = db.lookup_word(lastname, False)
                            if m and any(mm.fl in {"föð", "móð"} for mm in m):
                                # We don't store Icelandic patronyms/matronyms
                                # as surnames
                                pass
                            else:
                                lastnames[lastname] = token
                    if token.kind == TOK.WORD and upper and w not in Abbreviations.DICT:
                        if " " in w:
                            # w may be a person name with more than one
                            # embedded word; parts is assigned in the if
                            # statement above
                            cnt = len(parts)
                        elif not token.val or ("-" in token.val[0].stofn):
                            # No BÍN meaning for this token, or the meanings
                            # were constructed by concatenation (indicated by
                            # a hyphen in the stem)
                            weak = False  # Accept single-word entity references
                        # elist is a list of Entity instances
                        elist = query_entities(w)
                    else:
                        elist = []
                    if elist:
                        # This word might be a candidate to start an entity
                        # reference
                        candidate = False
                        for e in elist:
                            # List of subsequent words in entity name
                            sl = e.name.split()[cnt:]
                            if sl:
                                # Here's a candidate for a longer entity
                                # reference than we already have
                                candidate = True
                            if sl or not weak:
                                add_to_state(sl, e)
                        if weak and not candidate:
                            # Found no potential entity reference longer than
                            # this token already is - and we have a BÍN
                            # meaning for it: Abandon the effort
                            assert not newstate
                            assert not tq
                            yield token_or_entity(token)
                        else:
                            # Go for it: Initialize the token queue
                            tq = [token]
                    else:
                        # Not a start of an entity reference: simply yield the
                        # token
                        assert not tq
                        if upper:
                            # Might be a last name referring to a full name
                            yield token_or_entity(token)
                        else:
                            yield token
                # Transition to the new state
                state = newstate
        except StopIteration:
            # Token stream is exhausted
            pass
        # Yield an accumulated match if present
        if state:
            if None in state:
                yield flush_match()
            else:
                yield from tq
            tq = []
        # print("\nEntity cache:\n{0}".format("\n".join("'{0}': {1}".format(k, v) for k, v in ecache.items())))
        # print("\nLast names:\n{0}".format("\n".join("{0}: {1}".format(k, v) for k, v in lastnames.items())))
        assert not tq
port=Settings.PORT, debug=Settings.DEBUG, use_reloader=not ptvsd_attached, extra_files=extra_files, ) except socket_error as e: if e.errno == errno.EADDRINUSE: # Address already in use logging.error( "Greynir web app is already running at host {0}:{1}".format( Settings.HOST, Settings.PORT)) sys.exit(1) else: raise finally: ArticleProxy.cleanup() BIN_Db.cleanup() else: app.config["PRODUCTION"] = True # Suppress information log messages from Werkzeug werkzeug_log = logging.getLogger("werkzeug") if werkzeug_log: werkzeug_log.setLevel(logging.WARNING) # Log our startup log_str = ("Greynir instance starting with " "host={0}:{1}, db_host={2}:{3} on Python {4}".format( Settings.HOST, Settings.PORT, Settings.DB_HOSTNAME,
def wordfreq():
    """ Return word frequency chart data for a given time period.

        Reads date_from, date_to and words from the request query string
        and responds with a JSON object containing chart labels and
        per-word datasets. """
    resp = dict(err=True)
    # Create datetime objects from query string args
    try:
        date_fmt = "%Y-%m-%d"
        date_from = datetime.strptime(request.args.get("date_from"), date_fmt)
        date_to = datetime.strptime(request.args.get("date_to"), date_fmt)
    except Exception as e:
        logging.warning("Failed to parse date arg: {0}".format(e))
        return better_jsonify(**resp)
    # Words parameter should be one or more word lemmas (w. optional category)
    warg = request.args.get("words")
    if not warg:
        return better_jsonify(**resp)
    # Split on comma or whitespace, limit to max 6 words
    # NOTE(review): this replace looks like a no-op (space for space) —
    # possibly intended to collapse double spaces; confirm against history
    warg = warg.strip().replace(" ", " ").replace(",", " ")
    words = [w.strip() for w in warg.split()][:6]
    # Word categories can be specified thus: "maður:kk"
    words = [tuple(w.split(":")) for w in words]
    with BIN_Db.get_db() as db:

        def cat4word(w):
            """ Return the word category (ordfl) for w, defaulting to
                neuter noun ("hk") if the word is not found """
            _, meanings = db.lookup_word(w, auto_uppercase=True)
            if meanings:
                # Give precedence to lemmas, e.g. interpret "reima" as
                # verb rather than gen. pl. of fem. noun "reim"
                lemmas = list(filter(lambda x: x.stofn == w, meanings))
                return lemmas[0].ordfl if lemmas else meanings[0].ordfl
            return "hk"

        # Get word category (ordfl) for each word, if needed
        valid_cats = ["kk", "kvk", "hk", "lo", "so"]
        for i, w in enumerate(words):
            if len(w) < 2 or w[1] not in valid_cats:
                words[i] = (w[0], cat4word(w[0]))
    colors = list(_LINE_COLORS)
    # Generate date labels
    now = datetime.utcnow()
    delta = date_to - date_from
    labels = [date_from + timedelta(days=i) for i in range(delta.days + 1)]
    with changedlocale(category="LC_TIME"):
        labels = [
            l.strftime("%-d. %b")
            if l.year == now.year
            else l.strftime("%-d. %b %Y")
            for l in labels
        ]
    # More human readble description of word categories
    CAT_DESC = {
        "kk": "kk. no.",
        "kvk": "kvk. no.",
        "hk": "hk. no.",
        "lo": "lo.",
        "so": "so.",
    }
    # Create datasets for front-end chart
    with SessionContext(commit=False) as session:
        data = dict(labels=labels, datasets=[])
        for w in words:
            # Look up frequency of word for the given period
            res = WordFrequencyQuery.fetch(
                w[0], w[1], date_from, date_to, enclosing_session=session
            )
            # Generate data and config for chart
            label = "{0} ({1})".format(w[0], CAT_DESC.get(w[1]))
            ds = dict(label=label, fill=False, lineTension=0)
            ds["borderColor"] = ds["backgroundColor"] = colors.pop(0)
            ds["data"] = [r[1] for r in res]
            data["datasets"].append(ds)
    # Create response
    resp["err"] = False
    resp["data"] = data
    # Update word list client-side
    resp["words"] = ", ".join([":".join(w) for w in words])
    return better_jsonify(**resp)