def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news.

    Examines up to 2 * limit of the newest person mentions from visible
    roots, keeps one entry per distinct name and stops as soon as limit
    names have been gathered. The result is a list of dicts with the keys
    name, title, gender, url and uuid, sorted by name using the collation
    function supplied by changedlocale(). """
    persons = {}
    gender_db = BIN_Db.get_db()

    with SessionContext(commit=True) as session:
        query = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))
        )
        # Go through up to 2 * N records, newest first
        rows = query[0 : limit * 2]

        for row in rows:
            current = persons.get(row.name)
            if current is not None and len(row.title) <= len(current[0]):
                # The name is already listed with a title that is
                # at least as long as this one: keep the existing entry
                continue
            persons[row.name] = (
                correct_spaces(row.title),
                row.article_url,
                row.id,
                gender_db.lookup_name_gender(row.name),
            )
            if len(persons) >= limit:
                # We now have as many names as we initially wanted
                break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a list of dicts, sorted by name
        entries = [
            dict(name=name, title=title, gender=gender, url=url, uuid=uuid)
            for name, (title, url, uuid, gender) in persons.items()
        ]
        entries.sort(key=lambda entry: strxfrm(entry["name"]))
        return entries
def sentence(state, result):
    """ Called when sentence processing is complete.

    Hands the query object in state["query"] to the first handler in
    _Q2FN_MAP whose key is present in result. Any exception raised along
    the way is logged and recorded on the query as an error instead of
    being propagated. """
    query = state["query"]
    if "qtype" not in result:
        query.set_error("E_QUERY_NOT_UNDERSTOOD")
        return

    # Successfully matched a query type
    try:
        with changedlocale(category="LC_TIME"):
            for key, handler in _Q2FN_MAP:
                if key not in result:
                    continue
                # Hand query object over to the first matching handler
                handler(query, result)
                # Lowercase the query string to avoid 'Dagur' being
                # displayed with a capital D
                query.lowercase_beautified_query()
                query.set_qtype(_DATE_QTYPE)
                break
    except Exception as exc:
        logging.warning(
            "Exception {0} while processing date query '{1}'".format(exc, query.query)
        )
        query.set_error("E_EXCEPTION: {0}".format(exc))
def iceformat_float(
    fp_num: float, decimal_places: int = 2, strip_zeros: bool = True
) -> str:
    """ Convert number to Icelandic decimal format string.

    The number is formatted with decimal_places decimals and digit
    grouping under the LC_NUMERIC locale set up by changedlocale(), and
    any space in the output is replaced by a period. If strip_zeros is
    true, the result is passed through strip_trailing_zeros(). """
    pattern = "%.{0}f".format(decimal_places)
    with changedlocale(category="LC_NUMERIC"):
        formatted = locale.format_string(pattern, float(fp_num), grouping=True)
        formatted = formatted.replace(" ", ".")
        if strip_zeros:
            return strip_trailing_zeros(formatted)
        return formatted
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news.

    Examines up to 2 * limit of the newest person mentions from visible
    roots and keeps the preferred title for each distinct name, stopping
    as soon as limit names have been gathered. The result is a list of
    dicts with the keys name, title, gender, url and uuid, sorted by name
    using the collation function supplied by changedlocale(). """

    def prefer_new_title(candidate, current):
        """ Return True if candidate should replace the current title """
        cand_len, cur_len = len(candidate), len(current)
        if cur_len >= _MAX_TITLE_LENGTH:
            # The current title is too long: any shorter one is better
            return cand_len < cur_len
        # Otherwise longer is better, as long as the candidate
        # itself stays below the maximum length
        return cur_len < cand_len < _MAX_TITLE_LENGTH

    persons = {}

    with SessionContext(read_only=True) as session:
        query = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))
        )
        # Go through up to 2 * N records, newest first
        rows = query[0 : limit * 2]

        with BIN_Db.get_db() as bindb:
            for row in rows:
                current = persons.get(row.name)
                if current is not None and not prefer_new_title(
                    row.title, current[0]
                ):
                    # Already listed with a title we like better
                    continue
                persons[row.name] = (
                    correct_spaces(row.title),
                    row.article_url,
                    row.id,
                    bindb.lookup_name_gender(row.name),
                )
                if len(persons) >= limit:
                    # We now have as many names as we initially wanted
                    break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a list of dicts, sorted by name
        entries = [
            dict(name=name, title=title, gender=gender, url=url, uuid=uuid)
            for name, (title, url, uuid, gender) in persons.items()
        ]
        entries.sort(key=lambda entry: strxfrm(entry["name"]))
        return entries
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news.

    Examines up to 2 * limit of the newest person mentions from visible
    roots and keeps the preferred title for each distinct name, stopping
    as soon as limit names have been gathered. The result is a list of
    dicts with the keys name, title, gender, url and uuid, sorted by name
    using the collation function supplied by changedlocale(). """

    def prefer_new_title(candidate, current):
        """ Return True if candidate should replace the current title """
        cand_len, cur_len = len(candidate), len(current)
        if cur_len >= _MAX_TITLE_LENGTH:
            # The current title is too long: any shorter one is better
            return cand_len < cur_len
        # Otherwise longer is better, as long as the candidate
        # itself stays below the maximum length
        return cur_len < cand_len < _MAX_TITLE_LENGTH

    persons = {}

    with SessionContext(read_only=True) as session:
        query = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))
        )
        # Go through up to 2 * N records, newest first
        rows = query[0 : limit * 2]

        with BIN_Db.get_db() as bindb:
            for row in rows:
                current = persons.get(row.name)
                if current is not None and not prefer_new_title(
                    row.title, current[0]
                ):
                    # Already listed with a title we like better
                    continue
                persons[row.name] = (
                    correct_spaces(row.title),
                    row.article_url,
                    row.id,
                    bindb.lookup_name_gender(row.name),
                )
                if len(persons) >= limit:
                    # We now have as many names as we initially wanted
                    break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a list of dicts, sorted by name
        entries = [
            dict(name=name, title=title, gender=gender, url=url, uuid=uuid)
            for name, (title, url, uuid, gender) in persons.items()
        ]
        entries.sort(key=lambda entry: strxfrm(entry["name"]))
        return entries
def make_response_list(rd):
    """ Create a response list from the result dictionary rd.

    rd maps each distinct result string to a dict of articles; it is
    modified in place while results that are contained in other results
    (ignoring case) are merged. Returns a list of
    (result string, [articles, newest first]) tuples, ordered by
    descending article count and then by collated result string, with
    each article list cropped to _MAX_URLS entries and the list itself
    cropped to _MAXLEN_ANSWER entries. """

    def newest_first(articles):
        """ Sort the individual articles so that the newest one appears first """
        return sorted(articles.values(), key=lambda art: art.ts, reverse=True)

    # First pass: fold later results into earlier ones that contain them.
    # A folded key is replaced by None so that it is skipped from then on.
    keys = list(rd.keys())
    num_keys = len(keys)
    for i in range(num_keys - 1):
        container = keys[i]
        if container is None:
            continue
        for j in range(i + 1, num_keys):
            candidate = keys[j]
            if candidate is None:
                continue
            if candidate.lower() in container.lower():
                rd[container].update(rd[candidate])
                del rd[candidate]
                keys[j] = None

    # Second pass: fold earlier results into the first later one
    # that contains them
    keys = list(rd.keys())
    for i, earlier in enumerate(keys[:-1]):
        for later in keys[i + 1 :]:
            if earlier.lower() in later.lower():
                rd[later].update(rd[earlier])
                del rd[earlier]
                break

    with changedlocale() as strxfrm:
        ranked = [(text, newest_first(articles)) for text, articles in rd.items()]
        # Sort by number of articles (descending), then by collated text
        ranked.sort(key=lambda item: (-len(item[1]), strxfrm(item[0])))

    # If the entry at the cutoff position still has more than one article,
    # drop all entries that have only a single one
    if len(ranked) > _CUTOFF_AFTER and len(ranked[_CUTOFF_AFTER][1]) > 1:
        ranked = [item for item in ranked if len(item[1]) > 1]

    # Crop the article lists down to _MAX_URLS
    ranked = [
        item if len(item[1]) <= _MAX_URLS else (item[0], item[1][0:_MAX_URLS])
        for item in ranked
    ]
    return ranked[0:_MAXLEN_ANSWER]
def main():
    """ Main program.

    Reads the settings and the verb resource file, then prints the verbs
    sorted by their nom attribute, framed by a generated header and a
    total count. Returns 0 on success, or 2 if the configuration could
    not be read. """
    try:
        Settings.read("config/Reynir.conf")
    except ConfigError as err:
        print("Configuration error: {0}".format(err), file=sys.stderr)
        return 2

    verbs = read_verbs("resources/sagnir.txt")

    with changedlocale() as strxfrm:
        # Sort using the collation function supplied by changedlocale()
        verbs_sorted = list(verbs.values())
        verbs_sorted.sort(key=lambda verb: strxfrm(verb.nom))

    print("#\n# Verb list generated by verbs.py from resources/sagnir.txt")
    # Timestamp with second resolution (fractional part cut off)
    timestamp = str(datetime.utcnow())[0:19]
    print("#", timestamp, "\n#\n")

    display(verbs_sorted)

    print("\n# Total: {0} distinct verbs\n".format(len(verbs)))

    # NOTE: A check for verbs that are missing or different in Verbs.conf
    # (check_missing) is currently disabled here
    return 0
def main():
    """ Main program.

    Reads the settings and the verb resource file, then prints the verbs
    sorted by their nom attribute, framed by a generated header and a
    total count. Returns 0 on success, or 2 if the configuration could
    not be read. """
    try:
        Settings.read("config/Greynir.conf")
    except ConfigError as err:
        print("Configuration error: {0}".format(err), file=sys.stderr)
        return 2

    verbs = read_verbs("resources/sagnir.txt")

    with changedlocale() as strxfrm:
        # Sort using the collation function supplied by changedlocale()
        verbs_sorted = list(verbs.values())
        verbs_sorted.sort(key=lambda verb: strxfrm(verb.nom))

    print("#\n# Verb list generated by verbs.py from resources/sagnir.txt")
    # Timestamp with second resolution (fractional part cut off)
    timestamp = str(datetime.utcnow())[0:19]
    print("#", timestamp, "\n#\n")

    display(verbs_sorted)

    print("\n# Total: {0} distinct verbs\n".format(len(verbs)))

    # NOTE: A check for verbs that are missing or different in Verbs.conf
    # (check_missing) is currently disabled here
    return 0
def sentence(state, result):
    """ Called when sentence processing is complete.

    Answers one of three kinds of date query found in result: the current
    date ("now"), the period until/since a target date ("until" or
    "since" together with "target"), or when a described day occurs
    ("when" together with "target"). The key, answer and query type are
    stored on the query object in state["query"]. Any exception is logged
    and recorded on the query as an error instead of being propagated. """
    query = state["query"]
    if "qtype" not in result:
        query.set_error("E_QUERY_NOT_UNDERSTOOD")
        return

    # Successfully matched a query type
    try:
        with changedlocale(category="LC_TIME"):
            # TODO: Restore correct timezone handling; the timezone
            # lookup is disabled and the current UTC time is used instead
            now = datetime.utcnow()
            key = None

            if "now" in result:
                # Asking about current date
                today_str = now.strftime("%A %-d. %B %Y")
                answer = today_str.capitalize()
                spoken = "Í dag er {0}".format(today_str)
                # Put a spelled-out ordinal number instead of the numeric
                # one to get the grammar right
                voice = re.sub(r" \d+\. ", " " + _DAY_INDEX_NOM[now.day] + " ", spoken)
                response = dict(answer=answer)
                key = "CurrentDate"
            elif ("until" in result or "since" in result) and "target" in result:
                # Asking about period until/since a given date
                response, answer, voice = howlong_desc_answ(result.target)
                key = "FutureDate" if "until" in result else "SinceDate"
            elif "when" in result and "target" in result:
                # TODO: Fix this so it includes weekday, e.g.
                # "Sunnudaginn 1. október"
                # The verb defaults to 'er' but may be overridden by the
                # result (e.g. plural 'eru' for 'páskar')
                verb = result.is_verb if "is_verb" in result else "er"
                described = " ".join(
                    (result.desc, verb, result.target.strftime("%-d. %B"))
                )
                answer = voice = described[0].upper() + described[1:].lower()
                # Put a spelled-out ordinal number instead of the numeric
                # one, in accusative case
                voice = re.sub(
                    r"\d+\. ", _DAY_INDEX_ACC[result.target.day] + " ", voice
                )
                response = dict(answer=answer)
            else:
                # Shouldn't be here
                raise Exception("Unable to handle date query")

            query.set_key(key)
            query.set_answer(response, answer, voice)
            # Lowercase the query string to avoid 'Dagur' being
            # displayed with a capital D
            query.lowercase_beautified_query()
            query.set_qtype(_DATE_QTYPE)
    except Exception as exc:
        logging.warning("Exception while processing date query: {0}".format(exc))
        query.set_error("E_EXCEPTION: {0}".format(exc))
def chart_stats(session=None, num_days: int = 7) -> Dict[str, Any]:
    """ Return scraping and parsing stats for charts.

    Collects, for each of the num_days most recent UTC days (ending with
    today), the article count per source, the percentage of sentences
    that were parsed, and the query count.

    Args:
        session: Passed on to the underlying queries as enclosing_session.
        num_days: Number of days to cover. If it is zero or negative, no
            days are examined and all averages are reported as 0.

    Returns:
        A dict with the keys "scraped", "parsed" and "queries", each
        holding chart "labels", "datasets" and an "avg" per day. """
    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    week_ago = today - timedelta(days=6)
    labels: List[str] = []
    sources: Dict[str, List[int]] = {}
    parsed_data = []
    query_data = []

    def daily_average(total):
        """ Average total over the period, guarding against an empty period """
        return total / num_days if num_days > 0 else 0

    # Get article count for each source for each day, and query count for each day
    # We change locale to get localized date weekday/month names
    with changedlocale(category="LC_TIME"):
        # Walk from the oldest day in the period up to today
        for days_back in range(num_days - 1, -1, -1):
            start = today - timedelta(days=days_back)
            end = today - timedelta(days=days_back - 1)

            # Generate label: days older than six days get no weekday name
            label_fmt = "%-d. %b" if start < week_ago else "%a %-d. %b"
            labels.append(start.strftime(label_fmt))

            # Get article count per source for day
            # Also collect parsing stats for parse % chart
            sent = 0
            parsed = 0
            # NOTE(review): a source's list only grows on days for which the
            # query returns a row for it; presumably every source is
            # returned for every day - confirm against ChartsQuery
            for (name, cnt, s, p) in ChartsQuery.period(
                start, end, enclosing_session=session
            ):
                sources.setdefault(name, []).append(cnt)
                sent += s
                parsed += p
            parsed_data.append(round((parsed / sent) * 100, 2) if sent else 0)

            # Get query count for day
            q = QueriesQuery.period(start, end, enclosing_session=session)
            query_data.append(q[0][0])

    # Create datasets for bar chart, ordered by source name
    datasets = [
        {
            "label": name,
            "backgroundColor": _SOURCE_ROOT_COLORS.get(name, "#000"),
            "data": counts,
        }
        for name, counts in sorted(sources.items())
    ]
    article_count = sum(sum(counts) for counts in sources.values())

    return {
        "scraped": {
            "labels": labels,
            "datasets": datasets,
            "avg": daily_average(article_count),
        },
        "parsed": {
            "labels": labels,
            "datasets": [{"data": parsed_data}],
            "avg": daily_average(sum(parsed_data)),
        },
        "queries": {
            "labels": labels,
            "datasets": [{"data": query_data}],
            "avg": daily_average(sum(query_data)),
        },
    }
def format_icelandic_float(fp_num):
    """ Convert number to Icelandic decimal format.

    The number is formatted with two decimals and digit grouping under
    the LC_NUMERIC locale set up by changedlocale(), any space in the
    output is replaced by a period, and the result is passed through
    strip_trailing_zeros(). """
    with changedlocale(category="LC_NUMERIC"):
        grouped = locale.format_string("%.2f", fp_num, grouping=True)
        dotted = grouped.replace(" ", ".")
        return strip_trailing_zeros(dotted)
def chart_stats(session=None, num_days=7):
    """ Return scraping and parsing stats for charts.

    Collects, for each of the num_days most recent UTC days (ending with
    today), the article count per source, the percentage of sentences
    that were parsed, and the query count.

    session is passed on to the underlying queries as enclosing_session.
    If num_days is zero or negative, no days are examined and all
    averages are reported as 0.

    Returns a dict with the keys "scraped", "parsed" and "queries", each
    holding chart "labels", "datasets" and an "avg" per day. """
    # TODO: This should be put in a column in the roots table
    colors = {
        "Kjarninn": "#f17030",
        "RÚV": "#dcdcdc",
        "Vísir": "#3d6ab9",
        "Morgunblaðið": "#020b75",
        "Eyjan": "#ca151c",
        "Kvennablaðið": "#900000",
        "Stundin": "#ee4420",
        "Hringbraut": "#44607a",
        "Fréttablaðið": "#002a61",
        "Hagstofa Íslands": "#818285",
        "DV": "#ed1c24",
    }

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    week_ago = today - timedelta(days=6)
    labels = []
    sources = {}
    parsed_data = []
    query_data = []

    def daily_average(total):
        """ Average total over the period, guarding against an empty period """
        return total / num_days if num_days > 0 else 0

    # Get article count for each source for each day, and query count for each day
    # We change locale to get localized date weekday/month names
    with changedlocale(category="LC_TIME"):
        # Walk from the oldest day in the period up to today
        for days_back in range(num_days - 1, -1, -1):
            start = today - timedelta(days=days_back)
            end = today - timedelta(days=days_back - 1)

            # Generate label: a date for days older than six days,
            # otherwise the capitalized weekday name
            if start < week_ago:
                labels.append(start.strftime("%-d. %b"))
            else:
                labels.append(start.strftime("%A").capitalize())

            # Get article count per source for day
            # Also collect parsing stats for parse % chart
            sent = 0
            parsed = 0
            # NOTE(review): a source's list only grows on days for which the
            # query returns a row for it; presumably every source is
            # returned for every day - confirm against ChartsQuery
            for (name, cnt, s, p) in ChartsQuery.period(
                start, end, enclosing_session=session
            ):
                sources.setdefault(name, []).append(cnt)
                sent += s
                parsed += p
            parsed_data.append(round((parsed / sent) * 100, 2) if sent else 0)

            # Get query count for day
            q = QueriesQuery.period(start, end, enclosing_session=session)
            query_data.append(q[0][0])

    # Create datasets for bar chart, ordered by source name
    datasets = [
        {"label": name, "backgroundColor": colors.get(name, "#000"), "data": counts}
        for name, counts in sorted(sources.items())
    ]
    article_count = sum(sum(counts) for counts in sources.values())

    return {
        "scraped": {
            "labels": labels,
            "datasets": datasets,
            "avg": daily_average(article_count),
        },
        "parsed": {
            "labels": labels,
            "datasets": [{"data": parsed_data}],
            "avg": daily_average(sum(parsed_data)),
        },
        "queries": {
            "labels": labels,
            "datasets": [{"data": query_data}],
            "avg": daily_average(sum(query_data)),
        },
    }
def chart_stats(session=None, num_days=7):
    """ Return scraping and parsing stats for charts.

    Collects, for each of the num_days most recent UTC days (ending with
    today), the article count per source and the percentage of sentences
    that were parsed. session is passed on to the underlying query as
    enclosing_session. Returns a dict with the keys "scraped" and
    "parsed", each holding chart "labels" and "datasets". """
    # TODO: This should be put in a column in the roots table
    colors = {
        "Kjarninn": "#f17030",
        "RÚV": "#dcdcdc",
        "Vísir": "#3d6ab9",
        "Morgunblaðið": "#020b75",
        "Eyjan": "#ca151c",
        "Kvennablaðið": "#900000",
    }

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    labels = []
    sources = {}
    parsed_data = []

    # Get article count for each source for each day, with the locale
    # changed so that weekday/month names are localized
    with changedlocale(category="LC_TIME"):
        # Walk from the oldest day in the period up to today
        for days_back in range(num_days - 1, -1, -1):
            start = today - timedelta(days=days_back)
            end = today - timedelta(days=days_back - 1)

            # Generate label: a date for days older than six days,
            # otherwise the weekday name
            is_old = start < today - timedelta(days=6)
            labels.append(start.strftime("%-d. %b" if is_old else "%A"))

            # Get article count per source for day
            # Also collect parsing stats
            sent = 0
            parsed = 0
            for (name, cnt, s, p) in ChartsQuery.period(
                start, end, enclosing_session=session
            ):
                sources.setdefault(name, []).append(cnt)
                sent += s
                parsed += p
            parsed_data.append(round((parsed / sent) * 100, 2) if sent else 0)

    # Create datasets for bar chart, ordered by source name
    datasets = [
        {"label": name, "backgroundColor": colors.get(name, "#000"), "data": counts}
        for name, counts in sorted(sources.items())
    ]

    return {
        "scraped": {"labels": labels, "datasets": datasets},
        "parsed": {"labels": labels, "datasets": [{"data": parsed_data}]},
    }
def top_news(topic=None, offset=0, limit=_TOP_NEWS_LENGTH):
    """ Return a list of articles (with a particular topic) in
        chronologically reversed order.

    Only articles from visible roots that have a tree, a non-empty
    heading, at least one sentence and a timestamp that is not in the
    future are included.

    Args:
        topic: Topic identifier to filter by, or None for all topics.
        offset: Number of matching articles to skip.
        limit: Maximum number of articles to return.

    Returns:
        A list of ArticleDisplay objects, newest first. """

    class ArticleDisplay:
        """ Utility class to carry information about an article to the web template """

        def __init__(
            self,
            heading,
            timestamp,
            url,
            uuid,
            num_sentences,
            num_parsed,
            icon,
            localized_date,
            source,
        ):
            self.heading = heading
            self.timestamp = timestamp
            self.url = url
            self.uuid = uuid
            self.num_sentences = num_sentences
            self.num_parsed = num_parsed
            self.icon = icon
            self.localized_date = localized_date
            self.source = source

        @property
        def width(self):
            """ The ratio of parsed sentences to the total number of sentences,
                expressed as a percentage string """
            if self.num_sentences == 0:
                return "0%"
            return "{0}%".format((100 * self.num_parsed) // self.num_sentences)

        @property
        def time(self):
            """ The time of day part (HH:MM) of the timestamp """
            return self.timestamp.isoformat()[11:16]

        @property
        def date(self):
            """ The localized date, with the year added unless it is the current year """
            if datetime.today().year == self.timestamp.year:
                return self.localized_date
            return self.fulldate

        @property
        def fulldate(self):
            """ The localized date followed by the year """
            return self.localized_date + self.timestamp.strftime(" %Y")

    toplist = []

    with SessionContext(commit=True) as session:
        # The '!= None' and '== True' comparisons are intentional: they are
        # applied to column expressions and become part of the query,
        # so they must not be rewritten as 'is not None' / 'is True'
        q = (
            session.query(Article)
            .join(Root)
            .filter(Article.tree != None)  # noqa: E711
            .filter(Article.timestamp != None)  # noqa: E711
            .filter(Article.timestamp <= datetime.utcnow())
            .filter(Article.heading > "")
            .filter(Article.num_sentences > 0)
            .filter(Root.visible == True)  # noqa: E712
        )

        if topic is not None:
            # Filter by topic identifier
            q = q.join(ArticleTopic).join(Topic).filter(Topic.identifier == topic)

        q = q.order_by(desc(Article.timestamp)).offset(offset).limit(limit)

        # Change locale to get localized month names in the date strings
        with changedlocale(category="LC_TIME"):
            for a in q:
                # Instantiate article objects from results
                source = a.root.domain
                toplist.append(
                    ArticleDisplay(
                        heading=a.heading,
                        timestamp=a.timestamp,
                        url=a.url,
                        uuid=a.id,
                        num_sentences=a.num_sentences,
                        num_parsed=a.num_parsed,
                        icon=source + ".png",
                        localized_date=a.timestamp.strftime("%-d. %b"),
                        source=source,
                    )
                )

    return toplist
def test_query_api(client): """ Make various query API calls and validate response. """ c = client google_key = has_google_api_key() # First, make sure nonsensical queries are not answered qstr = {"q": "blergh smergh vlurgh"} r = c.get("/query.api?" + urlencode(qstr)) assert r.content_type.startswith(API_CONTENT_TYPE) assert r.is_json json = r.get_json() assert "valid" in json assert json["valid"] == True assert "error" in json assert "answer" not in json # Person and entity title queries are tested using a dummy database # populated with data from SQL file in tests/files/ # Builtin module: title json = qmcall(c, {"q": "hver er viðar þorsteinsson", "voice": True}, "Person") assert json["voice"].startswith("Viðar Þorsteinsson er ") assert json["voice"].endswith(".") # Builtin module: title json = qmcall(c, {"q": "hver er björn þorsteinsson", "voice": True}, "Person") assert json["voice"].startswith("Björn Þorsteinsson er ") assert json["voice"].endswith(".") # Builtin module: person json = qmcall(c, {"q": "hver er forsætisráðherra", "voice": True}, "Title") assert json["voice"].startswith("Forsætisráðherra er ") assert json["voice"].endswith(".") # Builtin module: person w. title w. 
uppercase name # json = qmcall(c, {"q": "hver er forstjóri sjóvá", "voice": True}, "Title") # assert json["voice"].startswith("Forstjóri") and "Jón Jónsson" in json["voice"] # Builtin module: entities json = qmcall(c, {"q": "hvað er Nox Medical"}, "Entity") assert "nýsköpunarfyrirtæki" in json["answer"].lower() assert json["key"] == "Nox Medical" # Arithmetic module ARITHM_QUERIES = { "hvað er fimm sinnum tólf": "60", "hvað er 12 sinnum 12?": "144", "hvað er nítján plús 3": "22", "hvað er nítján plús þrír": "22", "hvað er nítján + 3": "22", "hvað er 19 + 3": "22", "hvað er 19 + þrír": "22", "hvað er hundrað mínus sautján": "83", "hvað er hundrað - sautján": "83", "hvað er 100 - sautján": "83", "hvað er 100 - 17": "83", "hvað er 17 deilt með fjórum": "4,25", "hvað er 17 / 4": "4,25", "hvað er 18 deilt með þrem": "6", "hvað er 18 / þrem": "6", "hvað er 18 / 3": "6", "hver er kvaðratrótin af 256": "16", "hvað er 12 í þriðja veldi": "1728", "hvað eru tveir í tíunda veldi": "1024", "hvað eru 17 prósent af 20": "3,4", "hvað er 7000 deilt með 812": "8,62", "hvað er þrisvar sinnum sjö": "21", "hvað er fjórðungur af 28": "7", "hvað er einn tuttugasti af 192": "9,6", "reiknaðu 7 sinnum 7": "49", "reiknaðu 7 x 7": "49", "reiknaðu sjö x 7": "49", "reiknaðu nítján x sjö": "133", "geturðu reiknað kvaðratrótina af 9": "3", "hvað er 8900 með vaski": "11.036", "hvað eru 7500 krónur með virðisaukaskatti": "9.300", "hvað er 9300 án vask": "7.500", "hvað er pí deilt með pí": "1", "hvað er pí / pí": "1", "hvað er pí í öðru veldi": "9,87", "hvað er tíu deilt með pí": "3,18", } for q, a in ARITHM_QUERIES.items(): json = qmcall(c, {"q": q}, "Arithmetic") assert json["answer"] == a json = qmcall( c, {"q": "hvað er pí", "client_id": DUMMY_CLIENT_ID, "private": False}, "PI" ) assert "π" in json["answer"] assert "3,14159" in json["answer"] json = qmcall( c, {"q": "hvað er það sinnum tveir", "client_id": DUMMY_CLIENT_ID}, "Arithmetic" ) assert json["answer"].startswith("6,") # Bus module json 
= qmcall( c, {"q": "hvaða stoppistöð er næst mér", "voice": True}, "NearestStop" ) assert json["answer"] == "Fiskislóð" assert json["voice"] == "Næsta stoppistöð er Fiskislóð; þangað eru 310 metrar." json = qmcall( c, {"q": "hvenær er von á vagni númer 17", "voice": True, "test": False}, "ArrivalTime", ) assert json["answer"] == "Staðsetning óþekkt" # No location info available # Counting module json = qmcall(c, {"q": "teldu frá einum upp í tíu"}, "Counting") assert json["answer"] == "1…10" json = qmcall(c, {"q": "teldu hratt niður frá 4", "voice": True}, "Counting") assert json["answer"] == "3…0" assert "<break time=" in json["voice"] json = qmcall(c, {"q": "nennirðu að telja niður frá 24", "voice": True}, "Counting") assert json["answer"] == "23…0" json = qmcall(c, {"q": "teldu upp að 5000", "voice": True}, "Counting") assert len(json["voice"]) < 100 # Currency module json = qmcall(c, {"q": "hvert er gengi dönsku krónunnar?"}, "Currency") assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None json = qmcall(c, {"q": "hvað kostar evran"}, "Currency") assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None json = qmcall(c, {"q": "hvað kostar bandaríkjadalur mikið í krónum"}, "Currency") assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None json = qmcall( c, {"q": "Hvert er gengi krónunnar gagnvart dollara í dag?"}, "Currency" ) assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None json = qmcall(c, {"q": "hvert er gengi krónunnar á móti dollara í dag"}, "Currency") assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None json = qmcall(c, {"q": "hvað eru tíu þúsund krónur margir dalir"}, "Currency") assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None json = qmcall(c, {"q": "hvað eru 79 dollarar margar evrur?"}, "Currency") assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None # Date module SPECIAL_DAYS = ( "jólin", "gamlársdagur", "nýársdagur", "hvítasunna", "páskar", "þjóðhátíðardagurinn", "baráttudagur 
verkalýðsins", "öskudagur", "skírdagur", "sumardagurinn fyrsti", "verslunarmannahelgi", "þorláksmessa", "föstudagurinn langi", "menningarnótt", "sjómannadagurinn", "dagur íslenskrar tungu", "annar í jólum", "feðradagur", "mæðradagurinn", ) for d in SPECIAL_DAYS: qstr = "hvenær er " + d json = qmcall(c, {"q": qstr}, "Date") json = qmcall(c, {"q": "hver er dagsetningin?"}, "Date") assert json["answer"].endswith(datetime.now().strftime("%Y")) json = qmcall(c, {"q": "hvaða dagur er í dag?"}, "Date") assert json["answer"].endswith(datetime.now().strftime("%Y")) json = qmcall(c, {"q": "Hvað eru margir dagar til jóla?", "voice": True}, "Date") assert re.search(r"^\d+", json["answer"]) assert "dag" in json["voice"] json = qmcall(c, {"q": "Hvað eru margir dagar í 12. maí?"}, "Date") assert "dag" in json["answer"] or "á morgun" in json["answer"] # Tests to make sure this kind of query isn't caught by the distance module json = qmcall(c, {"q": "Hvað er langt í jólin?"}, "Date") json = qmcall(c, {"q": "Hvað er langt í páska?"}, "Date") now = datetime.utcnow() with changedlocale(category="LC_TIME"): # Today dstr = now.date().strftime("%-d. %B") json = qmcall(c, {"q": "Hvað eru margir dagar í " + dstr}) assert "í dag" in json["answer"] # Tomorrow dstr = (now.date() + timedelta(days=1)).strftime("%-d. 
%B") json = qmcall(c, {"q": "Hvað eru margir dagar í " + dstr}) assert "á morgun" in json["answer"] json = qmcall(c, {"q": "hvaða ár er núna?"}, "Date") assert str(now.year) in json["answer"] json = qmcall(c, {"q": "er hlaupár?"}, "Date") assert str(now.year) in json["answer"] json = qmcall(c, {"q": "er 2020 hlaupár?"}, "Date") assert "var hlaupár" in json["answer"] json = qmcall(c, {"q": "var árið 1999 hlaupár?"}, "Date") assert "ekki hlaupár" in json["answer"] json = qmcall(c, {"q": "hvað eru margir dagar í desember"}, "Date") assert json["answer"].startswith("31") assert "dag" in json["answer"] json = qmcall(c, {"q": "hvað eru margir dagar í febrúar 2024"}, "Date") assert json["answer"].startswith("29") assert "dag" in json["answer"] json = qmcall(c, {"q": "Hvað er langt fram að verslunarmannahelgi"}, "Date") assert re.search(r"^\d+", json["answer"]) # json = qmcall(c, {"q": "hvað er langt liðið frá uppstigningardegi"}, "Date") # assert re.search(r"^\d+", json["answer"]) json = qmcall(c, {"q": "hvenær eru jólin"}, "Date") assert re.search(r"25", json["answer"]) is not None # Dictionary module json = qmcall( c, {"q": "hvernig skilgreinir orðabókin orðið kettlingur"}, "Dictionary" ) assert "kettlingur" in json["answer"].lower() json = qmcall(c, {"q": "flettu upp orðinu skíthæll í orðabók"}, "Dictionary") assert "skíthæll" in json["answer"].lower() assert json["source"] == "Íslensk nútímamálsorðabók" # Distance module # NB: No Google API key on test server if google_key: json = qmcall( c, {"q": "hvað er ég langt frá perlunni", "voice": True}, "Distance" ) assert json["answer"].startswith("3,5 km") assert json["voice"].startswith("Perlan er ") assert json["source"] == "Google Maps" json = qmcall(c, {"q": "hvað er langt í melabúðina", "voice": True}, "Distance") assert json["answer"].startswith("1,") and "km" in json["answer"] assert json["voice"].startswith("Melabúðin er ") json = qmcall( c, {"q": "hvað er ég lengi að ganga í kringluna", "voice": True}, "Distance" ) 
assert json["key"] == "Kringlan" assert "klukkustund" in json["answer"] and " km" in json["answer"] assert json["voice"].startswith("Að ganga") json = qmcall( c, {"q": "hvað tekur langan tíma að keyra til akureyrar"}, "Distance" ) assert json["key"] == "Akureyri" assert "klukkustundir" in json["answer"] and " km" in json["answer"] assert json["answer"].endswith("(389 km).") # Flights module departure_pattern = r"^Flug \w*? til .*? flýgur frá \w*? \d+\. \w*? klukkan \d\d\:\d\d að staðartíma.$" arrival_pattern = r"^Flug \w*? frá .*? lendir [í|á] \w*? \d+\. \w*? klukkan \d\d\:\d\d að staðartíma.$" no_matching_flight_pattern = r"Ekkert flug fannst (frá .*? )?(til .*? )?næstu \d+ daga." json = qmcall( c, {"q": "hvenær fer næsta flug til jfk frá keflavík", "voice": True}, "Flights" ) assert re.search(departure_pattern, json["answer"]) json = qmcall( c, {"q": "hvenær flýgur næsta flug til new york frá keflavík", "voice": True}, "Flights" ) assert re.search(departure_pattern, json["answer"]) json = qmcall( c, {"q": "hvenær flýgur næsta flug af stað frá keflavík", "voice": True}, "Flights", ) assert re.search(departure_pattern, json["answer"]) json = qmcall( c, {"q": "hver er brottfarartími næsta flugs frá keflavík", "voice": True}, "Flights", ) assert re.search(departure_pattern, json["answer"]) json = qmcall( c, {"q": "hver er brottfarartíminn fyrir næsta flug frá keflavík", "voice": True}, "Flights", ) assert re.search(departure_pattern, json["answer"]) json = qmcall( c, {"q": "hvenær lendir næsta flug í keflavík", "voice": True}, "Flights" ) assert re.search(arrival_pattern, json["answer"]) json = qmcall( c, {"q": "hvenær kemur næsta vél á akureyri", "voice": True}, "Flights" ) assert re.search(arrival_pattern, json["answer"]) or re.search( no_matching_flight_pattern, json["answer"] ) # In case no flights to Akureyri json = qmcall( c, {"q": "hvenær mætir næsta vél á vopnafjörð", "voice": True}, "Flights" ) assert re.search(arrival_pattern, json["answer"]) or re.search( 
no_matching_flight_pattern, json["answer"] ) # In case no flights to Vopnafjörður json = qmcall( c, {"q": "hvenær mætir næsta vél til vopnafjarðar", "voice": True}, "Flights" ) assert re.search(arrival_pattern, json["answer"]) or re.search( no_matching_flight_pattern, json["answer"] ) # In case no flights to Vopnafjörður json = qmcall( c, { "q": "hver er lendingartími næstu vélar frá reykjavík til vopnafjarðar", "voice": True, }, "Flights", ) assert re.search(arrival_pattern, json["answer"]) or re.search( no_matching_flight_pattern, json["answer"] ) json = qmcall( c, { "q": "hver er lendingartíminn fyrir næsta flug til reykjavíkur frá akureyri", "voice": True, }, "Flights", ) assert re.search(arrival_pattern, json["answer"]) or re.search( no_matching_flight_pattern, json["answer"] ) json = qmcall( c, {"q": "hvenær fer næsta flug til blabla frá ekkitil", "voice": True}, "Flights", ) assert re.search(no_matching_flight_pattern, json["answer"]) json = qmcall( c, {"q": "hvenær fer næsta flug frá ekkitil til blablab", "voice": True}, "Flights", ) assert re.search(no_matching_flight_pattern, json["answer"]) # Geography module json = qmcall(c, {"q": "hver er höfuðborg spánar", "voice": True}, "Geography") assert json["answer"] == "Madríd" assert "Spánar" in json["voice"] # not 'Spáns', which was a bug json = qmcall(c, {"q": "Hver er höfuðborg taiwan"}, "Geography") assert json["answer"] == "Taípei" json = qmcall(c, {"q": "hver er höfuðborg norður-makedóníu"}, "Geography") assert json["answer"] == "Skopje" json = qmcall(c, {"q": "hver er höfuðborg norður kóreu"}, "Geography") assert json["answer"] == "Pjongjang" # json = qmcall( # c, {"q": "hver er höfuðborg sameinuðu arabísku furstadæmanna"}, "Geography" # ) # assert json["answer"] == "Abú Dabí" json = qmcall(c, {"q": "hvað er höfuðborgin í bretlandi"}, "Geography") assert json["answer"] == "Lundúnir" json = qmcall(c, {"q": "í hvaða landi er jóhannesarborg"}, "Geography") assert json["answer"].endswith("Suður-Afríku") 
json = qmcall(c, {"q": "í hvaða landi er kalifornía"}, "Geography") assert "Bandaríkjunum" in json["answer"] and json["key"] == "Kalifornía" json = qmcall(c, {"q": "í hvaða heimsálfu er míkrónesía"}, "Geography") assert json["answer"].startswith("Eyjaálfu") json = qmcall(c, {"q": "hvar í heiminum er máritanía"}, "Geography") assert "Afríku" in json["answer"] json = qmcall(c, {"q": "hvar er Kaupmannahöfn"}, "Geography") assert "Danmörku" in json["answer"] json = qmcall(c, {"q": "hvar er borgin tókýó"}, "Geography") assert "Japan" in json["answer"] # News module json = qmcall(c, {"q": "Hvað er í fréttum", "voice": True}, "News") assert len(json["answer"]) > 80 # This is always going to be a long answer assert json["voice"].startswith("Í fréttum rúv er þetta helst") json = qmcall(c, {"q": "Hvað er helst í fréttum", "voice": True}, "News") assert len(json["answer"]) > 80 # This is always going to be a long answer assert json["voice"].startswith("Í fréttum rúv er þetta helst") # Opinion module json = qmcall(c, {"q": "hvaða skoðun hefurðu á þriðja orkupakkanum"}, "Opinion") assert json["answer"].startswith("Ég hef enga sérstaka skoðun") assert json["key"] == "þriðji orkupakkinn" json = qmcall( c, {"q": "hvað finnst þér eiginlega um Katrínu Jakobsdóttur"}, "Opinion" ) assert json["answer"].startswith("Ég hef enga sérstaka skoðun") assert json["key"] == "Katrín Jakobsdóttir" json = qmcall(c, {"q": "hver er skoðun þín á blurghsmurgdurg"}, "Opinion") assert json["answer"].startswith("Ég hef enga sérstaka skoðun") assert json["key"] == "blurghsmurgdurg" # Petrol module json = qmcall(c, {"q": "Hvar er næsta bensínstöð", "voice": True}, "Petrol") assert "Ánanaust" in json["answer"] assert "source" in json and json["source"].startswith("Gasvaktin") json = qmcall( c, {"q": "Hvar fæ ég ódýrt bensín í nágrenninu", "voice": True}, "Petrol" ) assert "source" in json and json["source"].startswith("Gasvaktin") json = qmcall(c, {"q": "Hvar fæ ég ódýrasta bensínið"}, "Petrol") assert 
"source" in json and json["source"].startswith("Gasvaktin") json = qmcall(c, {"q": "hvar er bensínið ódýrast"}, "Petrol") assert "source" in json and json["source"].startswith("Gasvaktin") # Places module # NB: No Google API key on test server if google_key: json = qmcall(c, {"q": "Hvað er opið lengi í Melabúðinni"}, "Places") json = qmcall(c, {"q": "Er lokað á Forréttabarnum?"}, "Places") json = qmcall(c, {"q": "Hvenær opnar sundhöllin?"}, "Places") # Random module json = qmcall(c, {"q": "Veldu tölu milli sautján og 30"}, "Random") assert int(json["answer"]) >= 17 and int(json["answer"]) <= 30 json = qmcall(c, {"q": "veldu fyrir mig tölu milli 30 og þrjátíu"}, "Random") assert int(json["answer"]) == 30 json = qmcall(c, {"q": "kastaðu teningi"}, "Random") assert int(json["answer"]) >= 1 and int(json["answer"]) <= 6 json = qmcall(c, {"q": "kastaðu átta hliða teningi"}, "Random") assert int(json["answer"]) >= 1 and int(json["answer"]) <= 8 json = qmcall(c, {"q": "fiskur eða skjaldarmerki"}, "Random") a = json["answer"].lower() assert "fiskur" in a or "skjaldarmerki" in a json = qmcall(c, {"q": "kastaðu peningi"}, "Random") a = json["answer"].lower() assert "fiskur" in a or "skjaldarmerki" in a # Repeat module # NB: Disabled for now. 
# json = qmcall(c, {"q": "segðu setninguna simmi er bjálfi"}, "Parrot") # assert json["answer"] == "Simmi er bjálfi" # assert json["q"] == "Segðu setninguna „Simmi er bjálfi.“" json = qmcall(c, {"q": "segðu eitthvað skemmtilegt"}) assert json["qtype"] != "Parrot" # Schedules module json = qmcall(c, {"q": "hvað er í sjónvarpinu núna", "voice": True}, "Schedule") assert json["key"] == "TelevisionSchedule" json = qmcall(c, {"q": "Hvaða þáttur er eiginlega á rúv núna"}, "Schedule") assert json["key"] == "TelevisionSchedule" json = qmcall(c, {"q": "hvað er í sjónvarpinu í kvöld?"}, "Schedule") assert json["key"] == "TelevisionEvening" json = qmcall(c, {"q": "hver er sjónvarpsdagskráin í kvöld?"}, "Schedule") assert json["key"] == "TelevisionEvening" # json = qmcall(c, {"q": "hvað er í útvarpinu núna?"}, "Schedule") # assert json["qkey"] == "RadioSchedule" # json = qmcall(c, {"q": "hvað er eiginlega í gangi á rás eitt?"}, "Schedule") # assert json["qkey"] == "RadioSchedule" # json = qmcall(c, {"q": "hvað er á dagskrá á rás tvö?"}, "Schedule") # assert json["qkey"] == "RadioSchedule" # Special module json = qmcall(client, {"q": "Hver er sætastur?", "voice": True}, "Special") assert json["answer"] == "Tumi Þorsteinsson." assert json["voice"] == "Tumi Þorsteinsson er langsætastur." 
json = qmcall(client, {"q": "Hver er tilgangur lífsins?"}, "Special") assert json["answer"].startswith("42") # Stats module json = qmcall(c, {"q": "hversu marga einstaklinga þekkirðu?"}, "Stats") json = qmcall(c, {"q": "Hversu mörgum spurningum hefur þú svarað?"}, "Stats") json = qmcall(c, {"q": "hvað ertu aðallega spurð um?"}, "Stats") json = qmcall(c, {"q": "hvaða fólk er mest í fréttum"}, "Stats") # Telephone module json = qmcall(c, {"q": "Hringdu í síma 6 9 9 2 4 2 2"}, "Telephone") assert "open_url" in json assert json["open_url"] == "tel:6992422" assert json["q"].endswith("6992422") json = qmcall(c, {"q": "hringdu fyrir mig í númerið 69 92 42 2"}, "Telephone") assert "open_url" in json assert json["open_url"] == "tel:6992422" assert json["q"].endswith("6992422") json = qmcall(c, {"q": "vinsamlegast hringdu í 921-7422"}, "Telephone") assert "open_url" in json assert json["open_url"] == "tel:9217422" assert json["q"].endswith("9217422") json = qmcall(c, {"q": "hringdu í 26"}, "Telephone") assert "ekki gilt símanúmer" in json["answer"] # Time module json = qmcall(c, {"q": "hvað er klukkan í Kaupmannahöfn?", "voice": True}, "Time") assert json["key"] == "Europe/Copenhagen" assert re.search(r"^\d\d:\d\d$", json["answer"]) json = qmcall(c, {"q": "Hvað er klukkan núna", "voice": True}, "Time") assert json["key"] == "Atlantic/Reykjavik" assert re.search(r"^\d\d:\d\d$", json["answer"]) assert json["voice"].startswith("Klukkan er") json = qmcall(c, {"q": "Hvað er klukkan í Japan?", "voice": True}, "Time") assert json["key"] == "Asia/Tokyo" assert re.search(r"^\d\d:\d\d$", json["answer"]) assert json["voice"].lower().startswith("klukkan í japan er") # Unit module json = qmcall(c, {"q": "Hvað eru margir metrar í mílu?"}, "Unit") assert json["answer"] == "1.610 metrar" json = qmcall(c, {"q": "hvað eru margar sekúndur í tveimur dögum?"}, "Unit") assert json["answer"] == "173.000 sekúndur" json = qmcall(c, {"q": "hvað eru tíu steinar mörg kíló?"}, "Unit") assert 
json["answer"] == "63,5 kíló" json = qmcall(c, {"q": "hvað eru sjö vökvaúnsur margir lítrar"}, "Unit") assert json["answer"] == "0,21 lítrar" json = qmcall(c, {"q": "hvað eru 18 merkur mörg kíló"}, "Unit") assert json["answer"] == "4,5 kíló" json = qmcall(c, {"q": "hvað eru mörg korter í einum degi"}, "Unit") assert json["answer"].startswith("96") json = qmcall(c, {"q": "hvað eru margar mínútur í einu ári"}, "Unit") assert json["answer"].startswith("526.000 mínútur") # User info module json = qmcall( c, {"q": "ég heiti Gunna Jónsdóttir", "client_id": DUMMY_CLIENT_ID}, "UserInfo", ) assert json["answer"].startswith("Sæl og blessuð") and "Gunna" in json["answer"] json = qmcall(c, {"q": "hvað heiti ég", "client_id": DUMMY_CLIENT_ID}) assert "Gunna Jónsdóttir" in json["answer"] json = qmcall( c, {"q": "Nafn mitt er Gunnar", "client_id": DUMMY_CLIENT_ID}, "UserInfo" ) assert json["answer"].startswith("Sæll og blessaður") and "Gunnar" in json["answer"] json = qmcall( c, {"q": "veistu hvað ég heiti", "client_id": DUMMY_CLIENT_ID}, "UserInfo" ) assert json["answer"].startswith("Þú heitir Gunnar") json = qmcall(c, {"q": "ég heiti Boutros Boutros-Ghali"}, "UserInfo") assert json["answer"].startswith("Gaman að kynnast") and "Boutros" in json["answer"] json = qmcall( c, { "q": "hvaða útgáfu er ég að keyra", "client_type": "ios", "client_version": "1.1.0", "voice": True, }, ) assert "iOS" in json["answer"] and "1.1.0" in json["answer"] assert "komma" in json["voice"] json = qmcall(c, {"q": "á hvaða tæki ertu að keyra?", "client_type": "ios"}) assert "iOS" in json["answer"] # json = qmcall( # c, # {"q": "ég á heima á öldugötu 4 í reykjavík", "client_id": DUMMY_CLIENT_ID}, # "UserInfo", # ) # assert json["answer"].startswith("Gaman að kynnast") and "Boutros" in json["answer"] # json = qmcall(c, {"q": "hvar á ég heima"}, "UserInfo") # assert json["answer"].startswith("Gaman að kynnast") and "Boutros" in json["answer"] # json = qmcall(c, {"q": "ég á heima á Fiskislóð 31"}, 
"UserInfo") # assert json["answer"].startswith("Gaman að kynnast") and "Boutros" in json["answer"] # json = qmcall(c, {"q": "hvar bý ég eiginlega"}, "UserInfo") # assert json["answer"].startswith("Gaman að kynnast") and "Boutros" in json["answer"] # User location module # NB: No Google API key on test server if google_key: json = qmcall(c, {"q": "Hvar er ég"}, "UserLocation") assert "Fiskislóð 31" in json["answer"] json = qmcall( c, {"q": "Hvar í heiminum er ég eiginlega staddur?"}, "UserLocation" ) assert "Fiskislóð 31" in json["answer"] # Weather module json = qmcall(c, {"q": "hvernig er veðrið í Reykjavík?"}, "Weather") assert re.search(r"^\-?\d+ °C", json["answer"]) is not None json = qmcall(c, {"q": "Hversu hlýtt er úti?"}, "Weather") assert re.search(r"^\-?\d+ °C", json["answer"]) is not None json = qmcall(c, {"q": "hversu kalt er í dag?"}, "Weather") assert re.search(r"^\-?\d+ °C", json["answer"]) is not None json = qmcall(c, {"q": "hver er veðurspáin?"}, "Weather") json = qmcall(c, {"q": "hver er veðurspáin fyrir morgundaginn"}, "Weather") assert len(json["answer"]) > 20 and "." 
in json["answer"] # Wikipedia module json = qmcall(c, {"q": "Hvað segir wikipedia um Jón Leifs?"}, "Wikipedia") assert "Wikipedía" in json["q"] # Make sure it's being beautified assert "tónskáld" in json["answer"] assert "source" in json and "wiki" in json["source"].lower() json = qmcall(c, {"q": "hvað segir vikipedija um jóhann sigurjónsson"}, "Wikipedia") assert "Jóhann" in json["answer"] json = qmcall(c, {"q": "fræddu mig um berlín"}, "Wikipedia") assert "Berlín" in json["answer"] json = qmcall( c, { "q": "katrín Jakobsdóttir í vikipediju", "client_id": DUMMY_CLIENT_ID, "private": False, }, "Wikipedia", ) assert "Katrín Jakobsdóttir" in json["answer"] json = qmcall( c, {"q": "hvað segir wikipedía um hana", "client_id": DUMMY_CLIENT_ID}, "Wikipedia", ) assert "Katrín Jakobsdóttir" in json["answer"] # Words module json = qmcall( c, {"q": "hvernig stafar maður orðið hestur", "voice": True}, "Spelling" ) assert json["answer"] == "H E S T U R" assert json["voice"].startswith("Orðið „hestur“ ") json = qmcall(c, {"q": "hvernig beygist orðið maður", "voice": True}, "Declension") assert json["answer"].lower() == "maður, mann, manni, manns" assert json["voice"].startswith("Orðið „maður“") json = qmcall(c, {"q": "hvernig beygir maður nafnorðið splorglobb?", "voice": True}) assert json["voice"].startswith("Nafnorðið „splorglobb“ fannst ekki") # Yule lads module json = qmcall( c, {"q": "hvenær kemur fyrsti jólasveinninn til byggða", "voice": True}, "YuleLads", ) # Delete any queries or query data logged as result of these tests with SessionContext(commit=True) as session: session.execute( Query.table().delete().where(Query.client_id == DUMMY_CLIENT_ID) ) session.execute( QueryData.table().delete().where(QueryData.client_id == DUMMY_CLIENT_ID) )
def _process_result(result: Result) -> Dict[str, str]:
    """
    Build the answer for a flight query from the parsed query result.

    Reads the "departure" flag and the optional "from_loc", "to_loc",
    "day_count", "from_date", "to_date" and "flight_count" entries of
    the result, obtains the flight list (from the cache if present,
    otherwise from the API) and filters it.

    Returns the dict produced by _format_flight_answer() if any flight
    remains after filtering; otherwise a dict with identical "answer"
    and "voice" entries stating that no flight was found.
    """
    departing: bool = result["departure"]

    # The API airport is the origin of departures and the destination of
    # arrivals (Keflavík by default); the other end defaults to the
    # wildcard "*", which matches any flight
    if departing:
        api_key, other_key = "from_loc", "to_loc"
    else:
        api_key, other_key = "to_loc", "from_loc"
    # Always an Icelandic airport, as the ISAVIA API only covers them
    api_airport: str = result.get(api_key, "keflavík").lower()
    # Icelandic or foreign airport/country
    airport: str = result.get(other_key, "*").lower()

    # Check 5 days into the future by default
    days: int = result.get("day_count", 5)
    from_date: datetime = result.get("from_date", datetime.now(timezone.utc))
    to_date: datetime = result.get(
        "to_date", datetime.now(timezone.utc) + timedelta(days=days)
    )

    def _normalized(name: str) -> str:
        """ Expand a location abbreviation and convert to nominative case """
        name = _LOCATION_ABBREV_MAP.get(name, name)
        return NounPhrase(name).nominative or name

    airport = _normalized(airport)
    api_airport = _normalized(api_airport)

    # Translate Icelandic airport to its IATA code
    iata_code: str = _AIRPORT_TO_IATA_MAP.get(api_airport, api_airport)

    # TODO: Currently module only fetches one flight,
    # modifications to the grammar could allow fetching of more flights at once
    flight_count: int = result.get("flight_count", 1)

    # Use the cached flight list for this direction if there is one,
    # otherwise fetch the data from the API
    flights: FlightList = (
        _FLIGHT_CACHE[departing]
        if departing in _FLIGHT_CACHE
        else _fetch_flight_data(from_date, to_date, iata_code, departing)
    )
    flights = _filter_flight_data(flights, airport, api_airport, flight_count)

    if len(flights) > 0:
        # (Format month names in Icelandic)
        with changedlocale(category="LC_TIME"):
            formatted: Dict[str, str] = _format_flight_answer(flights)
        return formatted

    # No matching flight: describe the route that was searched
    from_airp: str
    to_airp: str
    if departing:
        from_airp, to_airp = api_airport, airport
    else:
        from_airp, to_airp = airport, api_airport

    to_airp = icelandic_city_name(capitalize_placename(to_airp))
    from_airp = icelandic_city_name(capitalize_placename(from_airp))

    # "frá" is followed by the dative form, "til" by the genitive form
    from_airp = NounPhrase(from_airp).dative or from_airp
    to_airp = NounPhrase(to_airp).genitive or to_airp

    if from_airp == "*":
        message = f"Ekkert flug fannst til {to_airp} næstu {days} daga."
    elif to_airp == "*":
        message = f"Ekkert flug fannst frá {from_airp} næstu {days} daga."
    else:
        message = (
            f"Ekkert flug fannst "
            f"frá {from_airp} "
            f"til {to_airp} "
            f"næstu {days} daga."
        )
    return {"answer": message, "voice": message}
def test_query_api(client):
    """
    Make various query API calls and validate the responses.

    Each query is sent through qmcall() using the given client, and the
    returned dict (bound to the local name json) is checked: its "qtype"
    must name the expected query module, and its "answer", "voice", "key",
    "q", "source" or "open_url" entries are checked where applicable.
    The checks are grouped by query module, in the order of the section
    comments below.

    client: the object handed to qmcall() for issuing the requests
        (presumably the test client fixture; confirm against conftest).
    """

    c = client

    # Arithmetic module
    ARITHM_QUERIES = {
        "hvað er fimm sinnum tólf": "60",
        "hvað er 12 sinnum 12?": "144",
        "hvað er nítján plús 3": "22",
        "hvað er hundrað mínus sautján": "83",
        "hvað er 17 deilt með fjórum": "4,25",
        "hver er kvaðratrótin af 256": "16",
        "hvað er 12 í þriðja veldi": "1728",
        "hvað eru tveir í tíunda veldi": "1024",
        "hvað eru 17 prósent af 20": "3,4",
        "hvað er 7000 deilt með 812": "8,62",
        "hvað er þrisvar sinnum sjö": "21",
        "hvað er fjórðungur af 28": "7",
        "hvað er einn tuttugasti af 192": "9,6",
        "reiknaðu 7 sinnum 7": "49",
        "geturðu reiknað kvaðratrótina af 9": "3",
        "hvað er 8900 með vaski": "11.036",
        "hvað eru 7500 krónur með virðisaukaskatti": "9.300",
    }

    for q, a in ARITHM_QUERIES.items():
        json = qmcall(c, {"q": q, "voice": True})
        assert json["qtype"] == "Arithmetic"
        assert json["answer"] == a

    json = qmcall(
        c, {"q": "hvað er pí", "client_id": DUMMY_CLIENT_ID, "private": False}
    )
    assert "π" in json["answer"]
    assert json["qtype"] == "PI"
    assert "3,14159" in json["answer"]

    # Follow-up query referring to the previous answer ("það")
    json = qmcall(
        c,
        {
            "q": "hvað er það sinnum tveir",
            "client_id": DUMMY_CLIENT_ID,
            "private": False,
        },
    )
    assert json["qtype"] == "Arithmetic"
    assert json["answer"].startswith("6,")

    # Person and entity title queries are tested using a dummy database
    # populated with data from CSV files stored in tests/test_files/testdb_*.csv

    # Builtin module: title
    json = qmcall(c, {"q": "hver er viðar þorsteinsson", "voice": True})
    assert json["qtype"] == "Person"
    assert json["voice"].startswith("Viðar Þorsteinsson er ")
    assert json["voice"].endswith(".")

    # Builtin module: title
    json = qmcall(c, {"q": "hver er björn þorsteinsson", "voice": True})
    assert json["qtype"] == "Person"
    assert json["voice"].startswith("Björn Þorsteinsson er ")
    assert json["voice"].endswith(".")

    # Builtin module: person
    json = qmcall(c, {"q": "hver er forsætisráðherra", "voice": True})
    assert json["qtype"] == "Title"
    assert json["voice"].startswith("Forsætisráðherra er ")
    assert json["voice"].endswith(".")

    # Bus module
    json = qmcall(c, {"q": "hvaða stoppistöð er næst mér", "voice": True})
    assert json["qtype"] == "NearestStop"
    assert json["answer"] == "Fiskislóð"
    assert json["voice"] == "Næsta stoppistöð er Fiskislóð; þangað eru 310 metrar."

    json = qmcall(
        c, {"q": "hvenær er von á vagni númer 17", "voice": True, "test": False}
    )
    assert json["qtype"] == "ArrivalTime"
    assert json["answer"] == "Staðsetning óþekkt"  # No location info available

    # Counting module
    json = qmcall(c, {"q": "teldu frá einum upp í tíu"})
    assert json["qtype"] == "Counting"
    assert json["answer"] == "1…10"

    json = qmcall(c, {"q": "teldu hratt niður frá 4", "voice": True})
    assert json["qtype"] == "Counting"
    assert json["answer"] == "3…0"
    assert "<break time=" in json["voice"]

    json = qmcall(c, {"q": "teldu upp að 5000", "voice": True})
    assert json["qtype"] == "Counting"
    assert len(json["voice"]) < 100

    # Currency module
    json = qmcall(c, {"q": "Hvert er gengi dönsku krónunnar?"})
    assert json["qtype"] == "Currency"
    assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None

    json = qmcall(c, {"q": "hvað kostar evran"})
    assert json["qtype"] == "Currency"
    assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None

    json = qmcall(c, {"q": "Hvert er gengi krónunnar gagnvart dollara í dag?"})
    assert json["qtype"] == "Currency"
    assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None

    json = qmcall(c, {"q": "hvað eru tíu þúsund krónur margir dollarar"})
    assert json["qtype"] == "Currency"
    assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None

    json = qmcall(c, {"q": "hvað eru 79 dollarar margir evrur?"})
    assert json["qtype"] == "Currency"
    assert re.search(r"^\d+(,\d+)?$", json["answer"]) is not None

    # Date module
    SPECIAL_DAYS = (
        "jólin",
        "gamlársdagur",
        "nýársdagur",
        "hvítasunna",
        "páskar",
        "þjóðhátíðardagurinn",
        "baráttudagur verkalýðsins",
        "öskudagur",
        "skírdagur",
        "sumardagurinn fyrsti",
        "verslunarmannahelgi",
        "þorláksmessa",
        "föstudagurinn langi",
        "menningarnótt",
        "sjómannadagurinn",
        "dagur íslenskrar tungu",
        "annar í jólum",
        "feðradagur",
        "mæðradagurinn",
    )

    for d in SPECIAL_DAYS:
        qstr = "hvenær er " + d
        json = qmcall(c, {"q": qstr})
        assert json["qtype"] == "Date"

    json = qmcall(c, {"q": "Hver er dagsetningin?"})
    assert json["qtype"] == "Date"
    assert json["answer"].endswith(datetime.now().strftime("%Y"))

    json = qmcall(c, {"q": "Hvað eru margir dagar til jóla?", "voice": True})
    assert json["qtype"] == "Date"
    assert re.search(r"^\d+", json["answer"])
    assert "dag" in json["voice"]

    json = qmcall(c, {"q": "Hvað eru margir dagar í 12. maí?"})
    assert json["qtype"] == "Date"
    # Accept either a day count or "á morgun" (presumably the answer given
    # on the day before the date asked about)
    assert "dag" in json["answer"] or "á morgun" in json["answer"]

    now = datetime.utcnow()

    # Month names must be formatted in Icelandic to build the query strings
    with changedlocale(category="LC_TIME"):
        # Today
        dstr = now.date().strftime("%-d. %B")
        json = qmcall(c, {"q": "Hvað eru margir dagar í " + dstr})
        assert "í dag" in json["answer"]
        # Tomorrow
        dstr = (now.date() + timedelta(days=1)).strftime("%-d. %B")
        json = qmcall(c, {"q": "Hvað eru margir dagar í " + dstr})
        assert "á morgun" in json["answer"]

    json = qmcall(c, {"q": "hvaða ár er núna?"})
    assert json["qtype"] == "Date"
    assert str(now.year) in json["answer"]

    json = qmcall(c, {"q": "er hlaupár?"})
    assert json["qtype"] == "Date"
    assert str(now.year) in json["answer"]

    json = qmcall(c, {"q": "er 2020 hlaupár?"})
    assert json["qtype"] == "Date"
    assert "er hlaupár" in json["answer"]

    json = qmcall(c, {"q": "var árið 1999 hlaupár?"})
    assert json["qtype"] == "Date"
    assert "ekki hlaupár" in json["answer"]

    json = qmcall(c, {"q": "hvað eru margir dagar í desember"})
    assert json["qtype"] == "Date"
    assert json["answer"].startswith("31")
    assert "dag" in json["answer"]

    json = qmcall(c, {"q": "hvað eru margir dagar í febrúar 2024"})
    assert json["qtype"] == "Date"
    assert json["answer"].startswith("29")
    assert "dag" in json["answer"]

    json = qmcall(c, {"q": "Hvað er langt fram að verslunarmannahelgi"})
    assert json["qtype"] == "Date"
    assert re.search(r"^\d+", json["answer"])

    # json = qmcall(c, {"q": "hvað er langt liðið frá uppstigningardegi"})
    # assert json["qtype"] == "Date"
    # assert re.search(r"^\d+", json["answer"])

    json = qmcall(c, {"q": "hvenær eru jólin"})
    assert json["qtype"] == "Date"
    assert re.search(r"25", json["answer"]) is not None

    # Distance module
    # NB: No Google API key on test server
    # json = qmcall(c, {"q": "Hvað er ég langt frá Perlunni", "voice": True})
    # assert json["qtype"] == "Distance"
    # assert json["answer"].startswith("3,5 km")
    # assert json["voice"].startswith("Perlan er ")
    # assert json["source"] == "Google Maps"

    # json = qmcall(c, {"q": "hvað er langt í melabúðina", "voice": True})
    # assert json["qtype"] == "Distance"
    # assert json["answer"].startswith("1,4 km")
    # assert json["voice"].startswith("Melabúðin er ")

    # Flights module
    # TODO: Implement me!

    # Geography module
    json = qmcall(c, {"q": "Hver er höfuðborg Spánar?"})
    assert json["qtype"] == "Geography"
    assert json["answer"] == "Madríd"

    json = qmcall(c, {"q": "hver er höfuðborg norður-makedóníu?"})
    assert json["qtype"] == "Geography"
    assert json["answer"] == "Skopje"

    json = qmcall(c, {"q": "Hvað er höfuðborgin í Bretlandi"})
    assert json["qtype"] == "Geography"
    assert json["answer"] == "Lundúnir"

    json = qmcall(c, {"q": "Í hvaða landi er Jóhannesarborg?"})
    assert json["qtype"] == "Geography"
    assert json["answer"].endswith("Suður-Afríku")

    json = qmcall(c, {"q": "Í hvaða heimsálfu er míkrónesía?"})
    assert json["qtype"] == "Geography"
    assert json["answer"].startswith("Eyjaálfu")

    json = qmcall(c, {"q": "Hvar er máritanía?"})
    assert json["qtype"] == "Geography"
    assert "Afríku" in json["answer"]

    json = qmcall(c, {"q": "Hvar er Kaupmannahöfn?"})
    assert json["qtype"] == "Geography"
    assert "Danmörku" in json["answer"]

    # Intro module
    json = qmcall(c, {"q": "ég heiti Gunna"})
    assert json["qtype"] == "Introduction"
    assert json["answer"].startswith("Sæl og blessuð")

    json = qmcall(c, {"q": "ég heiti Gunnar"})
    assert json["qtype"] == "Introduction"
    assert json["answer"].startswith("Sæll og blessaður")

    json = qmcall(c, {"q": "ég heiti Boutros Boutros-Ghali"})
    assert json["qtype"] == "Introduction"
    assert json["answer"].startswith("Gaman að kynnast") and "Boutros" in json["answer"]

    # Location module
    # NB: No Google API key on test server
    # json = qmcall(c, {"q": "Hvar er ég", "latitude": 64.15673429618045, "longitude": -21.9511777069624})
    # assert json["qtype"] == "Location"
    # assert json["answer"].startswith("Fiskislóð 31")

    # News module
    json = qmcall(c, {"q": "Hvað er í fréttum", "voice": True})
    assert json["qtype"] == "News"
    assert len(json["answer"]) > 80  # This is always going to be a long answer
    assert json["voice"].startswith("Í fréttum rúv er þetta helst")

    # Opinion module
    json = qmcall(c, {"q": "Hvað finnst þér um loftslagsmál?"})
    assert json["qtype"] == "Opinion"
    assert json["answer"].startswith("Ég hef enga sérstaka skoðun")

    json = qmcall(c, {"q": "hvaða skoðun hefurðu á þriðja orkupakkanum"})
    assert json["qtype"] == "Opinion"
    assert json["answer"].startswith("Ég hef enga sérstaka skoðun")

    # Petrol module
    json = qmcall(c, {"q": "Hvar er næsta bensínstöð", "voice": True})
    assert json["qtype"] == "Petrol"
    assert "Ánanaust" in json["answer"]
    assert "source" in json and json["source"].startswith("Gasvaktin")

    json = qmcall(c, {"q": "Hvar fæ ég ódýrt bensín í nágrenninu", "voice": True})
    assert json["qtype"] == "Petrol"
    assert "source" in json and json["source"].startswith("Gasvaktin")

    json = qmcall(c, {"q": "Hvar fæ ég ódýrasta bensínið"})
    assert json["qtype"] == "Petrol"
    assert "source" in json and json["source"].startswith("Gasvaktin")

    # Places module
    # TODO: Implement me!

    # Random module
    json = qmcall(c, {"q": "Veldu tölu milli sautján og 30"})
    assert json["qtype"] == "Random"
    assert int(json["answer"]) >= 17 and int(json["answer"]) <= 30

    json = qmcall(c, {"q": "kastaðu teningi"})
    assert json["qtype"] == "Random"
    assert int(json["answer"]) >= 1 and int(json["answer"]) <= 6

    json = qmcall(c, {"q": "kastaðu átta hliða teningi"})
    assert json["qtype"] == "Random"
    assert int(json["answer"]) >= 1 and int(json["answer"]) <= 8

    json = qmcall(c, {"q": "fiskur eða skjaldarmerki"})
    assert json["qtype"] == "Random"
    a = json["answer"].lower()
    assert "fiskur" in a or "skjaldarmerki" in a

    json = qmcall(c, {"q": "kastaðu peningi"})
    assert json["qtype"] == "Random"
    a = json["answer"].lower()
    assert "fiskur" in a or "skjaldarmerki" in a

    # Special module
    json = qmcall(client, {"q": "Hver er sætastur?", "voice": True})
    assert json["qtype"] == "Special"
    assert json["answer"] == "Tumi Þorsteinsson."
    assert json["voice"] == "Tumi Þorsteinsson er langsætastur."

    # Stats module
    json = qmcall(c, {"q": "hversu marga einstaklinga þekkirðu?"})
    assert json["qtype"] == "Stats"

    json = qmcall(c, {"q": "Hversu mörgum spurningum hefur þú svarað?"})
    assert json["qtype"] == "Stats"

    json = qmcall(c, {"q": "hvað ertu aðallega spurð um?"})
    assert json["qtype"] == "Stats"

    # Telephone module
    json = qmcall(c, {"q": "Hringdu í síma 6 9 9 2 4 2 2"})
    assert json["qtype"] == "Telephone"
    assert "open_url" in json
    assert json["open_url"] == "tel:6992422"
    assert json["q"].endswith("6992422")

    json = qmcall(c, {"q": "hringdu fyrir mig í númerið 69 92 42 2"})
    assert json["qtype"] == "Telephone"
    assert "open_url" in json
    assert json["open_url"] == "tel:6992422"
    assert json["q"].endswith("6992422")

    json = qmcall(c, {"q": "vinsamlegast hringdu í 921-7422"})
    assert json["qtype"] == "Telephone"
    assert "open_url" in json
    assert json["open_url"] == "tel:9217422"
    assert json["q"].endswith("9217422")

    # Time module
    json = qmcall(c, {"q": "hvað er klukkan í Kaupmannahöfn?", "voice": True})
    assert json["qtype"] == "Time"
    assert json["key"] == "Europe/Copenhagen"
    assert re.search(r"^\d\d:\d\d$", json["answer"])

    json = qmcall(c, {"q": "Hvað er klukkan núna", "voice": True})
    assert json["qtype"] == "Time"
    assert json["key"] == "Atlantic/Reykjavik"
    assert re.search(r"^\d\d:\d\d$", json["answer"])
    assert json["voice"].startswith("Klukkan er")

    json = qmcall(c, {"q": "Hvað er klukkan í Japan?", "voice": True})
    assert json["qtype"] == "Time"
    assert json["key"] == "Asia/Tokyo"
    assert re.search(r"^\d\d:\d\d$", json["answer"])
    assert json["voice"].lower().startswith("klukkan í japan er")

    # Schedules module
    json = qmcall(c, {"q": "hvað er í sjónvarpinu núna", "voice": True})
    assert json["qtype"] == "Schedule"

    json = qmcall(c, {"q": "Hvaða þáttur er eiginlega á rúv núna"})
    assert json["qtype"] == "Schedule"

    json = qmcall(c, {"q": "hvað er í sjónvarpinu í kvöld?"})
    assert json["qtype"] == "Schedule"

    json = qmcall(c, {"q": "hver er sjónvarpsdagskráin í kvöld?"})
    assert json["qtype"] == "Schedule"

    # json = qmcall(c, {"q": "hvað er eiginlega í gangi á rás eitt?"})
    # assert json["qtype"] == "Schedule"

    # json = qmcall(c, {"q": "hvað er á daskrá á rás 2?"})
    # assert json["qtype"] == "Schedule"

    # json = qmcall(c, {"q": "Hvað er í sjónvarpinu núna í kvöld?"})
    # assert json["qtype"] == "TelevisionEvening"

    # Unit module
    json = qmcall(c, {"q": "Hvað eru margir metrar í mílu?"})
    assert json["qtype"] == "Unit"
    assert json["answer"] == "1.610 metrar"

    json = qmcall(c, {"q": "hvað eru margar sekúndur í tveimur dögum?"})
    assert json["qtype"] == "Unit"
    assert json["answer"] == "173.000 sekúndur"

    json = qmcall(c, {"q": "hvað eru tíu steinar mörg kíló?"})
    assert json["qtype"] == "Unit"
    assert json["answer"] == "63,5 kíló"

    json = qmcall(c, {"q": "hvað eru sjö vökvaúnsur margir lítrar"})
    assert json["qtype"] == "Unit"
    assert json["answer"] == "0,21 lítrar"

    json = qmcall(c, {"q": "hvað eru 18 merkur mörg kíló"})
    assert json["qtype"] == "Unit"
    assert json["answer"] == "4,5 kíló"

    json = qmcall(c, {"q": "hvað eru mörg korter í einum degi"})
    assert json["qtype"] == "Unit"
    assert json["answer"].startswith("96")

    json = qmcall(c, {"q": "hvað eru margar mínútur í einu ári"})
    assert json["qtype"] == "Unit"
    assert json["answer"].startswith("526.000 mínútur")

    # Weather module
    json = qmcall(c, {"q": "hvernig er veðrið í Reykjavík?"})
    assert json["qtype"] == "Weather"
    assert re.search(r"^\-?\d+°", json["answer"]) is not None

    json = qmcall(c, {"q": "Hversu hlýtt er úti?"})
    assert json["qtype"] == "Weather"
    assert re.search(r"^\-?\d+°", json["answer"]) is not None

    json = qmcall(c, {"q": "hver er veðurspáin fyrir morgundaginn"})
    assert json["qtype"] == "Weather"
    assert len(json["answer"]) > 20 and "." in json["answer"]

    # Wikipedia module
    json = qmcall(c, {"q": "Hvað segir wikipedia um Jón Leifs?"})
    assert json["qtype"] == "Wikipedia"
    assert "Wikipedía" in json["q"]  # Make sure it's being beautified
    assert "tónskáld" in json["answer"]
    assert "source" in json and "wiki" in json["source"].lower()

    json = qmcall(c, {"q": "hvað segir vikipedija um jóhann sigurjónsson"})
    assert json["qtype"] == "Wikipedia"
    assert "Jóhann" in json["answer"]

    json = qmcall(c, {"q": "fræddu mig um berlín"})
    assert json["qtype"] == "Wikipedia"
    assert "Berlín" in json["answer"]

    json = qmcall(
        c,
        {
            "q": "katrín Jakobsdóttir í vikipediju",
            "client_id": DUMMY_CLIENT_ID,
            "private": False,
        },
    )
    assert json["qtype"] == "Wikipedia"
    assert "Katrín Jakobsdóttir" in json["answer"]

    # Follow-up query referring to the previous subject ("hana")
    json = qmcall(
        c,
        {
            "q": "hvað segir wikipedía um hana",
            "client_id": DUMMY_CLIENT_ID,
            "private": False,
        },
    )
    assert json["qtype"] == "Wikipedia"
    assert "Katrín Jakobsdóttir" in json["answer"]

    # Words module
    json = qmcall(c, {"q": "hvernig stafar maður orðið hestur", "voice": True})
    assert json["qtype"] == "Spelling"
    assert json["answer"] == "H E S T U R"
    assert json["voice"].startswith("Orðið 'hestur'")

    json = qmcall(c, {"q": "hvernig beygist orðið maður", "voice": True})
    assert json["qtype"] == "Declension"
    assert json["answer"].lower() == "maður, mann, manni, manns"
    assert json["voice"].startswith("Orðið 'maður'")
def wordfreq():
    """
    Return word frequency chart data for a given time period.

    Reads the "date_from" and "date_to" request arguments (YYYY-MM-DD)
    and the "words" argument, resolves each word to a (lemma, category)
    pair, builds the chart labels (one per week or one per day, depending
    on the length of the period) and looks up a frequency dataset for
    each word.

    The return value is whatever better_jsonify() produces. The payload
    consists of only err=True if a date cannot be parsed or if no words
    are given; otherwise err=False along with the "data" and "words"
    entries.
    """
    payload: Dict[str, Any] = dict(err=True)

    # Create datetime objects from query string args
    iso_fmt = "%Y-%m-%d"
    try:
        date_from = datetime.strptime(request.args.get("date_from", ""), iso_fmt)
        date_to = datetime.strptime(request.args.get("date_to", ""), iso_fmt)
    except Exception as e:
        logging.warning("Failed to parse date arg: {0}".format(e))
        return better_jsonify(**payload)

    # Words param should contain one or more comma-separated word
    # lemmas with optional category specified with :cat suffix
    warg = request.args.get("words")
    if not warg:
        return better_jsonify(**payload)

    def cat4token(t: Tok) -> Tuple[str, str]:
        """ Create a word/category pair from a token """
        assert t.kind in (TOK.WORD, TOK.PERSON, TOK.ENTITY)
        if t.kind == TOK.PERSON:
            # Person category carries the gender, neuter ("hk") if unknown
            return (t.txt, "person_" + (t.person_names[0].gender or "hk"))
        if t.kind == TOK.ENTITY:
            return (t.txt, "entity")
        if t.kind != TOK.WORD:
            # Only reachable if assertions are disabled
            return (t.txt, "")
        # TODO: Use GreynirPackage lemma lookup function for this
        # Prefer meanings where the word form equals the lemma
        candidates = [m for m in t.meanings if m.stofn == m.ordmynd] or t.meanings
        if not len(candidates):
            return (t.txt, CAT_UNKNOWN)
        cat = candidates[0].ordfl
        lemma = candidates[0].stofn
        # Hack to fix combined word, remove hyphens added by combinator
        if lemma.count("-") > t.txt.count("-"):
            txtlen = len(t.txt)
            lemma = "".join(
                ch
                for i, ch in enumerate(lemma)
                if not (ch == "-" and i < txtlen and t.txt[i] != "-")
            )
        return (lemma, cat)

    # Parse arg string into word/cat tuples, tokenizing
    # each item that doesn't have a category
    pairs = []
    for word, wcat in _str2words(warg):
        if wcat is None or wcat == CAT_UNKNOWN:
            valid = [tok for tok in tokenize(word) if tok.kind in _VALID_TOKENS]
            pairs.extend(cat4token(tok) for tok in valid)
        else:
            pairs.append((word, wcat))

    # Drop all words not in an allowed category and restrict no. words
    words = [pair for pair in pairs if pair[1] in _VALID_WCATS][:_MAX_NUM_WORDS]

    # Generate date labels
    now = datetime.utcnow()
    delta = date_to - date_from
    with changedlocale(category="LC_TIME"):
        label_date_strings: List[Union[str, Tuple[str, str]]] = []
        if delta.days < _SHOW_WEEKS_CUTOFF:
            # Group by day
            timeunit = "day"
            label_days = [date_from + timedelta(days=i) for i in range(delta.days)]
            labels = []
            for day in label_days:
                # Only show the year if it isn't the current one
                day_fmt = "%-d. %b" if day.year == now.year else "%-d. %b %Y"
                labels.append(day.strftime(day_fmt))
            label_date_strings = [day.strftime("%Y-%m-%d") for day in label_days]
        else:
            # Group by week if the period is long enough
            timeunit = "week"
            label_dates = []
            for week in range(int((delta.days + 1) / 7)):
                week_start = date_from + timedelta(days=week * 7)
                label_dates.append((week_start, week_start + timedelta(days=6)))
            # Construct elegant week date labels w. no superfluous information
            labels = []
            for week_start, week_end in label_dates:
                if week_start.month == week_end.month:
                    start_fmt, end_fmt = "%-d.", "%-d. %b"
                else:
                    start_fmt = end_fmt = "%-d. %b"
                if week_start.year != now.year and week_start.year != week_end.year:
                    start_fmt += " %Y"
                if week_end.year != now.year:
                    end_fmt += " %Y"
                labels.append(
                    "{0}-{1}".format(
                        week_start.strftime(start_fmt), week_end.strftime(end_fmt)
                    )
                )
            # Convert dates to strings for client-side
            label_date_strings = [
                (week_start.strftime("%Y-%m-%d"), week_end.strftime("%Y-%m-%d"))
                for week_start, week_end in label_dates
            ]

    # Create datasets for front-end chart
    colors = list(_LINE_COLORS)
    data: Dict[str, Any] = dict(
        labels=labels, labelDates=label_date_strings, datasets=[]
    )
    with SessionContext(commit=False) as session:
        for wd, cat in words:
            # Look up frequency of word for the given period
            res = WordFrequencyQuery.frequency(
                wd,
                cat,
                date_from,
                date_to,
                timeunit=timeunit,
                enclosing_session=session,
            )
            # Generate data and config for chart; each
            # dataset takes the next unused line color
            label = "{0} ({1})".format(wd, CAT_DESC.get(cat))
            color = colors.pop(0)
            dataset: Dict[str, Any] = dict(
                label=label,
                fill=False,
                lineTension=0,
                borderColor=color,
                backgroundColor=color,
                data=[r[1] for r in res],
                word="{0}:{1}".format(wd, cat),
            )
            data["datasets"].append(dataset)

    # Create response
    payload["err"] = False
    payload["data"] = data
    payload["words"] = _words2str(words)
    return better_jsonify(**payload)
def read(self, fname, verbose = False, write_binary = True):
    """ Read a grammar from the text file fname into this object.

        Set verbose = True to get diagnostic messages about unused
        nonterminals and nonterminals that are unreachable from the root.
        Set write_binary = False to avoid writing a fresh binary file if the
        grammar text file is newer than the existing binary file.

        Raises GrammarError if the file cannot be read, if it contains
        a syntax error, or if the resulting grammar is inconsistent
        (undefined nonterminals, nonterminals without productions,
        nonterminals that derive themselves or never derive terminals). """

    # Clear previous file info, if any
    self._file_time = self._file_name = None
    # Shortcuts
    terminals = self._terminals
    nonterminals = self._nonterminals
    grammar = self._nt_dict
    # The number of the most recently read line in the grammar file.
    # Note: an item is parsed only once the first line of the *next*
    # item has been read, so line numbers in error messages refer
    # to the last line read, not necessarily the start of the item.
    line = 0
    # Reset the sequence of production indices
    Production.reset()

    # Dictionary of variants, keyed by variant name
    # where the values are lists of variant options (strings)
    variants = OrderedDict()
    current_line = ""

    def parse_line(s):
        """ Parse a complete grammar item (a variant definition,
            a pragma or a nonterminal rule) given as a single string """

        s = s.strip()
        if not s:
            # Blank line: ignore
            return

        def _parse_rhs(nt_id, vts, s, priority):
            """ Parse a right-hand side sequence, eventually with relative
                priority within the nonterminal. nt_id is the base name of
                the nonterminal, vts the list of its variant names and
                s the text of a single production. """

            def _add_rhs(nt_id, rhs, priority = 0):
                """ Add a fully expanded right-hand-side production
                    to a nonterminal rule """
                nt = nonterminals[nt_id]
                if nt not in grammar:
                    # First production of this nonterminal
                    grammar[nt] = [ ] if rhs is None else [ (priority, rhs) ]
                    return
                if rhs is None:
                    return
                if rhs.is_empty:
                    # Adding epsilon production: avoid multiple ones
                    if any(p.is_empty for _, p in grammar[nt]):
                        return
                # Append to the list of productions of this nonterminal
                grammar[nt].append((priority, rhs))

            s = s.strip()
            if not s:
                raise GrammarError("Invalid syntax for production", fname, line)

            tokens = s.split()

            # rhs is a list of tuples, one for each token, as follows:
            # (id, repeat, variants)
            rhs = []

            # vfree is a set of 'free variants', i.e. variants that
            # occur in the right hand side of the production but not in
            # the nonterminal (those are in vts)
            vfree = set()

            for r in tokens:

                if r == "0":
                    # Empty (epsilon) production
                    if len(tokens) != 1:
                        raise GrammarError("Empty (epsilon) rule must be of the form NT -> 0", fname, line)
                    rhs.append((None, None, None))
                    break

                # Check for repeat/conditionality
                repeat = None
                if r[-1] in '*+?':
                    # Optional repeat/conditionality specifier
                    # Asterisk: Can be repeated 0 or more times
                    # Plus: Can be repeated 1 or more times
                    # Question mark: optionally present once
                    repeat = r[-1]
                    r = r[0:-1]

                # Check for variant specs
                v = r.split('/')
                r = v[0]
                v = v[1:]
                if not v:
                    v = None
                else:
                    for vspec in v:
                        if vspec not in variants:
                            raise GrammarError("Unknown variant '{0}'".format(vspec), fname, line)
                        if vspec not in vts:
                            # Free variant: add to set
                            vfree.add(vspec)

                # The symbol name may be empty here, for instance if the
                # token consisted only of a repeat specifier or started
                # with a slash. Guard the r[0] access so that such a token
                # falls through to the identifier check and is reported
                # as a GrammarError instead of raising an IndexError.
                if r and r[0] in "\"'":
                    # Literal terminal symbol
                    if len(r) < 3 or r[0] not in r[2:]:
                        raise GrammarError("Invalid literal terminal {0}".format(r), fname, line)
                else:
                    # Identifier of nonterminal or terminal
                    if not r.isidentifier():
                        raise GrammarError("Invalid identifier '{0}'".format(r), fname, line)
                rhs.append((r, repeat, v))

            assert len(rhs) == len(tokens)

            # Generate productions for all variants

            def variant_values(vlist):
                """ Generate every combination of options for the variants
                    in vlist, each as a list with one option per variant.
                    An empty vlist yields a single [ "" ] combination. """
                if not vlist:
                    yield [ "" ]
                    return
                if len(vlist) == 1:
                    for vopt in variants[vlist[0]]:
                        yield [ vopt ]
                    return
                for v in variant_values(vlist[1:]):
                    for vopt in variants[vlist[0]]:
                        yield [ vopt ] + v

            # Make a list of all variants that occur in the
            # nonterminal or on the right hand side
            vall = vts + list(vfree)

            for vval in variant_values(vall):
                # Generate a production for every variant combination

                # Calculate the nonterminal suffix for this variant
                # combination
                nt_suffix = "_".join(vval[vall.index(vx)] for vx in vts) if vts else ""
                if nt_suffix:
                    nt_suffix = "_" + nt_suffix

                result = Production(fname, line, priority = priority)
                for r, repeat, v in rhs:
                    # Calculate the token suffix, if any
                    # This may be different from the nonterminal suffix as
                    # the token may have fewer variants than the nonterminal,
                    # and/or free ones that don't appear in the nonterminal.
                    if r is None:
                        # Epsilon
                        n = None
                    else:
                        suffix = "_".join(vval[vall.index(vx)] for vx in v) if v else ""
                        if suffix:
                            suffix = "_" + suffix
                        sym = r + suffix
                        if r[0] in "'\"":
                            # Literal terminal
                            if sym not in terminals:
                                terminals[sym] = self._make_literal_terminal(sym)
                            n = terminals[sym]
                        elif r[0].isupper():
                            # Identifier of nonterminal
                            if sym not in nonterminals:
                                nonterminals[sym] = self._make_nonterminal(sym, fname, line)
                            n = nonterminals[sym]
                            n.add_ref() # Note that the nonterminal has been referenced
                        else:
                            # Identifier of terminal
                            if sym not in terminals:
                                terminals[sym] = self._make_terminal(sym)
                            n = terminals[sym]

                    # If the production item can be repeated,
                    # create a new production and substitute.
                    # A -> B C* D becomes:
                    # A -> B C_new_* D
                    # C_new_* -> C_new_* C | 0
                    # A -> B C+ D becomes:
                    # A -> B C_new_+ D
                    # C_new_+ -> C_new_+ C | C
                    # A -> B C? D becomes:
                    # A -> B C_new_? D
                    # C_new_? -> C | 0
                    if repeat is not None:
                        if n is None:
                            raise GrammarError("Epsilon (0) cannot be repeated with * or +", fname, line)
                        new_nt_id = sym + repeat
                        # Make the new nonterminal and production if not already there
                        if new_nt_id not in nonterminals:
                            new_nt = nonterminals[new_nt_id] = self._make_nonterminal(new_nt_id, fname, line)
                            new_nt.add_ref()
                            # Note that the Earley algorithm is more efficient on left recursion
                            # than middle or right recursion. Therefore it is better to generate
                            # Cx -> Cx C than Cx -> C Cx.
                            # First production: Cx C
                            new_p = Production(fname, line)
                            if repeat != '?':
                                new_p.append(new_nt) # C* / C+
                            new_p.append(n) # C
                            _add_rhs(new_nt_id, new_p) # Default priority 0
                            # Second production: epsilon(*, ?) or C(+)
                            new_p = Production(fname, line)
                            if repeat == '+':
                                new_p.append(n)
                            _add_rhs(new_nt_id, new_p) # Default priority 0
                        # Substitute the Cx in the original production
                        n = nonterminals[new_nt_id]

                    if n is not None:
                        result.append(n)

                assert len(result) == len(rhs) or (len(rhs) == 1 and rhs[0] == (None, None, None))

                nt_id_full = nt_id + nt_suffix

                if len(result) == 1 and result[0] == nonterminals[nt_id_full]:
                    # Nonterminal derives itself
                    raise GrammarError("Nonterminal {0} deriving itself".format(nt_id_full), fname, line)
                _add_rhs(nt_id_full, result, priority)

        def variant_names(nt, vts):
            """ Returns a list of names with all applicable variant options appended """
            result = [ nt ]
            for v in vts:
                newresult = []
                for vopt in variants[v]:
                    for r in result:
                        newresult.append(r + "_" + vopt)
                result = newresult
            return result

        def apply_to_nonterminals(s, func):
            """ Parse a nonterminal/var list from string s, then apply func(nt, p) to all
                nonterminals, where p is the parameter of the pragma """
            ix = s.find(')')
            if ix < 0:
                raise GrammarError("Expected right parenthesis in pragma", fname, line)
            param = s[0 : ix].strip()
            s = s[ix + 1:]
            nts = s.split()
            for nt_name in nts:
                ntv = nt_name.split('/')
                for vname in ntv[1:]:
                    if vname not in variants:
                        raise GrammarError("Unknown variant '{0}' for nonterminal '{1}'".format(vname, ntv[0]), fname, line)
                var_names = variant_names(ntv[0], ntv[1:])
                for vname in var_names:
                    if vname not in nonterminals:
                        raise GrammarError("Unknown nonterminal '{0}'".format(vname), fname, line)
                    try:
                        func(nonterminals[vname], param)
                    except Exception as e:
                        # Report a failing handler (e.g. a non-integer score)
                        # as a grammar error, keeping the original cause.
                        # KeyboardInterrupt and SystemExit are not intercepted.
                        raise GrammarError("Invalid pragma argument '{0}'".format(param), fname, line) from e

        if s.startswith('/'):
            # Definition of variant
            # A variant is specified as /varname = opt1 opt2 opt3...
            v = s.split('=', maxsplit = 1)
            if len(v) != 2:
                raise GrammarError("Invalid variant syntax", fname, line)
            vname = v[0].strip()[1:]
            if "_" in vname or not vname.isidentifier():
                # Variant names must be valid identifiers without underscores
                raise GrammarError("Invalid variant name '{0}'".format(vname), fname, line)
            v = v[1].split()
            for vopt in v:
                if "_" in vopt or not vopt.isidentifier():
                    # Variant options must be valid identifiers without underscores
                    raise GrammarError("Invalid option '{0}' in variant '{1}'".format(vopt, vname), fname, line)
            variants[vname] = v
        elif s.startswith('$'):
            # Pragma
            s = s.strip()
            PRAGMA_SCORE = "$score("
            PRAGMA_ROOT = "$root("
            PRAGMA_TAG = "$tag("
            if s.startswith(PRAGMA_SCORE):
                # Pragma $score(int) Nonterminal/var1/var2 ...
                s = s[len(PRAGMA_SCORE):]

                def set_score(nt, score):
                    self._nt_scores[nt] = int(score)

                apply_to_nonterminals(s, set_score)
            elif s.startswith(PRAGMA_TAG):
                # Pragma $tag(tagstring) Nonterminal/var1/var2 ...
                s = s[len(PRAGMA_TAG):]
                apply_to_nonterminals(s, lambda nt, tag : nt.add_tag(tag))
            elif s.startswith(PRAGMA_ROOT):
                # Pragma $root(Nonterminal)
                # Identify a nonterminal as a secondary parse root
                if s[-1] != ')':
                    raise GrammarError("Expected right parenthesis in $root() pragma", fname, line)
                root_nt = s[len(PRAGMA_ROOT):-1].strip()
                if not root_nt.isidentifier():
                    raise GrammarError("Invalid nonterminal name '{0}'".format(root_nt), fname, line)
                if root_nt not in nonterminals:
                    # Include the file position, like all other errors raised here
                    raise GrammarError("Unknown nonterminal '{0}'".format(root_nt), fname, line)
                # Add an implicit reference to the root
                nonterminals[root_nt].add_ref()
                self._secondary_roots.append(nonterminals[root_nt])
            else:
                raise GrammarError("Unknown pragma '{0}'".format(s), fname, line)
        else:
            # New nonterminal
            if "→" in s:
                # Fancy schmancy arrow sign: use it
                rule = s.split("→", maxsplit=1)
            else:
                rule = s.split("->", maxsplit=1)
            if len(rule) != 2:
                raise GrammarError("Invalid syntax", fname, line)

            # Split nonterminal spec into name and variant(s),
            # i.e. NtName/var1/var2...
            ntv = rule[0].strip().split('/')
            current_NT = nt = ntv[0]
            current_variants = ntv[1:]
            if not nt.isidentifier():
                raise GrammarError("Invalid nonterminal name '{0}'".format(nt), fname, line)
            for vname in current_variants:
                if vname not in variants:
                    raise GrammarError("Unknown variant '{0}' for nonterminal '{1}'".format(vname, nt), fname, line)
            var_names = variant_names(nt, current_variants)

            # Add all previously unknown nonterminal variants
            for nt_var in var_names:
                if nt_var in nonterminals:
                    cnt = nonterminals[nt_var]
                else:
                    cnt = self._make_nonterminal(nt_var, fname, line)
                    nonterminals[nt_var] = cnt
                    if self._root is None:
                        # Remember first nonterminal as the root
                        self._root = cnt
                        self._root.add_ref() # Implicitly referenced
                if cnt not in grammar:
                    grammar[cnt] = [ ]

            sep = '|' # Default production separator
            if '>' in rule[1]:
                # Looks like a priority specification between productions
                if '|' in rule[1]:
                    raise GrammarError("Cannot mix '|' and '>' between productions", fname, line)
                sep = '>'
            for priority, prod in enumerate(rule[1].split(sep)):
                # Add the productions on the right hand side, delimited by '|' or '>'
                _parse_rhs(current_NT, current_variants, prod, priority if sep == '>' else 0)

    # Main parse loop
    try:
        with open(fname, "r", encoding="utf-8") as inp:
            # Read grammar file line-by-line
            for s in inp:
                line += 1
                # Ignore comments
                ix = s.find('#')
                if ix >= 0:
                    s = s[0:ix]
                s = s.rstrip()
                if not s:
                    continue
                # If line starts with a blank, assume it's a continuation
                if s[0].isspace():
                    current_line += s
                    continue
                # New item starting: parse the previous one and start a new
                parse_line(current_line)
                current_line = s
            # Parse the final chunk
            parse_line(current_line)
    except (IOError, OSError):
        raise GrammarError("Unable to open or read grammar file", fname, 0)

    # Check all nonterminals to verify that they have productions and are referenced
    for nt in nonterminals.values():
        if verbose and not nt.has_ref:
            # Emit a warning message if verbose=True
            print ("Nonterminal {0} is never referenced in a production".format(nt))
        if nt not in grammar:
            raise GrammarError("Nonterminal {0} is referenced but not defined".format(nt), nt.fname, nt.line)
    for nt, plist in grammar.items():
        if len(plist) == 0:
            raise GrammarError("Nonterminal {0} has no productions".format(nt), nt.fname, nt.line)
        else:
            for _, p in plist:
                if len(p) == 1 and p[0] == nt:
                    raise GrammarError("Nonterminal {0} produces itself".format(nt), p.fname, p.line)

    # Check that all nonterminals derive terminal strings.
    # Repeatedly sweep the remaining nonterminals, marking those that
    # have a production consisting only of terminals and already-marked
    # nonterminals, until a sweep marks nothing new.
    agenda = list(nonterminals.values())
    der_t = set()
    while agenda:
        reduced = False
        for nt in agenda:
            for _, p in grammar[nt]:
                if all(True if isinstance(s, Terminal) else s in der_t for s in p):
                    der_t.add(nt)
                    break
            if nt in der_t:
                reduced = True
        if not reduced:
            break
        agenda = [ nt for nt in nonterminals.values() if nt not in der_t ]
    if agenda:
        raise GrammarError("Nonterminals {0} do not derive terminal strings"
            .format(", ".join([str(nt) for nt in agenda])), fname, 0)

    # Short-circuit nonterminals that point directly and uniquely to other nonterminals.
    # Because this creates a gap between the original grammar
    # and the resulting trees, we only do this for nonterminals with variants
    # that do not have a $score pragma
    shortcuts = { } # Dictionary of shortcuts
    for nt, plist in grammar.items():
        if not "_" in nt.name:
            # 'Pure' nonterminal with no variants: don't shortcut
            continue
        if self.nt_score(nt) != 0 or nt.has_tags:
            # Nonterminal has a score adjustment or a tag: don't shortcut
            continue
        if len(plist) == 1 and len(plist[0][1]) == 1 and isinstance(plist[0][1][0], Nonterminal):
            # This nonterminal has only one production, with only one nonterminal item
            target = plist[0][1][0]
            assert target != nt
            while target in shortcuts:
                # Find ultimate destination of shortcut
                assert target != shortcuts[target]
                target = shortcuts[target]
            shortcuts[nt] = target

    # Go through all productions and replace the shortcuts with their targets
    for plist in grammar.values():
        for _, p in plist:
            for ix, s in enumerate(p):
                if isinstance(s, Nonterminal) and s in shortcuts:
                    # Replace the nonterminal in the production
                    p[ix] = shortcuts[s]

    # Now, after applying shortcuts, check that all nonterminals are reachable from the root
    if self._root is None:
        # No nonterminal was defined (e.g. an empty grammar file):
        # report this as a grammar error instead of failing below
        # with a KeyError when looking up the nonexistent root
        raise GrammarError("Grammar file contains no nonterminals", fname, 0)

    unreachable = { nt for nt in nonterminals.values() }

    def _remove(nt):
        """ Recursively remove all nonterminals that are reachable from nt """
        unreachable.remove(nt)
        for _, p in grammar[nt]:
            for s in p:
                if isinstance(s, Nonterminal) and s in unreachable:
                    _remove(s)

    # Remove the main root and any secondary roots
    _remove(self._root)
    for r in self._secondary_roots:
        # A secondary root may already have been reached from the main
        # root or from a previous secondary root, or may have been
        # declared more than once; in those cases it has already been
        # removed from the set and must not be removed again
        if r in unreachable:
            _remove(r)

    if unreachable:
        if verbose:
            # Emit a warning message if verbose=True
            print("The following nonterminals are unreachable from the root\nand will be removed from the grammar:")
            with changedlocale() as strxfrm:
                for nt in sorted([ str(nt) for nt in unreachable ], key = strxfrm):
                    print("* {0}".format(str(nt)))
        # Simplify the grammar dictionary by removing unreachable nonterminals
        for nt in unreachable:
            del grammar[nt]
            del nonterminals[nt.name]

    # Reassign indices for nonterminals to avoid gaps in the number sequence
    # Nonterminals are indexed downwards from -1
    self._nonterminals_by_ix = { -1 - ix : nonterminals[key] for ix, key in enumerate(nonterminals.keys()) }
    for key, nt in self._nonterminals_by_ix.items():
        nt.set_index(key)

    # Reassign indices for terminals
    # Terminals are indexed upwards from 1
    self._terminals_by_ix = { ix + 1 : terminals[key] for ix, key in enumerate(terminals.keys()) }
    for key, t in self._terminals_by_ix.items():
        t.set_index(key)

    # Make a dictionary of productions by integer index >= 0
    for plist in grammar.values():
        for _, p in plist:
            self._productions_by_ix[p.index] = p

    # Grammar successfully read: note the file name and timestamp
    self._file_name = fname
    self._file_time = datetime.fromtimestamp(os.path.getmtime(fname))

    if write_binary:
        # Check whether to write a fresh binary file
        fname += ".bin" # By default Reynir.grammar.bin
        try:
            binary_file_time = datetime.fromtimestamp(os.path.getmtime(fname))
        except os.error:
            binary_file_time = None
        if binary_file_time is None or binary_file_time < self._file_time:
            # No binary file or older than text file: write a fresh one
            self._write_binary(fname)
def fetch_articles(
    topic=None,
    offset=0,
    limit=_DEFAULT_NUM_ARTICLES,
    start=None,
    location=None,
    country=None,
    root=None,
    author=None,
    enclosing_session=None,
):
    """ Fetch articles, newest first, as a list of display objects.
        The result can be narrowed by topic identifier, start timestamp,
        location name, country code, source domain (root) and author,
        and paged using offset and limit. """

    class ArticleDisplay:
        """ Carrier for the article attributes needed by the web template """

        def __init__(
            self,
            heading,
            timestamp,
            url,
            uuid,
            num_sentences,
            num_parsed,
            icon,
            localized_date,
            source,
        ):
            self.heading, self.timestamp = heading, timestamp
            self.url, self.uuid = url, uuid
            self.num_sentences, self.num_parsed = num_sentences, num_parsed
            self.icon = icon
            self.localized_date = localized_date
            self.source = source

        @property
        def width(self):
            """ Percentage string giving the share of the article's
                sentences that were parsed """
            if self.num_sentences == 0:
                return "0%"
            percentage = (100 * self.num_parsed) // self.num_sentences
            return "{0}%".format(percentage)

        @property
        def time(self):
            # Characters 11..15 of the ISO timestamp, i.e. HH:MM
            return self.timestamp.isoformat()[11:16]

        @property
        def date(self):
            # The year is only shown for articles from other years
            same_year = datetime.today().year == self.timestamp.year
            return self.localized_date if same_year else self.fulldate

        @property
        def fulldate(self):
            return self.localized_date + self.timestamp.strftime(" %Y")

    def make_display(art):
        """ Create a display object from an article query result """
        domain = art.root.domain
        return ArticleDisplay(
            heading=art.heading,
            timestamp=art.timestamp,
            url=art.url,
            uuid=art.id,
            num_sentences=art.num_sentences,
            num_parsed=art.num_parsed,
            icon=domain + ".png",
            localized_date=art.timestamp.strftime("%-d. %b"),
            source=domain,
        )

    with SessionContext(read_only=True, session=enclosing_session) as session:
        # Base query: parsed, timestamped, non-future articles with
        # a heading and sentences, from visible roots only
        query = (
            session.query(Article)
            .filter(
                Article.tree != None,
                Article.timestamp != None,
                Article.timestamp <= datetime.utcnow(),
                Article.heading > "",
                Article.num_sentences > 0,
            )
            .join(Root)
            .filter(Root.visible == True)
        )

        if start is not None:
            # Only articles newer than the start timestamp
            query = query.filter(Article.timestamp > start)

        if location or country:
            # Both location filters need the Location table
            query = query.join(Location)
        if location:
            query = query.filter(Location.name == location)
        if country:
            # Country is given as a country code
            query = query.filter(Location.country == country)

        if root:
            # Source is given as a domain, e.g. "kjarninn.is"
            query = query.filter(Root.domain == root)

        if author:
            query = query.filter(Article.author == author)

        if topic:
            # Topic is given as a topic identifier
            query = (
                query.join(ArticleTopic)
                .join(Topic)
                .filter(Topic.identifier == topic)
            )

        query = query.order_by(desc(Article.timestamp))
        query = query.offset(offset).limit(limit)

        # The date strings are formatted under the changed LC_TIME locale
        with changedlocale(category="LC_TIME"):
            articles = [make_display(art) for art in query]

    return articles
def wordfreq():
    """ Build the JSON response holding word frequency chart data for
        the period and the words given in the request query string. """
    resp = {"err": True}

    # The period is given as date_from and date_to in YYYY-MM-DD format
    args = request.args
    try:
        date_fmt = "%Y-%m-%d"
        date_from = datetime.strptime(args.get("date_from"), date_fmt)
        date_to = datetime.strptime(args.get("date_to"), date_fmt)
    except Exception as e:
        logging.warning("Failed to parse date arg: {0}".format(e))
        return better_jsonify(**resp)

    # One or more word lemmas are required, each with an optional category
    warg = request.args.get("words")
    if not warg:
        return better_jsonify(**resp)

    # Words are separated by commas and/or whitespace; at most 6 are used
    warg = warg.strip().replace(" ", " ").replace(",", " ")
    tokens = [tok.strip() for tok in warg.split()][:6]
    # A category can be attached to a word with a colon, e.g. "maður:kk"
    words = [tuple(tok.split(":")) for tok in tokens]

    valid_cats = ["kk", "kvk", "hk", "lo", "so"]

    with BIN_Db.get_db() as db:

        def cat4word(w):
            """ Look up the word category (ordfl) of a word, "hk" if unknown """
            _, meanings = db.lookup_word(w, auto_uppercase=True)
            if not meanings:
                return "hk"
            # Prefer a meaning whose lemma is the word itself, e.g. read
            # "reima" as a verb rather than gen. pl. of fem. noun "reim"
            best = next((m for m in meanings if m.stofn == w), meanings[0])
            return best.ordfl

        # Fill in the category of words that lack a valid one
        for ix, entry in enumerate(words):
            has_valid_cat = len(entry) >= 2 and entry[1] in valid_cats
            if not has_valid_cat:
                words[ix] = (entry[0], cat4word(entry[0]))

    colors = list(_LINE_COLORS)

    # One label per day in the period; the year is only
    # shown for days outside of the current year
    now = datetime.utcnow()
    num_days = (date_to - date_from).days + 1
    with changedlocale(category="LC_TIME"):
        labels = []
        for offset in range(num_days):
            day = date_from + timedelta(days=offset)
            fmt = "%-d. %b" if day.year == now.year else "%-d. %b %Y"
            labels.append(day.strftime(fmt))

    # Human-readable descriptions of the word categories
    CAT_DESC = {
        "kk": "kk. no.",
        "kvk": "kvk. no.",
        "hk": "hk. no.",
        "lo": "lo.",
        "so": "so.",
    }

    # Assemble one chart dataset per word
    with SessionContext(commit=False) as session:
        data = {"labels": labels, "datasets": []}
        for entry in words:
            wd, cat = entry[0], entry[1]
            # Frequency of the word over the period
            res = WordFrequencyQuery.fetch(
                wd, cat, date_from, date_to, enclosing_session=session
            )
            label = "{0} ({1})".format(wd, CAT_DESC.get(cat))
            # Each dataset takes the next unused line color
            color = colors.pop(0)
            data["datasets"].append(
                {
                    "label": label,
                    "fill": False,
                    "lineTension": 0,
                    "borderColor": color,
                    "backgroundColor": color,
                    "data": [row[1] for row in res],
                }
            )

    # Successful response; the word list is returned as well
    # so that it can be updated client-side
    resp["err"] = False
    resp["data"] = data
    resp["words"] = ", ".join(":".join(entry) for entry in words)
    return better_jsonify(**resp)