def query_title(query: Query, session: Session, title: str) -> AnswerTuple:
    """ A query for a person by title """
    # !!! Consider doing a LIKE '%title%', not just LIKE 'title%'
    # We impose a LIMIT of 1024 on each query result,
    # since the query may return many names (for instance 'Hver er formaður?'),
    # and getting more name mentions than this is not likely to significantly
    # affect the outcome.
    QUERY_LIMIT = 1024
    rd: RegisterType = defaultdict(dict)

    title_lc = title.lower()  # Query by lowercase title
    q = (
        session.query(
            Person.name,
            Article.id,
            Article.timestamp,
            Article.heading,
            Root.domain,
            Article.url,
        )
        .filter(
            Person.title_lc.like(title_lc + " %") | (Person.title_lc == title_lc)
        )
        .filter(Root.visible == True)
        .join(Article, Article.url == Person.article_url)
        .join(Root)
        .order_by(desc(cast(Column, Article.timestamp)))
        .limit(QUERY_LIMIT)
        .all()
    )

    # Append names from the persons table
    append_names(rd, q, prop_func=lambda x: x.name)

    # Also append definitions from the entities table, if any
    q = (
        session.query(
            Entity.name,
            Article.id,
            Article.timestamp,
            Article.heading,
            Root.domain,
            Article.url,
        )
        .filter(Entity.definition == title)
        .filter(Root.visible == True)
        .join(Article, Article.url == Entity.article_url)
        .join(Root)
        .order_by(desc(cast(Column, Article.timestamp)))
        .limit(QUERY_LIMIT)
        .all()
    )
    append_names(rd, q, prop_func=lambda x: x.name)

    response = make_response_list(rd)
    answer: str
    voice_answer: str

    if response and title and "answer" in response[0]:
        first_response = response[0]
        # Return 'Seðlabankastjóri er Már Guðmundsson.'
        upper_title = cap_first(title)
        answer = first_response["answer"]
        voice_answer = upper_title + " er " + answer + "."
        # Store the person name in the query context
        # so it can be referred to in subsequent queries
        query.set_context({"person_name": answer})
        if first_response.get("sources"):
            first_source = first_response["sources"][0]["domain"]
            query.set_source(first_source)
    else:
        answer = "Ekkert nafn finnst með titilinn '" + title + "'."
        voice_answer = "Ég veit ekki hver er " + title + "."

    return response, answer, voice_answer
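# Hypothetical usage sketch for query_title(), assuming an open SQLAlchemy
# session and a Query instance are already available (both are constructed
# elsewhere in the codebase); shown only to illustrate how the
# (response, answer, voice_answer) return tuple might be consumed.
def answer_for_title(query, session, title="seðlabankastjóri"):
    response, answer, voice_answer = query_title(query, session, title)
    if response:
        # A person carrying the title was found in the scraped articles
        return answer
    # Fall back to the voice string, which contains the "not found" message
    return voice_answer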
def sentence_stream(limit=None, skip=None, skip_errors=True):
    """ Generator of a sentence stream consisting of `limit` sentences
        (or fewer) from the most recently parsed articles.
        Each sentence is a list of token dicts. """
    with SessionContext(commit=True, read_only=True) as session:
        q = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(ArticleRow.parsed))
            .yield_per(200)
        )

        count = 0
        skipped = 0
        for a in q:
            doc = json.loads(a.tokens)
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    if skip is not None and skipped < skip:
                        # If requested, skip sentences from the front
                        # (useful for test set)
                        skipped += 1
                        continue
                    # Yield the sentence as a fresh token list
                    yield [t for t in sent]
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return
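# A minimal usage sketch for sentence_stream(), assuming the database that
# SessionContext connects to is available; the counting logic below is
# illustrative only and not part of the original module.
def count_recent_tokens(num_sentences=100):
    """ Count the tokens in the most recently parsed `num_sentences` sentences """
    total = 0
    for sent in sentence_stream(limit=num_sentences, skip_errors=True):
        total += len(sent)
    return total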
def token_stream(
    limit: Optional[int] = None, skip_errors: bool = True
) -> Iterator[Optional[TokenDict]]:
    """ Generator of a token stream consisting of `limit` sentences
        (or fewer) from the most recently parsed articles.
        After each sentence, None is yielded. """
    with SessionContext(commit=True, read_only=True) as session:
        q: SqlQuery[ArticleRow] = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(cast(Column, ArticleRow.parsed)))
            .yield_per(200)
        )

        count = 0
        for a in q:
            assert a is not None
            if not a.tokens:
                continue
            doc = cast(PgsList, json.loads(a.tokens))
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    for t in sent:
                        # Yield the tokens
                        yield t
                    yield None  # End-of-sentence marker
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return
def token_stream(limit=None, skip_errors=True):
    """ Generator of a token stream consisting of `limit` sentences
        (or fewer) from the most recently parsed articles.
        After each sentence, None is yielded. """
    with SessionContext(commit=True, read_only=True) as session:
        q = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(ArticleRow.parsed))
            .yield_per(200)
        )

        count = 0
        for a in q:
            doc = json.loads(a.tokens)
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    for t in sent:
                        # Yield the tokens
                        yield t
                    yield None  # End-of-sentence marker
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return
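# Illustrative sketch of consuming token_stream(); it regroups the flat token
# stream back into sentences at each None end-of-sentence marker. This helper
# is not part of the original module.
def sentences_from_tokens(limit=10):
    """ Yield sentences (lists of token dicts) reassembled from token_stream() """
    sent = []
    for t in token_stream(limit=limit):
        if t is None:
            # End-of-sentence marker: emit the accumulated sentence
            if sent:
                yield sent
            sent = []
        else:
            sent.append(t)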
def top_locations(limit=_TOP_LOC_LENGTH, kind=None, days=_TOP_LOC_PERIOD):
    """ Return a list of recent locations along with the list of
        articles in which they are mentioned """
    with SessionContext(read_only=True) as session:
        q = (
            session.query(
                Location.name,
                Location.kind,
                Location.country,
                Location.article_url,
                Location.latitude,
                Location.longitude,
                Article.id,
                Article.heading,
                Root.domain,
            )
            .join(Article, Article.url == Location.article_url)
            .filter(Article.timestamp > datetime.utcnow() - timedelta(days=days))
            .join(Root)
            .filter(Root.visible)
        )

        # Filter by kind
        if kind:
            q = q.filter(Location.kind == kind)

        q = q.order_by(desc(Article.timestamp))

        # Group articles by unique location
        locs = defaultdict(list)
        for r in q.all():
            article = {
                "url": r.article_url,
                "id": r.id,
                "heading": r.heading,
                "domain": r.domain,
            }
            k = (r.name, r.kind, r.country, r.latitude, r.longitude)
            locs[k].append(article)

        # Create top locations list sorted by article count
        loclist = []
        for k, v in locs.items():
            (name, kind, country, lat, lon) = k  # Unpack tuple key

            # Google map links currently use the placename instead of
            # coordinates. This works well for most Icelandic and
            # international placenames, but fails on some.
            map_url = GMAPS_PLACE_URL.format(name)
            # if lat and lon:
            #     map_url = GMAPS_COORD_URL.format(lat, lon, "7z")

            loclist.append(
                {
                    "name": name,
                    "kind": kind,
                    "country": country,
                    "map_url": map_url,
                    "articles": v,
                }
            )

        loclist.sort(key=lambda x: len(x["articles"]), reverse=True)

        return loclist[:limit]
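# Hedged usage sketch for top_locations(); the kind value "country" is an
# assumption about the Location.kind vocabulary and is used for illustration only.
def recent_location_summary(days=7):
    """ Return (name, country, article count) tuples for the most mentioned locations """
    locs = top_locations(limit=10, kind="country", days=days)
    return [(loc["name"], loc["country"], len(loc["articles"])) for loc in locs]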
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()

    with SessionContext(read_only=True) as session:
        q = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[
                0 : limit * 2
            ]  # Go through up to 2 * N records
        )

        def is_better_title(new_title, old_title):
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= _MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= _MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is longer than the previous one
                if p.name not in toplist or is_better_title(
                    p.title, toplist[p.name][0]
                ):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted:
                        # terminate the loop
                        break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
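# Minimal sketch of rendering the output of recent_persons(); the dict keys
# match those constructed in the function above, and the formatting is
# illustrative only.
def format_recent_persons(limit=10):
    return [
        "{0} - {1} ({2})".format(p["name"], p["title"], p["gender"])
        for p in recent_persons(limit=limit)
    ]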
def wordfreq_details():
    """ Return list of articles containing certain words over a given period. """
    resp: Dict[str, Any] = dict(err=True)

    words = _str2words(request.args.get("words"))
    if not words:
        return better_jsonify(**resp)

    # Parse date args
    try:
        date_fmt = "%Y-%m-%d"
        date_from = datetime.strptime(request.args.get("date_from", ""), date_fmt)
        dto = request.args.get("date_to")
        if dto:
            date_to = datetime.strptime(dto, date_fmt)
        else:
            # If only one date provided, assume it's a period spanning a single day
            date_to = date_from + timedelta(days=1)
    except Exception as e:
        logging.warning("Failed to parse date arg: {0}".format(e))
        return better_jsonify(**resp)

    # Fetch list of articles for each word for the given period
    wlist = list()
    colors = list(_LINE_COLORS)
    with SessionContext(read_only=True) as session:
        for wd, cat in words:
            q = (
                session.query(
                    Article.id, Article.heading, Root.domain, Word.cnt, Word.stem
                )
                .join(Article, Article.id == Word.article_id)
                .filter(Article.timestamp >= date_from)
                .filter(Article.timestamp < date_to)
                .filter(Word.stem == wd)
                .filter(Word.cat == cat)
                .join(Root)
                .order_by(desc(cast(Column, Article.timestamp)))
            )
            articles = [
                {"id": a[0], "heading": a[1], "domain": a[2], "cnt": a[3]}
                for a in q.all()
            ]
            wlist.append(
                {
                    "word": wd,
                    "cat": cat,
                    "cnt": sum(a["cnt"] for a in articles),
                    "articles": articles,
                    "color": colors.pop(0),
                    "desc": _desc4word((wd, cat)),
                }
            )

    resp["err"] = False
    resp["payload"] = render_template("words/details.html", words=wlist)
    return better_jsonify(**resp)
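# Hedged sketch of exercising the wordfreq_details() view in a Flask test
# request context. The Flask app object name ("app") and the format of the
# "words" parameter are assumptions made for illustration; _str2words()
# defines the format that is actually accepted.
def try_wordfreq_details():
    with app.test_request_context(
        "/", query_string={"words": "hestur", "date_from": "2021-01-01"}
    ):
        return wordfreq_details()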
def articles(
    cls, criteria: Mapping[str, Any], enclosing_session: Optional[Session] = None
) -> Iterator["Article"]:
    """ Generator of Article objects from the database
        that meet the given criteria """
    # The criteria are currently "timestamp", "author" and "domain",
    # as well as "order_by_parse" which if True indicates that the result
    # should be ordered with the most recently parsed articles first.
    with SessionContext(
        commit=True, read_only=True, session=enclosing_session
    ) as session:
        # Only fetch articles that have a parse tree
        q: SqlQuery[ArticleRow] = session.query(ArticleRow).filter(
            ArticleRow.tree != None
        )
        # timestamp is assumed to contain a tuple: (from, to)
        if criteria and "timestamp" in criteria:
            ts = criteria["timestamp"]
            q = q.filter(ArticleRow.timestamp >= ts[0]).filter(
                ArticleRow.timestamp < ts[1]
            )
        if criteria and "author" in criteria:
            author = criteria["author"]
            q = q.filter(ArticleRow.author == author)
        if criteria and ("visible" in criteria or "domain" in criteria):
            # Need a join with Root for these criteria
            q = q.join(Root)
            if "visible" in criteria:
                # Return only articles from roots with the specified visibility
                visible = criteria["visible"]
                assert isinstance(visible, bool)
                q = q.filter(Root.visible == visible)
            if "domain" in criteria:
                # Return only articles from the specified domain
                domain = criteria["domain"]
                assert isinstance(domain, str)
                q = q.filter(Root.domain == domain)
        if criteria and criteria.get("order_by_parse"):
            # Order with newest parses first
            q = q.order_by(desc(cast(Column, ArticleRow.parsed)))
        parsed_after = criteria.get("parse_date_gt")
        if parsed_after is not None:
            q = q.filter(ArticleRow.parsed >= parsed_after)
        for arow in q.yield_per(500):
            yield cls._init_from_row(arow)
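# Sketch of calling the articles() generator with a criteria mapping, assuming
# the enclosing class is named Article as the return annotation suggests; the
# one-day window and the visible/order_by_parse flags are illustrative values.
from datetime import datetime, timedelta

def count_recently_parsed(days=1):
    ts = (datetime.utcnow() - timedelta(days=days), datetime.utcnow())
    criteria = {"timestamp": ts, "visible": True, "order_by_parse": True}
    return sum(1 for _ in Article.articles(criteria))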
def suggest(limit=10):
    """ Return suggestions for query field autocompletion """
    limit = request.args.get("limit", limit)
    txt = request.args.get("q", "").strip()

    suggestions: List[Dict[str, str]] = []
    whois_prefix = "hver er "
    whatis_prefix = "hvað er "

    prefix = None
    if txt.lower().startswith(whois_prefix):
        prefix = whois_prefix
    elif txt.lower().startswith(whatis_prefix):
        prefix = whatis_prefix

    if not prefix:
        return better_jsonify(suggestions=suggestions)

    with SessionContext(read_only=True) as session:
        name = txt[len(prefix) :].strip()
        model_col = None

        # Hver er Jón Jónsson ?
        if prefix is whois_prefix and name[0].isupper():
            model_col = Person.name
        # Hver er seðlabankastjóri?
        elif prefix is whois_prefix:
            model_col = Person.title
        # Hvað er UNESCO?
        elif prefix is whatis_prefix:
            model_col = Entity.name

        assert model_col is not None

        q = (
            session.query(model_col, dbfunc.count(Article.id).label("total"))
            .filter(model_col.ilike(name + "%"))
            .join(Article)
            .group_by(model_col)
            .order_by(desc("total"))
            .limit(limit)
            .all()
        )

        prefix = prefix[:1].upper() + prefix[1:].lower()
        suggestions = [{"value": (prefix + p[0] + "?"), "data": ""} for p in q]

    return better_jsonify(suggestions=suggestions)
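# Illustrative client-side call to the suggest endpoint; the "/suggest" route
# path is an assumption made for this sketch, and only the "q" parameter
# handled above is passed.
import requests

def autocomplete(base_url, text="hver er seðlabanka"):
    r = requests.get(base_url + "/suggest", params={"q": text})
    return [s["value"] for s in r.json().get("suggestions", [])]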
def last_answer(self, *, within_minutes=5):
    """ Return the last answer given to this client, by default
        within the last 5 minutes (0=forever) """
    if not self._client_id:
        # Can't find the last answer if no client_id given
        return None

    # Find the newest non-error, no-repeat query result for this client
    q = (
        self._session.query(QueryRow.answer, QueryRow.voice)
        .filter(QueryRow.client_id == self._client_id)
        .filter(QueryRow.qtype != "Repeat")
        .filter(QueryRow.error == None)
    )
    if within_minutes > 0:
        # Apply a timestamp filter
        since = datetime.utcnow() - timedelta(minutes=within_minutes)
        q = q.filter(QueryRow.timestamp >= since)

    # Sort to get the newest query that fulfills the criteria
    last = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
    return None if last is None else tuple(last)
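# Hedged sketch of how last_answer() might be used by a "repeat last answer"
# handler on the same object; self stands for the enclosing instance that owns
# _session and _client_id, and this helper is not part of the original class.
def repeat_last(self):
    last = self.last_answer(within_minutes=5)
    if last is None:
        return None
    answer, voice = last
    return voice or answer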
def suggest(limit=10):
    """ Return suggestions for query field autocompletion """
    limit = request.args.get("limit", limit)
    txt = request.args.get("q", "").strip()

    suggestions = list()
    whois_prefix = "hver er "
    whatis_prefix = "hvað er "

    prefix = None
    if txt.lower().startswith(whois_prefix):
        prefix = whois_prefix
    elif txt.lower().startswith(whatis_prefix):
        prefix = whatis_prefix

    if not prefix:
        return better_jsonify(suggestions=suggestions)

    with SessionContext(read_only=True) as session:
        name = txt[len(prefix) :].strip()
        model_col = None

        # Hver er Jón Jónsson ?
        if prefix is whois_prefix and name[0].isupper():
            model_col = Person.name
        # Hver er seðlabankastjóri?
        elif prefix is whois_prefix:
            model_col = Person.title
        # Hvað er UNESCO?
        elif prefix is whatis_prefix:
            model_col = Entity.name

        q = (
            session.query(model_col, dbfunc.count(Article.id).label("total"))
            .filter(model_col.ilike(name + "%"))
            .join(Article)
            .group_by(model_col)
            .order_by(desc("total"))
            .limit(limit)
            .all()
        )

        prefix = prefix[:1].upper() + prefix[1:].lower()
        suggestions = [{"value": (prefix + p[0] + "?"), "data": ""} for p in q]

    return better_jsonify(suggestions=suggestions)
def parsefail():
    """ Handler for a page showing recent sentences where parsing failed """
    num = request.args.get("num", PARSEFAIL_DEFAULT)
    try:
        num = min(int(num), PARSEFAIL_MAX)
    except Exception:
        num = PARSEFAIL_DEFAULT

    with SessionContext(read_only=True) as session:
        q = (
            session.query(Article.id, Article.timestamp, Article.tokens)
            .filter(Article.tree != None)
            .filter(Article.timestamp != None)
            .filter(Article.timestamp <= datetime.utcnow())
            .filter(Article.heading > "")
            .filter(Article.num_sentences > 0)
            .filter(Article.num_sentences != Article.num_parsed)
            .order_by(desc(Article.timestamp))
            .limit(num)
        )

        sfails = []

        for a in q.all():
            try:
                tokens = json.loads(a.tokens)
            except Exception:
                continue
            # Paragraphs
            for p in tokens:
                # Sentences
                for s in p:
                    # Tokens
                    for t in s:
                        if "err" in t:
                            # Only add well-formed sentences that start
                            # with a capital letter and end with a period
                            if s[0]["x"][0].isupper() and s[-1]["x"] == ".":
                                sfails.append([s])
                            break

    return render_template(
        "parsefail.html", title="Ógreindar setningar", sentences=sfails, num=num
    )
def fetch_context(self, *, within_minutes=10):
    """ Return the context from the last answer given to this client,
        by default within the last 10 minutes (0=forever) """
    if not self._client_id:
        # Can't find the last answer if no client_id given
        return None

    # Find the newest non-error, no-repeat query result for this client
    q = (
        self._session.query(QueryRow.context)
        .filter(QueryRow.client_id == self._client_id)
        .filter(QueryRow.qtype != "Repeat")
        .filter(QueryRow.error == None)
    )
    if within_minutes > 0:
        # Apply a timestamp filter
        since = datetime.utcnow() - timedelta(minutes=within_minutes)
        q = q.filter(QueryRow.timestamp >= since)

    # Sort to get the newest query that fulfills the criteria
    ctx = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
    if ctx is None:
        return None
    # This function normally returns a dict that has been decoded from JSON
    return ctx[0]
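# Illustrative sketch using fetch_context() to resolve a follow-up question;
# the "person_name" key matches the context stored by query_title() above.
# This helper is not part of the original class.
def referenced_person(self):
    ctx = self.fetch_context(within_minutes=10)
    return None if not ctx else ctx.get("person_name")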
def parsefail():
    """ Handler for a page showing recent sentences where parsing failed """
    num = request.args.get("num", PARSEFAIL_DEFAULT)
    try:
        num = min(int(num), PARSEFAIL_MAX)
    except Exception:
        num = PARSEFAIL_DEFAULT

    with SessionContext(read_only=True) as session:
        q = (
            session.query(Article.id, Article.timestamp, Article.tokens)
            .filter(Article.tree != None)
            .filter(Article.timestamp != None)
            .filter(Article.timestamp <= datetime.utcnow())
            .filter(Article.heading > "")
            .filter(Article.num_sentences > 0)
            .filter(Article.num_sentences != Article.num_parsed)
            .order_by(desc(Article.timestamp))
            .limit(num)
        )

        sfails = []

        for a in q.all():
            tokens = json.loads(a.tokens)
            # Paragraphs
            for p in tokens:
                # Sentences
                for s in p:
                    # Tokens
                    for t in s:
                        if "err" in t:
                            # Only add well-formed sentences that start
                            # with a capital letter and end with a period
                            if s[0]["x"][0].isupper() and s[-1]["x"] == ".":
                                sfails.append([s])
                            break

    return render_template("parsefail.html", sentences=json.dumps(sfails), num=num)
def fetch_articles(
    topic=None,
    offset=0,
    limit=_DEFAULT_NUM_ARTICLES,
    start=None,
    location=None,
    country=None,
    root=None,
    author=None,
    enclosing_session=None,
):
    """ Return a list of articles in chronologically reversed order.
        Articles can be filtered by start date, location, country, root etc. """
    toplist = []

    with SessionContext(read_only=True, session=enclosing_session) as session:
        q = (
            session.query(Article)
            .filter(Article.tree != None)
            .filter(Article.timestamp != None)
            .filter(Article.timestamp <= datetime.utcnow())
            .filter(Article.heading > "")
            .filter(Article.num_sentences > 0)
            .join(Root)
            .filter(Root.visible == True)
        )

        # Filter by date
        if start is not None:
            q = q.filter(Article.timestamp > start)

        if location or country:
            q = q.join(Location)
            if location:
                # Filter by location
                q = q.filter(Location.name == location)
            if country:
                # Filter by country code
                q = q.filter(Location.country == country)

        # Filter by source (root) using domain (e.g. "kjarninn.is")
        if root:
            q = q.filter(Root.domain == root)

        # Filter by author name
        if author:
            q = q.filter(Article.author == author)

        # Filter by topic identifier
        if topic:
            q = q.join(ArticleTopic).join(Topic).filter(Topic.identifier == topic)

        q = q.order_by(desc(Article.timestamp)).offset(offset).limit(limit)

        class ArticleDisplay:
            """ Utility class to carry information about an article
                to the web template """

            def __init__(
                self,
                heading,
                timestamp,
                url,
                uuid,
                num_sentences,
                num_parsed,
                icon,
                localized_date,
                source,
            ):
                self.heading = heading
                self.timestamp = timestamp
                self.url = url
                self.uuid = uuid
                self.num_sentences = num_sentences
                self.num_parsed = num_parsed
                self.icon = icon
                self.localized_date = localized_date
                self.source = source

            @property
            def width(self):
                """ The ratio of parsed sentences to the total number of sentences,
                    expressed as a percentage string """
                if self.num_sentences == 0:
                    return "0%"
                return "{0}%".format((100 * self.num_parsed) // self.num_sentences)

            @property
            def time(self):
                return self.timestamp.isoformat()[11:16]

            @property
            def date(self):
                if datetime.today().year == self.timestamp.year:
                    return self.localized_date
                return self.fulldate

            @property
            def fulldate(self):
                return self.localized_date + self.timestamp.strftime(" %Y")

        with changedlocale(category="LC_TIME"):
            for a in q:
                # Instantiate article objects from results
                source = a.root.domain
                icon = source + ".png"
                locdate = a.timestamp.strftime("%-d. %b")

                d = ArticleDisplay(
                    heading=a.heading,
                    timestamp=a.timestamp,
                    url=a.url,
                    uuid=a.id,
                    num_sentences=a.num_sentences,
                    num_parsed=a.num_parsed,
                    icon=icon,
                    localized_date=locdate,
                    source=source,
                )
                toplist.append(d)

    return toplist
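# Minimal sketch of calling fetch_articles() for a single news source; the
# domain string comes from the example in the comment above, and the fields
# accessed are properties of the ArticleDisplay objects returned.
def latest_from_source(domain="kjarninn.is", n=25):
    articles = fetch_articles(root=domain, limit=n)
    return [(a.heading, a.url, a.width) for a in articles]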
def index():
    return render_template(
        'index.html',
        snatched=db.Snatched.select().order_by(db.desc(db.Snatched.date)).limit(20),
        announced=db.Announced.select().order_by(db.desc(db.Announced.date)).limit(20),
    )
def process_query(
    q,
    voice,
    *,
    auto_uppercase=False,
    location=None,
    remote_addr=None,
    client_id=None,
    client_type=None,
    client_version=None,
    bypass_cache=False,
    private=False,
):
    """ Process an incoming natural language query.
        If voice is True, return a voice-friendly string to
        be spoken to the user. If auto_uppercase is True, the
        string probably came from voice input and we need to
        intelligently guess which words in the query should be
        upper case (to the extent that it matters).
        The q parameter can either be a single query string or
        an iterable of strings that will be processed in order
        until a successful one is found. """
    now = datetime.utcnow()
    result = None
    client_id = client_id[:256] if client_id else None
    first_clean_q = None
    first_qtext = None

    with SessionContext(commit=True) as session:

        if isinstance(q, str):
            # This is a single string
            it = [q]
        else:
            # This should be an array of strings,
            # in decreasing priority order
            it = q

        # Iterate through the submitted query strings,
        # assuming that they are in decreasing order of probability,
        # attempting to execute them in turn until we find
        # one that works (or we're stumped)
        for qtext in it:

            qtext = qtext.strip()
            clean_q = qtext.rstrip("?")
            if first_clean_q is None:
                # Store the first (most likely) query string
                # that comes in from the speech-to-text processor,
                # since we want to return that one to the client
                # if no query string is matched - not the last
                # (least likely) query string
                first_clean_q = clean_q
                first_qtext = qtext

            # First, look in the query cache for the same question
            # (in lower case), having a not-expired answer
            cached_answer = None
            if voice and not bypass_cache:
                # Only use the cache for voice queries
                # (handling detailed responses in other queries
                # is too much for the cache)
                cached_answer = (
                    session.query(QueryRow)
                    .filter(QueryRow.question_lc == clean_q.lower())
                    .filter(QueryRow.expires >= now)
                    .order_by(desc(QueryRow.expires))
                    .limit(1)
                    .one_or_none()
                )
            if cached_answer is not None:
                # The same question is found in the cache and has not expired:
                # return the previous answer
                a = cached_answer
                result = dict(
                    valid=True,
                    q_raw=qtext,
                    q=a.bquestion,
                    answer=a.answer,
                    response=dict(answer=a.answer or ""),
                    voice=a.voice,
                    expires=a.expires,
                    qtype=a.qtype,
                    key=a.key,
                )
                # !!! TBD: Log the cached answer as well?
                return result

            query = Query(session, qtext, voice, auto_uppercase, location, client_id)
            result = query.execute()

            if result["valid"] and "error" not in result:
                # Successful: our job is done
                if not private:
                    # If not in private mode, log the result
                    try:
                        qrow = QueryRow(
                            timestamp=now,
                            interpretations=it,
                            question=clean_q,
                            # bquestion is the beautified query string
                            bquestion=result["q"],
                            answer=result["answer"],
                            voice=result.get("voice"),
                            # Only put an expiration on voice queries
                            expires=query.expires if voice else None,
                            qtype=result.get("qtype"),
                            key=result.get("key"),
                            latitude=location[0] if location else None,
                            longitude=location[1] if location else None,
                            # Client identifier
                            client_id=client_id,
                            client_type=client_type or None,
                            client_version=client_version or None,
                            # IP address
                            remote_addr=remote_addr or None,
                            # Context dict, stored as JSON, if present
                            # (set during query execution)
                            context=query.context,
                            # All other fields are set to NULL
                        )
                        session.add(qrow)
                    except Exception as e:
                        logging.error("Error logging query: {0}".format(e))
                return result

        # Failed to answer the query, i.e. no query processor
        # module was able to parse the query and provide an answer
        result = result or dict(valid=False, error="E_NO_RESULT")
        if first_clean_q:
            # Re-insert the query data from the first (most likely)
            # string returned from the speech-to-text processor,
            # replacing residual data that otherwise would be there
            # from the last (least likely) query string
            result["q_raw"] = first_qtext
            result["q"] = beautify_query(first_qtext)
            # Attempt to include a helpful response in the result
            Query.try_to_help(first_clean_q, result)
            # Log the failure
            qrow = QueryRow(
                timestamp=now,
                interpretations=it,
                question=first_clean_q,
                bquestion=result["q"],
                answer=result.get("answer"),
                voice=result.get("voice"),
                error=result.get("error"),
                latitude=location[0] if location else None,
                longitude=location[1] if location else None,
                # Client identifier
                client_id=client_id,
                client_type=client_type or None,
                client_version=client_version or None,
                # IP address
                remote_addr=remote_addr or None,
                # All other fields are set to NULL
            )
            session.add(qrow)

        return result
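# Hedged sketch of a caller passing multiple speech-to-text alternatives to
# process_query(); the query strings, coordinates and helper name are
# illustrative only and not part of the original module.
def answer_voice_query(alternatives, lat, lon, client_id):
    result = process_query(
        alternatives,  # iterable in decreasing order of probability
        True,          # voice=True: request a speakable answer
        location=(lat, lon),
        client_id=client_id,
    )
    if result.get("valid") and "error" not in result:
        return result.get("voice") or result.get("answer")
    return None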
def top_locations(limit=_TOP_LOC_LENGTH, kind=None, days=_TOP_LOC_PERIOD):
    """ Return a list of recent locations along with the list of
        articles in which they are mentioned """
    with SessionContext(read_only=True) as session:
        q = (
            session.query(
                Location.name,
                Location.kind,
                Location.country,
                Location.article_url,
                Location.latitude,
                Location.longitude,
                Article.id,
                Article.heading,
                Root.domain,
            )
            .join(Article, Article.url == Location.article_url)
            .filter(Article.timestamp > datetime.utcnow() - timedelta(days=days))
            .join(Root)
            .filter(Root.visible)
        )

        # Filter by kind
        if kind:
            q = q.filter(Location.kind == kind)

        q = q.order_by(desc(Article.timestamp))

        # Group articles by unique location
        locs = defaultdict(list)
        for r in q.all():
            article = {
                "url": r.article_url,
                "id": r.id,
                "heading": r.heading,
                "domain": r.domain,
            }
            k = (r.name, r.kind, r.country, r.latitude, r.longitude)
            locs[k].append(article)

        # Create top locations list sorted by article count
        loclist = []
        for k, v in locs.items():
            (name, kind, country, lat, lon) = k  # Unpack tuple key

            # Google map links currently use the placename instead of
            # coordinates. This works well for most Icelandic and
            # international placenames, but fails on some.
            map_url = GMAPS_PLACE_URL.format(name)
            # if lat and lon:
            #     map_url = GMAPS_COORD_URL.format(lat, lon, "7z")

            loclist.append(
                {
                    "name": name,
                    "kind": kind,
                    "country": country,
                    "map_url": map_url,
                    "articles": v,
                }
            )

        loclist.sort(key=lambda x: len(x["articles"]), reverse=True)

        return loclist[:limit]