Example #1
def query_title(query: Query, session: Session, title: str) -> AnswerTuple:
    """ A query for a person by title """
    # !!! Consider doing a LIKE '%title%', not just LIKE 'title%'
    # We impose a LIMIT of 1024 on each query result,
    # since the query may return many names (for instance 'Hver er formaður?'),
    # and getting more name mentions than this is not likely to significantly
    # affect the outcome.
    QUERY_LIMIT = 1024
    rd: RegisterType = defaultdict(dict)
    title_lc = title.lower()  # Query by lowercase title
    q = (session.query(
        Person.name,
        Article.id,
        Article.timestamp,
        Article.heading,
        Root.domain,
        Article.url,
    ).filter(
        Person.title_lc.like(title_lc + " %")
        | (Person.title_lc == title_lc)).filter(Root.visible == True).join(
            Article, Article.url == Person.article_url).join(Root).order_by(
                desc(cast(Column,
                          Article.timestamp))).limit(QUERY_LIMIT).all())
    # Append names from the persons table
    append_names(rd, q, prop_func=lambda x: x.name)
    # Also append definitions from the entities table, if any
    q = (session.query(
        Entity.name,
        Article.id,
        Article.timestamp,
        Article.heading,
        Root.domain,
        Article.url,
    ).filter(Entity.definition == title).filter(Root.visible == True).join(
        Article, Article.url == Entity.article_url).join(Root).order_by(
            desc(cast(Column, Article.timestamp))).limit(QUERY_LIMIT).all())
    append_names(rd, q, prop_func=lambda x: x.name)
    response = make_response_list(rd)
    answer: str
    voice_answer: str
    if response and title and "answer" in response[0]:
        first_response = response[0]
        # Return 'Seðlabankastjóri er Már Guðmundsson.'
        upper_title = cap_first(title)
        answer = first_response["answer"]
        voice_answer = upper_title + " er " + answer + "."
        # Store the person name in the query context
        # so it can be referred to in subsequent queries
        query.set_context({"person_name": answer})
        if first_response.get("sources"):
            first_source = first_response["sources"][0]["domain"]
            query.set_source(first_source)
    else:
        answer = "Ekkert nafn finnst með titilinn '" + title + "'."
        voice_answer = "Ég veit ekki hver er " + title + "."
    return response, answer, voice_answer
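The pattern above (a prefix-or-exact LIKE match, newest-first ordering with desc(), and a hard LIMIT) can be reproduced in isolation. A minimal sketch, assuming SQLAlchemy 1.4+ and an in-memory SQLite database; the Job model and its columns are hypothetical stand-ins for Greynir's Person/Article tables:

from sqlalchemy import Column, DateTime, Integer, String, create_engine, desc
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Job(Base):
    # Hypothetical table standing in for Person joined with Article
    __tablename__ = "jobs"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    title_lc = Column(String)
    timestamp = Column(DateTime)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

title_lc = "seðlabankastjóri"
rows = (
    session.query(Job.name, Job.timestamp)
    # Match the exact title, or any title starting with it
    .filter(Job.title_lc.like(title_lc + " %") | (Job.title_lc == title_lc))
    # Newest mentions first, capped at 1024 rows
    .order_by(desc(Job.timestamp))
    .limit(1024)
    .all()
)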
Example #2
    def sentence_stream(limit=None, skip=None, skip_errors=True):
        """ Generator of a sentence stream consisting of `limit`
            sentences (or fewer) from the most recently parsed articles.
            Each sentence is a list of token dicts. """
        with SessionContext(commit=True, read_only=True) as session:

            q = (session.query(
                ArticleRow.url, ArticleRow.parsed,
                ArticleRow.tokens).filter(ArticleRow.tokens != None).order_by(
                    desc(ArticleRow.parsed)).yield_per(200))

            count = 0
            skipped = 0
            for a in q:
                doc = json.loads(a.tokens)
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        if skip is not None and skipped < skip:
                            # If requested, skip sentences from the front
                            # (useful for test set)
                            skipped += 1
                            continue
                        # Yield the sentence as a fresh token list
                        yield [t for t in sent]
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
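A hedged usage sketch of the generator above: it can be consumed lazily, for instance to build a small evaluation set. The enclosing class is not shown here, so the Article wrapper name is an assumption; the "x" key for token text matches the token dicts used elsewhere on this page.

# Hypothetical consumer; assumes sentence_stream() is exposed as a
# static method on Greynir's Article wrapper class
test_set = []
for sent in Article.sentence_stream(limit=1000, skip=200, skip_errors=True):
    # Each sentence is a list of token dicts; join the token texts
    test_set.append(" ".join(t.get("x", "") for t in sent))
print(len(test_set), "sentences collected")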
Example #3
    def token_stream(
            limit: Optional[int] = None,
            skip_errors: bool = True) -> Iterator[Optional[TokenDict]]:
        """ Generator of a token stream consisting of `limit` sentences
            (or fewer) from the most recently parsed articles. After
            each sentence, None is yielded. """
        with SessionContext(commit=True, read_only=True) as session:

            q: SqlQuery[ArticleRow] = (session.query(
                ArticleRow.url, ArticleRow.parsed,
                ArticleRow.tokens).filter(ArticleRow.tokens != None).order_by(
                    desc(cast(Column, ArticleRow.parsed))).yield_per(200))

            count = 0
            for a in q:
                assert a is not None
                if not a.tokens:
                    continue
                doc = cast(PgsList, json.loads(a.tokens))
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        for t in sent:
                            # Yield the tokens
                            yield t
                        yield None  # End-of-sentence marker
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
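A sketch of how the None end-of-sentence markers can be consumed downstream: the flat token stream is regrouped back into sentences. The Article wrapper name is again an assumption.

# Hypothetical consumer that rebuilds sentences from the flat stream;
# the yielded tokens are plain dicts, so no extra imports are needed
sentences = []
current = []
for tok in Article.token_stream(limit=500):
    if tok is None:
        # None marks the end of a sentence
        if current:
            sentences.append(current)
        current = []
    else:
        current.append(tok)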
Example #4
    def token_stream(limit=None, skip_errors=True):
        """ Generator of a token stream consisting of `limit` sentences
            (or fewer) from the most recently parsed articles. After
            each sentence, None is yielded. """
        with SessionContext(commit=True, read_only=True) as session:

            q = (session.query(
                ArticleRow.url, ArticleRow.parsed,
                ArticleRow.tokens).filter(ArticleRow.tokens != None).order_by(
                    desc(ArticleRow.parsed)).yield_per(200))

            count = 0
            for a in q:
                doc = json.loads(a.tokens)
                for pg in doc:
                    for sent in pg:
                        if not sent:
                            continue
                        if skip_errors and any("err" in t for t in sent):
                            # Skip error sentences
                            continue
                        for t in sent:
                            # Yield the tokens
                            yield t
                        yield None  # End-of-sentence marker
                        # Are we done?
                        count += 1
                        if limit is not None and count >= limit:
                            return
Example #5
def top_locations(limit=_TOP_LOC_LENGTH, kind=None, days=_TOP_LOC_PERIOD):
    """ Return a list of recent locations along with the list of
        articles in which they are mentioned """

    with SessionContext(read_only=True) as session:
        q = (session.query(
            Location.name,
            Location.kind,
            Location.country,
            Location.article_url,
            Location.latitude,
            Location.longitude,
            Article.id,
            Article.heading,
            Root.domain,
        ).join(Article, Article.url == Location.article_url).filter(
            Article.timestamp > datetime.utcnow() -
            timedelta(days=days)).join(Root).filter(Root.visible))

        # Filter by kind
        if kind:
            q = q.filter(Location.kind == kind)

        q = q.order_by(desc(Article.timestamp))

        # Group articles by unique location
        locs = defaultdict(list)
        for r in q.all():
            article = {
                "url": r.article_url,
                "id": r.id,
                "heading": r.heading,
                "domain": r.domain,
            }
            k = (r.name, r.kind, r.country, r.latitude, r.longitude)
            locs[k].append(article)

        # Create top locations list sorted by article count
        loclist = []
        for k, v in locs.items():
            (name, kind, country, lat, lon) = k  # Unpack tuple key
            # Google map links currently use the placename instead of
            # coordinates. This works well for most Icelandic and
            # international placenames, but fails on some.
            map_url = GMAPS_PLACE_URL.format(name)
            # if lat and lon:
            #     map_url = GMAPS_COORD_URL.format(lat, lon, "7z")

            loclist.append({
                "name": name,
                "kind": kind,
                "country": country,
                "map_url": map_url,
                "articles": v,
            })
        loclist.sort(key=lambda x: len(x["articles"]), reverse=True)

        return loclist[:limit]
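The grouping-and-ranking step in the middle of top_locations is a general pattern: bucket rows by a composite key with defaultdict(list), then sort the buckets by size. A self-contained illustration with made-up rows:

from collections import defaultdict

# Made-up (name, kind, country) rows standing in for the query results
rows = [
    ("Reykjavík", "placename", "IS"),
    ("Reykjavík", "placename", "IS"),
    ("Oslo", "placename", "NO"),
]
groups = defaultdict(list)
for name, kind, country in rows:
    groups[(name, kind, country)].append({"name": name})

# Most frequently mentioned locations first
ranked = sorted(groups.items(), key=lambda kv: len(kv[1]), reverse=True)
print([key[0] for key, articles in ranked])  # ['Reykjavík', 'Oslo']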
Example #6
def recent_persons(limit=_RECENT_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()

    with SessionContext(read_only=True) as session:

        q = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[
                0 : limit * 2
            ]  # Go through up to 2 * N records
        )

        def is_better_title(new_title, old_title):
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= _MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= _MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is longer than the previous one
                if p.name not in toplist or is_better_title(
                    p.title, toplist[p.name][0]
                ):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted: terminate the loop
                        break

    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )
Example #7
def wordfreq_details():
    """ Return list of articles containing certain words over a given period. """
    resp: Dict[str, Any] = dict(err=True)

    words = _str2words(request.args.get("words"))
    if not words:
        return better_jsonify(**resp)

    # Parse date args
    try:
        date_fmt = "%Y-%m-%d"
        date_from = datetime.strptime(request.args.get("date_from", ""),
                                      date_fmt)
        dto = request.args.get("date_to")
        if dto:
            date_to = datetime.strptime(dto, date_fmt)
        else:
            # If only one date provided, assume it's a period spanning a single day
            date_to = date_from + timedelta(days=1)
    except Exception as e:
        logging.warning("Failed to parse date arg: {0}".format(e))
        return better_jsonify(**resp)

    # Fetch list of articles for each word for the given period
    wlist = list()
    colors = list(_LINE_COLORS)
    with SessionContext(read_only=True) as session:
        for wd, cat in words:
            q = (session.query(
                Article.id, Article.heading, Root.domain, Word.cnt,
                Word.stem).join(Article, Article.id == Word.article_id).filter(
                    Article.timestamp >= date_from).filter(
                        Article.timestamp < date_to).filter(
                            Word.stem == wd).filter(
                                Word.cat == cat).join(Root).order_by(
                                    desc(cast(Column, Article.timestamp))))
            articles = [{
                "id": a[0],
                "heading": a[1],
                "domain": a[2],
                "cnt": a[3]
            } for a in q.all()]
            wlist.append({
                "word": wd,
                "cat": cat,
                "cnt": sum(a["cnt"] for a in articles),
                "articles": articles,
                "color": colors.pop(0),
                "desc": _desc4word((wd, cat)),
            })

    resp["err"] = False
    resp["payload"] = render_template("words/details.html", words=wlist)
    return better_jsonify(**resp)
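The date handling above defaults a missing date_to to a one-day window. The same logic in isolation, with a hypothetical helper name:

from datetime import datetime, timedelta

def parse_window(date_from_str, date_to_str=None, fmt="%Y-%m-%d"):
    # Hypothetical helper mirroring the request-arg parsing above
    date_from = datetime.strptime(date_from_str, fmt)
    if date_to_str:
        date_to = datetime.strptime(date_to_str, fmt)
    else:
        # Only one date provided: assume a period spanning a single day
        date_to = date_from + timedelta(days=1)
    return date_from, date_to

print(parse_window("2020-05-01"))  # (2020-05-01 00:00, 2020-05-02 00:00)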
Example #8
    def articles(
            cls,
            criteria: Mapping[str, Any],
            enclosing_session: Optional[Session] = None
    ) -> Iterator["Article"]:
        """ Generator of Article objects from the database that
            meet the given criteria """
        # The criteria are currently "timestamp", "author" and "domain",
        # as well as "order_by_parse" which if True indicates that the result
        # should be ordered with the most recently parsed articles first.
        with SessionContext(commit=True,
                            read_only=True,
                            session=enclosing_session) as session:

            # Only fetch articles that have a parse tree
            q: SqlQuery[ArticleRow] = session.query(ArticleRow).filter(
                ArticleRow.tree != None)

            # timestamp is assumed to contain a tuple: (from, to)
            if criteria and "timestamp" in criteria:
                ts = criteria["timestamp"]
                q = q.filter(ArticleRow.timestamp >= ts[0]).filter(
                    ArticleRow.timestamp < ts[1])

            if criteria and "author" in criteria:
                author = criteria["author"]
                q = q.filter(ArticleRow.author == author)

            if criteria and ("visible" in criteria or "domain" in criteria):
                # Need a join with Root for these criteria
                q = q.join(Root)
                if "visible" in criteria:
                    # Return only articles from roots with the specified visibility
                    visible = criteria["visible"]
                    assert isinstance(visible, bool)
                    q = q.filter(Root.visible == visible)
                if "domain" in criteria:
                    # Return only articles from the specified domain
                    domain = criteria["domain"]
                    assert isinstance(domain, str)
                    q = q.filter(Root.domain == domain)

            if criteria and criteria.get("order_by_parse"):
                # Order with newest parses first
                q = q.order_by(desc(cast(Column, ArticleRow.parsed)))

            parsed_after = criteria.get("parse_date_gt")
            if parsed_after is not None:
                q = q.filter(ArticleRow.parsed >= parsed_after)

            for arow in q.yield_per(500):
                yield cls._init_from_row(arow)
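A hedged usage sketch of the criteria mapping handled above. The wrapper class is referred to as Article here, which is an assumption; the criteria keys are the ones the method actually inspects, and the values are arbitrary examples.

from datetime import datetime

criteria = {
    # (from, to) tuple, compared against ArticleRow.timestamp
    "timestamp": (datetime(2020, 1, 1), datetime(2021, 1, 1)),
    "domain": "kjarninn.is",   # example domain; triggers the Root join
    "visible": True,
    "order_by_parse": True,    # newest parses first
}
for article in Article.articles(criteria):
    ...  # each yielded object is built via cls._init_from_row()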
Example #9
def suggest(limit=10):
    """ Return suggestions for query field autocompletion """
    limit = request.args.get("limit", limit)
    txt = request.args.get("q", "").strip()

    suggestions: List[Dict[str, str]] = []
    whois_prefix = "hver er "
    whatis_prefix = "hvað er "

    prefix = None
    if txt.lower().startswith(whois_prefix):
        prefix = whois_prefix
    elif txt.lower().startswith(whatis_prefix):
        prefix = whatis_prefix

    if not prefix:
        return better_jsonify(suggestions=suggestions)

    with SessionContext(read_only=True) as session:
        name = txt[len(prefix) :].strip()
        model_col = None

        # Hver er Jón Jónsson ?
        if prefix is whois_prefix and name[0].isupper():
            model_col = Person.name
        # Hver er seðlabankastjóri?
        elif prefix is whois_prefix:
            model_col = Person.title
        # Hvað er UNESCO?
        elif prefix is whatis_prefix:
            model_col = Entity.name

        assert model_col is not None

        q = (
            session.query(model_col, dbfunc.count(Article.id).label("total"))
            .filter(model_col.ilike(name + "%"))
            .join(Article)
            .group_by(model_col)
            .order_by(desc("total"))
            .limit(limit)
            .all()
        )

        prefix = prefix[:1].upper() + prefix[1:].lower()
        suggestions = [{"value": (prefix + p[0] + "?"), "data": ""} for p in q]

    return better_jsonify(suggestions=suggestions)
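The aggregation used in the suggest query (count matches per name, then order by the count descending) can be shown standalone. A minimal sketch, assuming SQLAlchemy 1.4+ and an in-memory SQLite database; the Mention model is a hypothetical stand-in for Person/Entity joined with Article:

from sqlalchemy import Column, Integer, String, create_engine, desc, func
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Mention(Base):
    # Hypothetical table standing in for name/entity mentions per article
    __tablename__ = "mentions"
    id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
session.add_all([Mention(name="UNESCO"), Mention(name="UNESCO"), Mention(name="UNICEF")])

rows = (
    session.query(Mention.name, func.count(Mention.id).label("total"))
    .filter(Mention.name.ilike("UN" + "%"))
    .group_by(Mention.name)
    .order_by(desc("total"))
    .limit(10)
    .all()
)
print(rows)  # [('UNESCO', 2), ('UNICEF', 1)]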
Example #10
    def last_answer(self, *, within_minutes=5):
        """ Return the last answer given to this client, by default
            within the last 5 minutes (0=forever) """
        if not self._client_id:
            # Can't find the last answer if no client_id given
            return None
        # Find the newest non-error, no-repeat query result for this client
        q = (self._session.query(QueryRow.answer, QueryRow.voice).filter(
            QueryRow.client_id == self._client_id).filter(
                QueryRow.qtype != "Repeat").filter(QueryRow.error == None))
        if within_minutes > 0:
            # Apply a timestamp filter
            since = datetime.utcnow() - timedelta(minutes=within_minutes)
            q = q.filter(QueryRow.timestamp >= since)
        # Sort to get the newest query that fulfills the criteria
        last = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
        return None if last is None else tuple(last)
Example #11
def suggest(limit=10):
    """ Return suggestions for query field autocompletion """
    limit = request.args.get("limit", limit)
    txt = request.args.get("q", "").strip()

    suggestions = list()
    whois_prefix = "hver er "
    whatis_prefix = "hvað er "

    prefix = None
    if txt.lower().startswith(whois_prefix):
        prefix = whois_prefix
    elif txt.lower().startswith(whatis_prefix):
        prefix = whatis_prefix

    if not prefix:
        return better_jsonify(suggestions=suggestions)

    with SessionContext(read_only=True) as session:
        name = txt[len(prefix) :].strip()
        model_col = None

        # Hver er Jón Jónsson ?
        if prefix is whois_prefix and name[0].isupper():
            model_col = Person.name
        # Hver er seðlabankastjóri?
        elif prefix is whois_prefix:
            model_col = Person.title
        # Hvað er UNESCO?
        elif prefix is whatis_prefix:
            model_col = Entity.name

        q = (
            session.query(model_col, dbfunc.count(Article.id).label("total"))
            .filter(model_col.ilike(name + "%"))
            .join(Article)
            .group_by(model_col)
            .order_by(desc("total"))
            .limit(limit)
            .all()
        )

        prefix = prefix[:1].upper() + prefix[1:].lower()
        suggestions = [{"value": (prefix + p[0] + "?"), "data": ""} for p in q]

    return better_jsonify(suggestions=suggestions)
Example #12
def parsefail():
    """ Handler for a page showing recent sentences where parsing failed """

    num = request.args.get("num", PARSEFAIL_DEFAULT)
    try:
        num = min(int(num), PARSEFAIL_MAX)
    except Exception:
        num = PARSEFAIL_DEFAULT

    with SessionContext(read_only=True) as session:
        q = (
            session.query(Article.id, Article.timestamp, Article.tokens)
            .filter(Article.tree != None)
            .filter(Article.timestamp != None)
            .filter(Article.timestamp <= datetime.utcnow())
            .filter(Article.heading > "")
            .filter(Article.num_sentences > 0)
            .filter(Article.num_sentences != Article.num_parsed)
            .order_by(desc(Article.timestamp))
            .limit(num)
        )

        sfails = []

        for a in q.all():
            try:
                tokens = json.loads(a.tokens)
            except Exception:
                continue
            # Paragraphs
            for p in tokens:
                # Sentences
                for s in p:
                    # Tokens
                    for t in s:
                        if "err" in t:
                            # Only add well-formed sentences that start
                            # with a capital letter and end with a period
                            if s[0]["x"][0].isupper() and s[-1]["x"] == ".":
                                sfails.append([s])
                                break

    return render_template(
        "parsefail.html", title="Ógreindar setningar", sentences=sfails, num=num
    )
Example #13
    def fetch_context(self, *, within_minutes=10):
        """ Return the context from the last answer given to this client,
            by default within the last 10 minutes (0=forever) """
        if not self._client_id:
            # Can't find the last answer if no client_id given
            return None
        # Find the newest non-error, no-repeat query result for this client
        q = (self._session.query(QueryRow.context).filter(
            QueryRow.client_id == self._client_id).filter(
                QueryRow.qtype != "Repeat").filter(QueryRow.error == None))
        if within_minutes > 0:
            # Apply a timestamp filter
            since = datetime.utcnow() - timedelta(minutes=within_minutes)
            q = q.filter(QueryRow.timestamp >= since)
        # Sort to get the newest query that fulfills the criteria
        ctx = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
        if ctx is None:
            return None
        # This function normally returns a dict that has been decoded from JSON
        return ctx[0]
Example #14
def parsefail():
    """ Handler for a page showing recent sentences where parsing failed """

    num = request.args.get("num", PARSEFAIL_DEFAULT)
    try:
        num = min(int(num), PARSEFAIL_MAX)
    except Exception:
        num = PARSEFAIL_DEFAULT

    with SessionContext(read_only=True) as session:
        q = (
            session.query(Article.id, Article.timestamp, Article.tokens)
            .filter(Article.tree != None)
            .filter(Article.timestamp != None)
            .filter(Article.timestamp <= datetime.utcnow())
            .filter(Article.heading > "")
            .filter(Article.num_sentences > 0)
            .filter(Article.num_sentences != Article.num_parsed)
            .order_by(desc(Article.timestamp))
            .limit(num)
        )

        sfails = []

        for a in q.all():
            tokens = json.loads(a.tokens)
            # Paragraphs
            for p in tokens:
                # Sentences
                for s in p:
                    # Tokens
                    for t in s:
                        if "err" in t:
                            # Only add well-formed sentences that start
                            # with a capital letter and end with a period
                            if s[0]["x"][0].isupper() and s[-1]["x"] == ".":
                                sfails.append([s])
                                break

    return render_template("parsefail.html", sentences=json.dumps(sfails), num=num)
Example #15
def fetch_articles(
    topic=None,
    offset=0,
    limit=_DEFAULT_NUM_ARTICLES,
    start=None,
    location=None,
    country=None,
    root=None,
    author=None,
    enclosing_session=None,
):
    """ Return a list of articles in chronologically reversed order.
        Articles can be filtered by start date, location, country, root etc. """
    toplist = []

    with SessionContext(read_only=True, session=enclosing_session) as session:
        q = (session.query(Article).filter(Article.tree != None).filter(
            Article.timestamp != None).filter(
                Article.timestamp <= datetime.utcnow()).filter(
                    Article.heading > "").filter(
                        Article.num_sentences > 0).join(Root).filter(
                            Root.visible == True))

        # Filter by date
        if start is not None:
            q = q.filter(Article.timestamp > start)

        if location or country:
            q = q.join(Location)
            if location:
                # Filter by location
                q = q.filter(Location.name == location)
            if country:
                # Filter by country code
                q = q.filter(Location.country == country)

        # Filter by source (root) using domain (e.g. "kjarninn.is")
        if root:
            q = q.filter(Root.domain == root)

        # Filter by author name
        if author:
            q = q.filter(Article.author == author)

        # Filter by topic identifier
        if topic:
            q = q.join(ArticleTopic).join(Topic).filter(
                Topic.identifier == topic)

        q = q.order_by(desc(Article.timestamp)).offset(offset).limit(limit)

        class ArticleDisplay:
            """ Utility class to carry information about an article to the web template """
            def __init__(
                self,
                heading,
                timestamp,
                url,
                uuid,
                num_sentences,
                num_parsed,
                icon,
                localized_date,
                source,
            ):
                self.heading = heading
                self.timestamp = timestamp
                self.url = url
                self.uuid = uuid
                self.num_sentences = num_sentences
                self.num_parsed = num_parsed
                self.icon = icon
                self.localized_date = localized_date
                self.source = source

            @property
            def width(self):
                """ The ratio of parsed sentences to the total number of sentences,
                    expressed as a percentage string """
                if self.num_sentences == 0:
                    return "0%"
                return "{0}%".format(
                    (100 * self.num_parsed) // self.num_sentences)

            @property
            def time(self):
                return self.timestamp.isoformat()[11:16]

            @property
            def date(self):
                if datetime.today().year == self.timestamp.year:
                    return self.localized_date
                return self.fulldate

            @property
            def fulldate(self):
                return self.localized_date + self.timestamp.strftime(" %Y")

        with changedlocale(category="LC_TIME"):
            for a in q:
                # Instantiate article objects from results
                source = a.root.domain
                icon = source + ".png"
                locdate = a.timestamp.strftime("%-d. %b")

                d = ArticleDisplay(
                    heading=a.heading,
                    timestamp=a.timestamp,
                    url=a.url,
                    uuid=a.id,
                    num_sentences=a.num_sentences,
                    num_parsed=a.num_parsed,
                    icon=icon,
                    localized_date=locdate,
                    source=source,
                )
                toplist.append(d)

    return toplist
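A hedged usage sketch of fetch_articles; the filter values are arbitrary examples, and the attributes read in the loop are the ArticleDisplay properties defined above.

from datetime import datetime, timedelta

articles = fetch_articles(
    limit=25,
    start=datetime.utcnow() - timedelta(days=7),
    location="Reykjavík",      # example placename
    root="kjarninn.is",        # example source domain
)
for a in articles:
    print(a.heading, a.date, a.width)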
Example #16
def index():
    return render_template('index.html',
                           snatched=db.Snatched.select().order_by(
                               db.desc(db.Snatched.date)).limit(20),
                           announced=db.Announced.select().order_by(
                               db.desc(db.Announced.date)).limit(20))
Example #17
def process_query(q,
                  voice,
                  *,
                  auto_uppercase=False,
                  location=None,
                  remote_addr=None,
                  client_id=None,
                  client_type=None,
                  client_version=None,
                  bypass_cache=False,
                  private=False):
    """ Process an incoming natural language query.
        If voice is True, return a voice-friendly string to
        be spoken to the user. If auto_uppercase is True,
        the string probably came from voice input and we
        need to intelligently guess which words in the query
        should be upper case (to the extent that it matters).
        The q parameter can either be a single query string
        or an iterable of strings that will be processed in
        order until a successful one is found. """

    now = datetime.utcnow()
    result = None
    client_id = client_id[:256] if client_id else None
    first_clean_q = None
    first_qtext = None

    with SessionContext(commit=True) as session:

        if isinstance(q, str):
            # This is a single string
            it = [q]
        else:
            # This should be an array of strings,
            # in decreasing priority order
            it = q

        # Iterate through the submitted query strings,
        # assuming that they are in decreasing order of probability,
        # attempting to execute them in turn until we find
        # one that works (or we're stumped)

        for qtext in it:

            qtext = qtext.strip()
            clean_q = qtext.rstrip("?")
            if first_clean_q is None:
                # Store the first (most likely) query string
                # that comes in from the speech-to-text processor,
                # since we want to return that one to the client
                # if no query string is matched - not the last
                # (least likely) query string
                first_clean_q = clean_q
                first_qtext = qtext
            # First, look in the query cache for the same question
            # (in lower case), having a not-expired answer
            cached_answer = None
            if voice and not bypass_cache:
                # Only use the cache for voice queries
                # (handling detailed responses in other queries
                # is too much for the cache)
                cached_answer = (session.query(QueryRow).filter(
                    QueryRow.question_lc == clean_q.lower()).filter(
                        QueryRow.expires >= now).order_by(
                            desc(QueryRow.expires)).limit(1).one_or_none())
            if cached_answer is not None:
                # The same question is found in the cache and has not expired:
                # return the previous answer
                a = cached_answer
                result = dict(
                    valid=True,
                    q_raw=qtext,
                    q=a.bquestion,
                    answer=a.answer,
                    response=dict(answer=a.answer or ""),
                    voice=a.voice,
                    expires=a.expires,
                    qtype=a.qtype,
                    key=a.key,
                )
                # !!! TBD: Log the cached answer as well?
                return result
            query = Query(session, qtext, voice, auto_uppercase, location,
                          client_id)
            result = query.execute()
            if result["valid"] and "error" not in result:
                # Successful: our job is done
                if not private:
                    # If not in private mode, log the result
                    try:
                        qrow = QueryRow(
                            timestamp=now,
                            interpretations=it,
                            question=clean_q,
                            # bquestion is the beautified query string
                            bquestion=result["q"],
                            answer=result["answer"],
                            voice=result.get("voice"),
                            # Only put an expiration on voice queries
                            expires=query.expires if voice else None,
                            qtype=result.get("qtype"),
                            key=result.get("key"),
                            latitude=location[0] if location else None,
                            longitude=location[1] if location else None,
                            # Client identifier
                            client_id=client_id,
                            client_type=client_type or None,
                            client_version=client_version or None,
                            # IP address
                            remote_addr=remote_addr or None,
                            # Context dict, stored as JSON, if present
                            # (set during query execution)
                            context=query.context,
                            # All other fields are set to NULL
                        )
                        session.add(qrow)
                    except Exception as e:
                        logging.error("Error logging query: {0}".format(e))
                return result

        # Failed to answer the query, i.e. no query processor
        # module was able to parse the query and provide an answer
        result = result or dict(valid=False, error="E_NO_RESULT")
        if first_clean_q:
            # Re-insert the query data from the first (most likely)
            # string returned from the speech-to-text processor,
            # replacing residual data that otherwise would be there
            # from the last (least likely) query string
            result["q_raw"] = first_qtext
            result["q"] = beautify_query(first_qtext)
            # Attempt to include a helpful response in the result
            Query.try_to_help(first_clean_q, result)

            # Log the failure
            qrow = QueryRow(
                timestamp=now,
                interpretations=it,
                question=first_clean_q,
                bquestion=result["q"],
                answer=result.get("answer"),
                voice=result.get("voice"),
                error=result.get("error"),
                latitude=location[0] if location else None,
                longitude=location[1] if location else None,
                # Client identifier
                client_id=client_id,
                client_type=client_type or None,
                client_version=client_version or None,
                # IP address
                remote_addr=remote_addr or None
                # All other fields are set to NULL
            )
            session.add(qrow)

        return result
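A hedged usage sketch of process_query; the q argument may be a single string or an iterable of alternative transcriptions in decreasing priority order. The query text is an example from elsewhere on this page, and the client fields are hypothetical.

result = process_query(
    ["hver er seðlabankastjóri", "hver er seðlabanka stjóri"],
    voice=True,
    auto_uppercase=True,
    client_id="hypothetical-client-uuid",
    client_type="ios",
    client_version="1.0",
)
if result["valid"] and "error" not in result:
    print(result["answer"])
    print(result.get("voice"))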
Example #18
def top_locations(limit=_TOP_LOC_LENGTH, kind=None, days=_TOP_LOC_PERIOD):
    """ Return a list of recent locations along with the list of
        articles in which they are mentioned """

    with SessionContext(read_only=True) as session:
        q = (
            session.query(
                Location.name,
                Location.kind,
                Location.country,
                Location.article_url,
                Location.latitude,
                Location.longitude,
                Article.id,
                Article.heading,
                Root.domain,
            )
            .join(Article, Article.url == Location.article_url)
            .filter(Article.timestamp > datetime.utcnow() - timedelta(days=days))
            .join(Root)
            .filter(Root.visible)
        )

        # Filter by kind
        if kind:
            q = q.filter(Location.kind == kind)

        q = q.order_by(desc(Article.timestamp))

        # Group articles by unique location
        locs = defaultdict(list)
        for r in q.all():
            article = {
                "url": r.article_url,
                "id": r.id,
                "heading": r.heading,
                "domain": r.domain,
            }
            k = (r.name, r.kind, r.country, r.latitude, r.longitude)
            locs[k].append(article)

        # Create top locations list sorted by article count
        loclist = []
        for k, v in locs.items():
            (name, kind, country, lat, lon) = k  # Unpack tuple key
            # Google map links currently use the placename instead of
            # coordinates. This works well for most Icelandic and
            # international placenames, but fails on some.
            map_url = GMAPS_PLACE_URL.format(name)
            # if lat and lon:
            #     map_url = GMAPS_COORD_URL.format(lat, lon, "7z")

            loclist.append(
                {
                    "name": name,
                    "kind": kind,
                    "country": country,
                    "map_url": map_url,
                    "articles": v,
                }
            )
        loclist.sort(key=lambda x: len(x["articles"]), reverse=True)

        return loclist[:limit]