Example #1
    def make_tree(text: str) -> Tree:
        toklist = tokenize(text)
        fp = Fast_Parser(verbose=False)
        ip = IncrementalParser(fp, toklist, verbose=False)
        # Dict of parse trees in string dump format,
        # stored by sentence index (1-based)
        trees = OrderedDict()
        num_sent = 0
        for p in ip.paragraphs():
            for sent in p.sentences():
                num_sent += 1
                num_tokens = len(sent)
                assert sent.parse(), "Sentence does not parse: " + sent.text
                # Obtain a text representation of the parse tree
                token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
                # Create a verbose text representation of
                # the highest scoring parse tree
                tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
                # Add information about the sentence tree's score
                # and the number of tokens
                trees[num_sent] = "\n".join(
                    ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
                )
        # Create a tree representation string out of
        # all the accumulated parse trees
        tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())

        tree = Tree()
        tree.load(tree_string)
        return tree
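
A quick usage sketch (not from the original source): the input sentence is invented, and it relies on the simple_trees(), score() and length() accessors that the later examples call on a loaded Tree.

text = "Bygma er dönsk byggingavörukeðja."  # hypothetical input sentence
tree = make_tree(text)
for ix, stree in tree.simple_trees():
    # ix is the 1-based sentence index assigned when the tree was dumped
    print(ix, tree.score(ix), tree.length(ix), stree.text)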
Example #2
    def all_matches(
        cls,
        criteria: Mapping[str, Any],
        pattern: str,
        enclosing_session: Optional[Session] = None,
    ) -> Iterator[Tuple["Article", int, SimpleTree]]:
        """ Generator of SimpleTree objects (see matcher.py) from
            articles matching the given criteria and the pattern """

        with SessionContext(commit=True,
                            read_only=True,
                            session=enclosing_session) as session:

            # t0 = time.time()
            mcnt = acnt = tcnt = 0
            # print("Starting article loop")
            for a in cls.articles(criteria, enclosing_session=session):
                if a.tree is None:
                    continue
                acnt += 1
                tree = Tree(url=a.url or "", authority=a.authority)
                tree.load(a.tree)
                for ix, simple_tree in tree.simple_trees():
                    tcnt += 1
                    for match in simple_tree.all_matches(pattern):
                        yield (a, ix, match)
                        mcnt += 1
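
A hedged invocation sketch: the empty criteria dict and the pattern string are illustrative only (the pattern reuses the "S0 >> VP" syntax seen in Example #15), and Article is assumed to be the class that defines all_matches() above.

criteria = {}  # hypothetical: whatever filter dict cls.articles() accepts
for article, ix, match in Article.all_matches(criteria, "S0 >> VP"):
    # match is a SimpleTree node; its .text property is used in the other examples
    print(article.url, ix, match.text)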
Example #3
def gen_simple_trees(criteria):
    """ Generate simplified parse trees from articles matching the criteria """
    for a in Article.articles(criteria):
        # Skip articles from certain websites
        if (
            not a.root_domain
            or "raduneyti" in a.root_domain
            or "lemurinn" in a.root_domain
        ):
            continue

        # Load tree from article
        try:
            tree = Tree(url=a.url, authority=a.authority)
            tree.load(a.tree)
        except Exception as e:
            print("Exception loading tree in {0}: {1}".format(a.url, e))
            continue

        # Yield simple trees
        for ix, stree in tree.simple_trees():
            text = stree.text
            tokens = text.split()
            if len(tokens) >= MIN_SENT_LENGTH:
                wordset = {t.lower() for t in tokens}
                # Only yield sentences that contain none of our known English words
                if not (wordset & ENGLISH_WORDS):
                    yield stree, tree.score(ix), tree.length(ix), a.uuid, a.url, ix
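
A small consumer sketch (illustrative only): it iterates the generator above and writes one tab-separated line per accepted sentence; the criteria dict and the output file name are made up.

criteria = {}  # hypothetical filter criteria for Article.articles()
with open("sentences.tsv", "w", encoding="utf-8") as out:
    for stree, score, length, uuid, url, ix in gen_simple_trees(criteria):
        out.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(uuid, ix, score, length, stree.text))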
Example #4
    def go_single(self, url: str) -> None:
        """ Single article processor that will be called by a process within a
            multiprocessing pool """

        assert self._db is not None

        print("Processing article {0}".format(url))
        sys.stdout.flush()

        # If first article within a new process, import the processor modules
        if self.pmodules is None:
            self.pmodules = [
                importlib.import_module(modname) for modname in self.processors
            ]

        # Load the article
        with closing(self._db.session) as session:

            try:
                article = session.query(Article).filter_by(
                    url=url).one_or_none()

                if article is None:
                    print("Article not found in scraper database")
                else:
                    if article.tree and article.tokens:
                        tree = Tree(url, article.authority)
                        tree.load(article.tree)

                        token_container = TokenContainer(
                            article.tokens, url, article.authority)

                        # Run all processors in turn
                        for p in self.pmodules:
                            ptype = getattr(p, "PROCESSOR_TYPE")  # type: str
                            if ptype == "tree":
                                tree.process(session, p)
                            elif ptype == "token":
                                token_container.process(session, p)
                            else:
                                assert False, (
                                    "Unknown processor type '{0}'; should be 'tree' or 'token'"
                                    .format(ptype))

                    # Mark the article as being processed
                    article.processed = datetime.utcnow()

                # So far, so good: commit to the database
                session.commit()

            except Exception as e:
                # If an exception occurred, roll back the transaction
                session.rollback()
                print(
                    "Exception in article {0}, transaction rolled back\nException: {1}"
                    .format(url, e))
                raise

        sys.stdout.flush()
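
For context, a skeleton of what a module listed in self.processors might look like. The PROCESSOR_TYPE constant is the attribute go_single() reads above; the handler name and signature below are assumptions about what Tree.process() invokes, not taken from the original code.

# hypothetical processor module, e.g. processors/example.py
PROCESSOR_TYPE = "tree"  # or "token"; read via getattr(p, "PROCESSOR_TYPE") above

def sentence(state, result):
    # Assumed callback: one call per parsed sentence, receiving shared
    # processing state and the accumulated result for that sentence
    pass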
Example #5
def gen_simple_trees(criteria, stats):
    """ Generate simplified parse trees from articles matching the criteria """
    for a in Article.articles(criteria):
        if not a.root_domain or "raduneyti" in a.root_domain:
            # Skip ministry websites due to amount of chaff found there
            continue
        tree = Tree(url=a.url, authority=a.authority)
        # Note the parse timestamp
        stats["parsed"] = a.parsed
        tree.load(a.tree)
        for ix, stree in tree.simple_trees():
            yield stree, tree.score(ix), tree.length(ix)
Example #7
    def go_single(self, url):
        """ Single article processor that will be called by a process within a
            multiprocessing pool """

        print("Processing article {0}".format(url))
        sys.stdout.flush()

        # If first article within a new process, import the processor modules
        if self.pmodules is None:
            self.pmodules = [
                importlib.import_module(modname) for modname in self.processors
            ]

        # Load the article
        with closing(self._db.session) as session:

            try:
                article = session.query(Article).filter_by(url=url).one_or_none()

                if article is None:
                    print("Article not found in scraper database")
                else:
                    if article.tree and article.tokens:
                        tree = Tree(url, article.authority)
                        tree.load(article.tree)

                        token_container = TokenContainer(article.tokens, url)

                        # Run all processors in turn
                        for p in self.pmodules:
                            if p.PROCESSOR_TYPE == "tree":
                                tree.process(session, p)
                            elif p.PROCESSOR_TYPE == "token":
                                token_container.process(session, p)

                    # Mark the article as being processed
                    article.processed = datetime.utcnow()

                # So far, so good: commit to the database
                session.commit()

            except Exception as e:
                # If an exception occurred, roll back the transaction
                session.rollback()
                print(
                    "Exception in article {0}, transaction rolled back\nException: {1}".format(
                        url, e
                    )
                )
                raise

        sys.stdout.flush()
Example #8
    def go_single(self, url):
        """ Single article processor that will be called by a process within a
            multiprocessing pool """

        print("Processing article {0}".format(url))
        sys.stdout.flush()

        # If first article within a new process, import the processor modules
        if self.pmodules is None:
            self.pmodules = [
                importlib.import_module(modname) for modname in self.processors
            ]

        # Load the article
        with closing(self._db.session) as session:

            try:

                article = session.query(Article).filter_by(
                    url=url).one_or_none()

                if article is None:
                    print("Article not found in scraper database")
                else:
                    if article.tree:
                        tree = Tree(url, article.authority)
                        # print("Tree:\n{0}\n".format(article.tree))
                        tree.load(article.tree)

                        # Run all processors in turn
                        for p in self.pmodules:
                            tree.process(session, p)

                    # Mark the article as being processed
                    article.processed = datetime.utcnow()

                # So far, so good: commit to the database
                session.commit()

            except Exception as e:
                # If an exception occurred, roll back the transaction
                session.rollback()
                print(
                    "Exception in article {0}, transaction rolled back\nException: {1}"
                    .format(url, e))
                raise

        sys.stdout.flush()
Example #9
class ApiFs(LoggingMixIn, Operations):
    'Api filesystem for http://korchasa.host'

    def __init__(self, loader):
        self.tree = Tree()
        self.loader = loader
        self.fd = 0

    def create(self, path, mode=0o644):
        self.files[path] = self._file(mode)
        self.fd += 1
        return self.fd

    getxattr = None

    def getattr(self, path, fh=None):
        if len(path) > 1 and path[1] == '.':
            raise FuseOSError(ENOENT)
        node = self.tree.load(path, self.loader)
        if node:
            return self._fs_node(node)
        else:
            raise FuseOSError(ENOENT)

    def open(self, path, flags):
        self.fd += 1
        return self.fd

    def read(self, path, size, offset, fh):
        return self.tree.node(path)['data'][offset:offset + size]

    def readdir(self, path, fh):
        node = self.tree.nearest(path)
        if not node.get('loaded'):
            self.loader(path, self.tree)
        children = ['.', '..'] + [url.lstrip('/') for url in list(self.tree.children(path).keys()) if url != '/']

        return children

    def statfs(self, path):
        return dict(f_bsize=512, f_blocks=4096, f_bavail=2048)

    def _fs_node(self, tree_node):
        if tree_node.get('dir'):
            return dict(
                st_mode=(S_IFDIR | 0o755),
                st_ctime=time(),
                st_mtime=time(),
                st_atime=time(),
                st_nlink=2
            )
        else:
            return dict(
                st_mode=(S_IFREG | 0o644),
                st_nlink=1,
                st_size=len(tree_node.get('data')),
                st_ctime=time(),
                st_mtime=time(),
                st_atime=time()
            )
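
A mounting sketch using fusepy (the loader body and the mountpoint are made up; the loader signature follows the self.loader(path, self.tree) call in readdir() above):

from fuse import FUSE

def http_loader(path, tree):
    # Hypothetical loader: fetch the node(s) for 'path' from the remote API
    # and insert them into 'tree', marking them as loaded
    pass

if __name__ == "__main__":
    # foreground=True keeps the process attached, which simplifies debugging
    FUSE(ApiFs(http_loader), "/mnt/api", foreground=True)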
Example #10
    def all_matches(cls, criteria, pattern, enclosing_session=None):
        """ Generator of SimpleTree objects (see matcher.py) from articles matching
            the given criteria and the pattern """

        with SessionContext(commit=True,
                            read_only=True,
                            session=enclosing_session) as session:

            # t0 = time.time()
            mcnt = acnt = tcnt = 0
            # print("Starting article loop")
            for a in cls.articles(criteria, enclosing_session=session):
                if a.tree is None:
                    # Skip articles that have no stored parse tree
                    continue
                acnt += 1
                tree = Tree(url=a.url, authority=a.authority)
                tree.load(a.tree)
                for ix, simple_tree in tree.simple_trees():
                    tcnt += 1
                    for match in simple_tree.all_matches(pattern):
                        yield (a, ix, match)
                        mcnt += 1
Example #11
def _make_tree(text: str) -> Tuple[Tree, str]:
    """Tokenize and parse text, create tree representation string
    from all the parse trees, return Tree object and token JSON."""
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)

    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val) for key, val in trees.items())
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)

    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json
Example #12
class Query:
    """ A Query is initialized by parsing a query string using QueryRoot as the
        grammar root nonterminal. The Query can then be executed by processing
        the best parse tree using the nonterminal handlers given above, returning a
        result object if successful. """
    def __init__(self, session):
        self._session = session
        self._error = None
        self._answer = None
        self._tree = None
        self._qtype = None
        self._key = None

    @staticmethod
    def _parse(toklist):
        """ Parse a token list as a query """

        # Parse with the nonterminal 'QueryRoot' as the grammar root
        with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:

            sent_begin = 0
            num_sent = 0
            num_parsed_sent = 0
            rdc = Reducer(bp.grammar)
            trees = dict()
            sent = []

            for ix, t in enumerate(toklist):
                if t[0] == TOK.S_BEGIN:
                    sent = []
                    sent_begin = ix
                elif t[0] == TOK.S_END:
                    slen = len(sent)
                    if not slen:
                        continue
                    num_sent += 1
                    # Parse the accumulated sentence
                    num = 0
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest is not None:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                    except ParseError as e:
                        forest = None
                    if num > 0:
                        num_parsed_sent += 1
                        # Obtain a text representation of the parse tree
                        trees[num_sent] = ParseForestDumper.dump_forest(forest)
                        #ParseForestPrinter.print_forest(forest)

                elif t[0] == TOK.P_BEGIN:
                    pass
                elif t[0] == TOK.P_END:
                    pass
                else:
                    sent.append(t)

        result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
        return result, trees

    def parse(self, toklist, result):
        """ Parse the token list as a query, returning True if valid """

        self._tree = None  # Erase previous tree, if any
        self._error = None  # Erase previous error, if any
        self._qtype = None  # Erase previous query type, if any
        self._key = None

        parse_result, trees = Query._parse(toklist)

        if not trees:
            # No parse at all
            self.set_error("E_NO_TREES")
            return False

        result.update(parse_result)

        if result["num_sent"] != 1:
            # Queries must be one sentence
            self.set_error("E_MULTIPLE_SENTENCES")
            return False
        if result["num_parsed_sent"] != 1:
            # Unable to parse the single sentence
            self.set_error("E_NO_PARSE")
            return False
        if 1 not in trees:
            # No sentence number 1
            self.set_error("E_NO_FIRST_SENTENCE")
            return False
        # Looks good
        # Store the resulting parsed query as a tree
        tree_string = "S1\n" + trees[1]
        #print("Query tree:\n{0}".format(tree_string))
        self._tree = Tree()
        self._tree.load(tree_string)
        return True

    def execute(self):
        """ Execute the query contained in the previously parsed tree; return True if successful """
        if self._tree is None:
            self.set_error("E_QUERY_NOT_PARSED")
            return False

        self._error = None
        self._qtype = None
        with closing(BIN_Db.get_db()) as bin_db:

            # Process the tree, which has only one sentence
            self._tree.process(self._session, _THIS_MODULE, bin_db, query=self)

        return self._error is None

    def set_qtype(self, qtype):
        """ Set the query type ('Person', 'Title', 'Company', 'Entity'...) """
        self._qtype = qtype

    def set_answer(self, answer):
        """ Set the answer to the query """
        self._answer = answer

    def set_key(self, key):
        """ Set the query key, i.e. the term or string used to execute the query """
        # This is for instance a person name in nominative case
        self._key = key

    def set_error(self, error):
        """ Set an error result """
        self._error = error

    def qtype(self):
        """ Return the query type """
        return self._qtype

    def answer(self):
        """ Return the query answer """
        return self._answer

    def key(self):
        """ Return the query key """
        return self._key

    def error(self):
        """ Return the query error, if any """
        return self._error
Example #13
def test_entities():
    text = """

       Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota.

       Danska byggingavörukeðjan Bygma hefur keypt íslenska
       verslunarfyrirtækið Húsasmiðjuna.

       Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs
       eru hluthafar í Arion banka.

       Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra.
       Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám.
       Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær.
       Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær.

       Primera Air var íslenskt flugfélag.
       Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag.

       Villeneuve-Loubet er franskt þorp.

       Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda.
       
       Í miðbæ Reykjavíkur er herrafataverslunin Geysir.

       Mér er sagt að Geysir sé hættur að gjósa.
       
       Geysir er hættur að gjósa.
       
       Geysir er gamall goshver.
       
       Fyrirtækið Apple-búðin selur Apple Mac tölvur.
       Fyrirtækið Origo selur IBM tölvur.
       
       Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
       
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse"
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            tree = ParseForestDumper.dump_forest(sent.tree,
                                                 token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree])
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join("S{0}\n{1}\n".format(key, val)
                          for key, val in trees.items())

    tree = Tree()
    tree.load(tree_string)

    session = SessionShim()
    tree.process(session, entities)

    session.check(("Bygma", "er", "dönsk byggingavörukeðja"))
    session.check(("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"))
    session.check(("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"))
    session.check(
        ("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"))
    session.check(("Primera Air", "var", "íslenskt flugfélag"))
    session.check(("Villeneuve-Loubet", "er", "franskt þorp"))
    session.check(("Valdís", "er", "ísbúð"))
    session.check(("Fosshótel", "var", "rekin með tapi"))
    session.check(("Fosshótel", "er", "stór hótelkeðja"))
    session.check(("Norðurál", "er", "álverksmiðjan í Hvalfirði"))
    session.check(("Lax", "er", "stór fiskur af ætt laxfiska"))
    session.check(("Geysir", "er", "gamall goshver"))
    session.check(("Eimskipafélag Íslands hf", "er", "skipafélag"))
    session.check(("Origo", "er", "fyrirtæki"))
    session.check(("AirBerlin", "er", "flugfélag"))

    assert session.is_empty()
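
SessionShim itself is not shown in this snippet. One plausible shape for it, assuming (this is an assumption, not confirmed above) that the entities processor registers its findings by calling session.add() with objects exposing name, verb and definition attributes:

class SessionShim:
    """ Collects (name, verb, definition) triples instead of writing to a database """

    def __init__(self):
        self.triples = set()

    def add(self, entity):
        # Assumed hook: the processor adds ORM-like entity objects here
        self.triples.add((entity.name, entity.verb, entity.definition))

    def check(self, triple):
        # Assert that the triple was registered, then consume it
        assert triple in self.triples
        self.triples.remove(triple)

    def is_empty(self):
        return not self.triples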
Example #14
class Query:
    """ A Query is initialized by parsing a query string using QueryRoot as the
        grammar root nonterminal. The Query can then be executed by processing
        the best parse tree using the nonterminal handlers given above, returning a
        result object if successful. """

    # Processors that handle parse trees
    _tree_processors: List[ModuleType] = []
    # Handler functions within processors that handle plain text
    _text_processors: List[Callable[["Query"], bool]] = []
    # Singleton instance of the query parser
    _parser: Optional[QueryParser] = None
    # Help texts associated with lemmas
    _help_texts: Dict[str, List[Callable]] = dict()

    def __init__(
        self,
        session,  # SQLAlchemy session
        query: str,
        voice: bool,
        auto_uppercase: bool,
        location: Optional[LocationType],
        client_id: Optional[str],
        client_type: Optional[str],
    ) -> None:

        self._query = q = self._preprocess_query_string(query)
        self._session = session
        self._location = location
        # Prepare a "beautified query" string that can be
        # shown in a client user interface. By default, this
        # starts with an uppercase letter and ends with a
        # question mark, but this can be modified during the
        # processing of the query.
        self.set_beautified_query(beautify_query(q))
        self._voice = voice
        self._auto_uppercase = auto_uppercase
        self._error: Optional[str] = None
        # A detailed answer, which can be a list or a dict
        self._response: Optional[ResponseMapping] = None
        # A single "best" displayable text answer
        self._answer: Optional[str] = None
        # A version of self._answer that can be
        # fed to a voice synthesizer
        self._voice_answer: Optional[str] = None
        self._tree: Optional[Tree] = None
        self._qtype: Optional[str] = None
        self._key: Optional[str] = None
        self._toklist: Optional[List[Tok]] = None
        # Expiration timestamp, if any
        self._expires: Optional[datetime] = None
        # URL associated with query, can be set by query response handler
        # and subsequently provided to the remote client
        self._url: Optional[str] = None
        # Command returned by query
        self._command: Optional[str] = None
        # Client id, if known
        self._client_id = client_id
        # Client type, if known
        self._client_type = client_type
        # Source of answer to query
        self._source: Optional[str] = None
        # Query context, which is None until fetched via self.fetch_context()
        # This should be a dict that can be represented in JSON
        self._context: Optional[ContextDict] = None

    def _preprocess_query_string(self, q: str) -> str:
        """ Preprocess the query string prior to further analysis """
        if not q:
            return q
        qf = re.sub(_IGNORED_PREFIX_RE, "", q, flags=re.IGNORECASE)
        # Remove " embla" suffix, if present
        qf = re.sub(r"\s+embla$", "", qf, flags=re.IGNORECASE)
        # Fix common Google ASR mistake: 'hæ embla' is returned as 'bæjarblað'
        if not qf and q == "bæjarblað":
            q = "hæ embla"
        # If stripping the prefixes results in an empty query,
        # just return original query string unmodified.
        return qf or q

    @classmethod
    def init_class(cls) -> None:
        """ Initialize singleton data, i.e. the list of query
            processor modules and the query parser instance """
        all_procs = []
        tree_procs = []
        text_procs = []
        # Load the query processor modules found in the
        # queries directory. The modules can be tree and/or text processors,
        # and we sort them into two lists, accordingly.
        modnames = modules_in_dir("queries")
        for modname in sorted(modnames):
            try:
                m = importlib.import_module(modname)
                all_procs.append(m)
                if getattr(m, "HANDLE_TREE", False):
                    # This is a tree processor
                    tree_procs.append(m)
                handle_plain_text = getattr(m, "handle_plain_text", None)
                if handle_plain_text is not None:
                    # This is a text processor:
                    # store a reference to its handler function
                    text_procs.append(handle_plain_text)
            except ImportError as e:
                logging.error(
                    "Error importing query processor module {0}: {1}".format(
                        modname, e))
        cls._tree_processors = tree_procs
        cls._text_processors = text_procs

        # Obtain query grammar fragments from the tree processors
        grammar_fragments = []
        for processor in tree_procs:
            # Check whether this tree processor supplies a query grammar fragment
            fragment = getattr(processor, "GRAMMAR", None)
            if fragment and isinstance(fragment, str):
                # Looks legit: add it to our list
                grammar_fragments.append(fragment)

        # Collect topic lemmas that can be used to provide
        # context-sensitive help texts when queries cannot be parsed
        help_texts = defaultdict(list)
        for processor in all_procs:
            # Collect topic lemmas and corresponding help text functions
            topic_lemmas = getattr(processor, "TOPIC_LEMMAS", None)
            if topic_lemmas:
                help_text_func = getattr(processor, "help_text", None)
                # If topic lemmas are given, a help_text function
                # should also be present
                assert help_text_func is not None
                if help_text_func is not None:
                    for lemma in topic_lemmas:
                        help_texts[lemma].append(help_text_func)
        cls._help_texts = help_texts

        # Coalesce the grammar additions from the fragments
        grammar_additions = "\n".join(grammar_fragments)
        # Initialize a singleton parser instance for queries,
        # with the nonterminal 'QueryRoot' as the grammar root
        cls._parser = QueryParser(grammar_additions)

    @staticmethod
    def _parse(toklist: Iterable[Tok]) -> Tuple[ResponseDict, Dict[int, str]]:
        """ Parse a token list as a query """
        bp = Query._parser
        assert bp is not None
        num_sent = 0
        num_parsed_sent = 0
        rdc = Reducer(bp.grammar)
        trees: Dict[int, str] = dict()
        sent: List[Tok] = []

        for t in toklist:
            if t[0] == TOK.S_BEGIN:
                sent = []
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if not slen:
                    continue
                num_sent += 1
                # Parse the accumulated sentence
                num = 0
                try:
                    # Parse the sentence
                    forest = bp.go(sent)
                    if forest is not None:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError:
                    forest = None
                    num = 0
                if num > 0:
                    num_parsed_sent += 1
                    # Obtain a text representation of the parse tree
                    trees[num_sent] = ParseForestDumper.dump_forest(forest)

            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

        result: ResponseDict = dict(num_sent=num_sent,
                                    num_parsed_sent=num_parsed_sent)
        return result, trees

    @staticmethod
    def _query_string_from_toklist(toklist: Iterable[Tok]) -> str:
        """ Re-create a query string from an auto-capitalized token list """
        actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
        if actual_q:
            # Fix stuff that the auto-capitalization tends to get wrong,
            # such as 'í Dag'
            for wrong, correct in _CAPITALIZATION_REPLACEMENTS:
                actual_q = actual_q.replace(wrong, correct)
            # Capitalize the first letter of the query
            actual_q = actual_q[0].upper() + actual_q[1:]
            # Terminate the query with a question mark,
            # if not otherwise terminated
            if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
                actual_q += "?"
        return actual_q

    def parse(self, result: ResponseDict) -> bool:
        """ Parse the query from its string, returning True if valid """
        self._tree = None  # Erase previous tree, if any
        self._error = None  # Erase previous error, if any
        self._qtype = None  # Erase previous query type, if any
        self._key = None
        self._toklist = None

        q = self._query
        if not q:
            self.set_error("E_EMPTY_QUERY")
            return False

        # Tokenize and auto-capitalize the query string
        toklist = list(
            tokenize(q, auto_uppercase=self._auto_uppercase and q.islower()))

        actual_q = self._query_string_from_toklist(toklist)

        # Update the beautified query string, as the actual_q string
        # probably has more correct capitalization
        self.set_beautified_query(actual_q)

        # TODO: We might want to re-tokenize the actual_q string with
        # auto_uppercase=False, since we may have fixed capitalization
        # errors in _query_string_from_toklist()

        if Settings.DEBUG:
            # Log the query string as seen by the parser
            print("Query is: '{0}'".format(actual_q))

        parse_result, trees = Query._parse(toklist)

        if not trees:
            # No parse at all
            self.set_error("E_NO_PARSE_TREES")
            return False

        result.update(parse_result)

        if result["num_sent"] != 1:
            # Queries must be one sentence
            self.set_error("E_MULTIPLE_SENTENCES")
            return False
        if result["num_parsed_sent"] != 1:
            # Unable to parse the single sentence
            self.set_error("E_NO_PARSE")
            return False
        if 1 not in trees:
            # No sentence number 1
            self.set_error("E_NO_FIRST_SENTENCE")
            return False
        # Looks good
        # Store the resulting parsed query as a tree
        tree_string = "S1\n" + trees[1]
        if Settings.DEBUG:
            print(tree_string)
        self._tree = Tree()
        self._tree.load(tree_string)
        # Store the token list
        self._toklist = toklist
        return True

    def execute_from_plain_text(self) -> bool:
        """ Attempt to execute a plain text query, without having to parse it """
        if not self._query:
            return False
        # Call the handle_plain_text() function in each text processor,
        # until we find one that returns True, or return False otherwise
        return any(
            handle_plain_text(self)
            for handle_plain_text in self._text_processors)

    def execute_from_tree(self) -> bool:
        """ Execute the query contained in the previously parsed tree;
            return True if successful """
        if self._tree is None:
            self.set_error("E_QUERY_NOT_PARSED")
            return False
        for processor in self._tree_processors:
            self._error = None
            self._qtype = None
            # Process the tree, which has only one sentence
            try:
                self._tree.process(self._session, processor, query=self)
                if self._answer and self._error is None:
                    # The processor successfully answered the query
                    return True
            except Exception as e:
                logging.error(
                    f"Exception in execute_from_tree('{processor.__name__}') "
                    f"for query '{self._query}': {e}")
        # No processor was able to answer the query
        return False

    def last_answer(self,
                    *,
                    within_minutes: int = 5) -> Optional[Tuple[str, str]]:
        """ Return the last answer given to this client, by default
            within the last 5 minutes (0=forever) """
        if not self._client_id:
            # Can't find the last answer if no client_id given
            return None
        # Find the newest non-error, no-repeat query result for this client
        q = (self._session.query(QueryRow.answer, QueryRow.voice).filter(
            QueryRow.client_id == self._client_id).filter(
                QueryRow.qtype != "Repeat").filter(QueryRow.error == None))
        if within_minutes > 0:
            # Apply a timestamp filter
            since = datetime.utcnow() - timedelta(minutes=within_minutes)
            q = q.filter(QueryRow.timestamp >= since)
        # Sort to get the newest query that fulfills the criteria
        last = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
        return None if last is None else (last[0], last[1])

    def fetch_context(self,
                      *,
                      within_minutes: int = 10) -> Optional[ContextDict]:
        """ Return the context from the last answer given to this client,
            by default within the last 10 minutes (0=forever) """
        if not self._client_id:
            # Can't find the last answer if no client_id given
            return None
        # Find the newest non-error, no-repeat query result for this client
        q = (self._session.query(QueryRow.context).filter(
            QueryRow.client_id == self._client_id).filter(
                QueryRow.qtype != "Repeat").filter(QueryRow.error == None))
        if within_minutes > 0:
            # Apply a timestamp filter
            since = datetime.utcnow() - timedelta(minutes=within_minutes)
            q = q.filter(QueryRow.timestamp >= since)
        # Sort to get the newest query that fulfills the criteria
        ctx = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
        # This function normally returns a dict that has been decoded from JSON
        return None if ctx is None else ctx[0]

    @property
    def query(self) -> str:
        """ The query text, in its original form """
        return self._query

    @property
    def query_lower(self) -> str:
        """ The query text, all lower case """
        return self._query.lower()

    @property
    def beautified_query(self) -> str:
        """ Return the query string that will be reflected back to the client """
        return self._beautified_query

    def set_beautified_query(self, q: str) -> None:
        """ Set the query string that will be reflected back to the client """
        self._beautified_query = (
            q.replace("embla", "Embla").replace("miðeind", "Miðeind").replace(
                "Guðni Th ", "Guðni Th. ")  # By presidential request :)
        )

    def lowercase_beautified_query(self) -> None:
        """ If we know that no uppercase words occur in the query,
            except the initial capital, this function can be called
            to adjust the beautified query string accordingly. """
        self.set_beautified_query(self._beautified_query.capitalize())

    def query_is_command(self) -> None:
        """ Called from a query processor if the query is a command, not a question """
        # Put a period at the end of the beautified query text
        # instead of a question mark
        if self._beautified_query.endswith("?"):
            self._beautified_query = self._beautified_query[:-1] + "."

    @property
    def expires(self) -> Optional[datetime]:
        """ Expiration time stamp for this query answer, if any """
        return self._expires

    def set_expires(self, ts: datetime) -> None:
        """ Set an expiration time stamp for this query answer """
        self._expires = ts

    @property
    def url(self) -> Optional[str]:
        """ URL answer associated with this query """
        return self._url

    def set_url(self, u: str) -> None:
        """ Set the URL answer associated with this query """
        self._url = u

    @property
    def command(self) -> Optional[str]:
        """ JavaScript command associated with this query """
        return self._command

    def set_command(self, c: str) -> None:
        """ Set the JavaScript command associated with this query """
        self._command = c

    @property
    def source(self) -> Optional[str]:
        """ Return the source of the answer to this query """
        return self._source

    def set_source(self, s: str) -> None:
        """ Set the source for the answer to this query """
        self._source = s

    @property
    def location(self) -> Optional[LocationType]:
        """ The client location, if known, as a (lat, lon) tuple """
        return self._location

    @property
    def token_list(self) -> Optional[List[Tok]]:
        """ The original token list for the query """
        return self._toklist

    def qtype(self) -> Optional[str]:
        """ Return the query type """
        return self._qtype

    def set_qtype(self, qtype: str) -> None:
        """ Set the query type ('Person', 'Title', 'Company', 'Entity'...) """
        self._qtype = qtype

    def set_answer(self,
                   response: ResponseMapping,
                   answer: str,
                   voice_answer: Optional[str] = None) -> None:
        """ Set the answer to the query """
        # Detailed response (this is usually a dict)
        self._response = response
        # Single best answer, as a displayable string
        self._answer = answer
        # A voice version of the single best answer
        self._voice_answer = voice_answer

    def set_key(self, key: str) -> None:
        """ Set the query key, i.e. the term or string used to execute the query """
        # This is for instance a person name in nominative case
        self._key = key

    def set_error(self, error: str) -> None:
        """ Set an error result """
        self._error = error

    @property
    def is_voice(self) -> bool:
        """ Return True if this is a voice query """
        return self._voice

    @property
    def client_id(self) -> Optional[str]:
        return self._client_id

    @property
    def client_type(self) -> Optional[str]:
        """ Return client type string, e.g. "ios", "android", "www", etc. """
        return self._client_type

    def response(self) -> Optional[ResponseMapping]:
        """ Return the detailed query answer """
        return self._response

    def answer(self) -> Optional[str]:
        """ Return the 'single best' displayable query answer """
        return self._answer

    def voice_answer(self) -> Optional[str]:
        """ Return a voice version of the 'single best' answer, if any """
        return self._voice_answer

    def key(self) -> Optional[str]:
        """ Return the query key """
        return self._key

    def error(self) -> Optional[str]:
        """ Return the query error, if any """
        return self._error

    @property
    def context(self) -> Optional[ContextDict]:
        """ Return the context that has been set by self.set_context() """
        return self._context

    def set_context(self, ctx: ContextDict) -> None:
        """ Set a query context that will be stored and made available
            to the next query from the same client """
        self._context = ctx

    def client_data(self, key: str) -> Optional[ClientDataDict]:
        """ Fetch client_id-associated data stored in the querydata table """
        if not self.client_id:
            return None
        with SessionContext(read_only=True) as session:
            try:
                client_data = (session.query(QueryData).filter(
                    QueryData.key == key).filter(
                        QueryData.client_id == self.client_id)).one_or_none()
                return None if client_data is None else client_data.data
            except Exception as e:
                logging.error(
                    "Error fetching client '{0}' query data for key '{1}' from db: {2}"
                    .format(self.client_id, key, e))
        return None

    def set_client_data(self, key: str, data: ClientDataDict) -> None:
        """ Setter for client query data """
        if not self.client_id:
            logging.warning("Couldn't save query data, no client ID")
            return
        Query.store_query_data(self.client_id, key, data)

    @staticmethod
    def store_query_data(client_id: str, key: str,
                         data: ClientDataDict) -> bool:
        """ Save client query data in the database, under the given key """
        assert client_id and key
        now = datetime.utcnow()
        try:
            with SessionContext(commit=True) as session:
                row = (session.query(QueryData).filter(
                    QueryData.key == key).filter(
                        QueryData.client_id == client_id)).one_or_none()
                if row is None:
                    # Not already present: insert
                    row = QueryData(
                        client_id=client_id,
                        key=key,
                        created=now,
                        modified=now,
                        data=data,
                    )
                    session.add(row)
                else:
                    # Already present: update
                    row.data = data
                    row.modified = now
            # The session is auto-committed upon exit from the context manager
            return True
        except Exception as e:
            logging.error("Error storing query data in db: {0}".format(e))
        return False

    @classmethod
    def try_to_help(cls, query: str, result: ResponseDict) -> None:
        """ Attempt to help the user in the case of a failed query,
            based on lemmas in the query string """
        # Collect a set of lemmas that occur in the query string
        lemmas = set()
        with BIN_Db.get_db() as db:
            for token in query.lower().split():
                if token.isalpha():
                    m = db.meanings(token)
                    if not m:
                        # Try an uppercase version, just in case (pun intended)
                        m = db.meanings(token.capitalize())
                    if m:
                        lemmas |= set(mm.stofn.lower().replace("-", "")
                                      for mm in m)
        # Collect a list of potential help text functions from the query modules
        help_text_funcs = []
        for lemma in lemmas:
            help_text_funcs.extend([
                (lemma, help_text_func)
                for help_text_func in cls._help_texts.get(lemma, [])
            ])
        if help_text_funcs:
            # Found at least one help text func matching a lemma in the query
            # Select a function at random and invoke it with the matched
            # lemma as a parameter
            lemma, help_text_func = random.choice(help_text_funcs)
            result["answer"] = result["voice"] = help_text_func(lemma)
            result["valid"] = True

    def execute(self) -> ResponseDict:
        """ Check whether the parse tree is describes a query, and if so,
            execute the query, store the query answer in the result dictionary
            and return True """
        if Query._parser is None:
            Query.init_class()
        # By default, the result object contains the 'raw' query
        # string (the one returned from the speech-to-text processor)
        # as well as the beautified version of that string - which
        # usually starts with an uppercase letter and has a trailing
        # question mark (or other ending punctuation).
        result: ResponseDict = dict(q_raw=self.query, q=self.beautified_query)
        # First, try to handle this from plain text, without parsing:
        # shortcut to a successful, plain response
        if not self.execute_from_plain_text():
            if not self.parse(result):
                # Unable to parse the query
                err = self.error()
                if err is not None:
                    if Settings.DEBUG:
                        print("Unable to parse query, error {0}".format(err))
                    result["error"] = err
                result["valid"] = False
                return result
            if not self.execute_from_tree():
                # This is a query, but its execution failed for some reason:
                # return the error
                # if Settings.DEBUG:
                #     print("Unable to execute query, error {0}".format(q.error()))
                result["error"] = self.error() or "E_UNABLE_TO_EXECUTE_QUERY"
                result["valid"] = True
                return result
        # Successful query: return the answer in response
        if self._answer:
            result["answer"] = self._answer
        if self._voice and self._voice_answer:
            # This is a voice query and we have a voice answer to it
            result["voice"] = self._voice_answer
        if self._voice:
            # Optimize the response to voice queries:
            # we don't need detailed information about alternative
            # answers or their sources
            result["response"] = dict(answer=self._answer or "")
        elif self._response:
            # Return a detailed response if not a voice query
            result["response"] = self._response
        # Re-assign the beautified query string, in case the query processor modified it
        result["q"] = self.beautified_query
        # ...and the query type, as a string ('Person', 'Entity', 'Title' etc.)
        qt = self.qtype()
        if qt:
            result["qtype"] = qt
        # ...and the key used to retrieve the answer, if any
        key = self.key()
        if key:
            result["key"] = key
        # ...and a URL, if any has been set by the query processor
        if self.url:
            result["open_url"] = self.url
        # ...and a command, if any has been set
        if self.command:
            result["command"] = self.command
        # ...and the source, if set by the query processor
        if self.source:
            result["source"] = self.source
        key = self.key()
        if not self._voice and qt == "Person" and key is not None:
            # For a person query, add an image (if available)
            img = get_image_url(key, enclosing_session=self._session)
            if img is not None:
                result["image"] = dict(
                    src=img.src,
                    width=img.width,
                    height=img.height,
                    link=img.link,
                    origin=img.origin,
                    name=img.name,
                )
        result["valid"] = True
        if Settings.DEBUG:
            # Dump query results to the console
            def converter(o):
                """ Ensure that datetime is output in ISO format to JSON """
                if isinstance(o, datetime):
                    return o.isoformat()[0:16]
                return None

            print("{0}".format(
                json.dumps(result,
                           indent=3,
                           ensure_ascii=False,
                           default=converter)))
        return result
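
A hypothetical end-to-end call of this Query class; the query string and the None client/location values are made up, while the constructor signature and the result keys follow the code above:

with SessionContext(commit=True) as session:
    q = Query(
        session,
        "hvað er klukkan",  # hypothetical query string
        voice=False,
        auto_uppercase=False,
        location=None,
        client_id=None,
        client_type=None,
    )
    result = q.execute()
    if result.get("valid"):
        print(result.get("answer"))
    else:
        print("Error:", result.get("error"))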
Example #15
def gen_simple_trees(criteria):
    """ Generate simplified parse trees from articles matching the criteria """
    bigset = set()
    for a in Article.articles(criteria):
        # Skip articles from certain websites
        if (
            not a.root_domain
            or "raduneyti" in a.root_domain
            or "lemurinn" in a.root_domain
        ):
            continue

        # Load tree from article
        try:
            tree = Tree(url=a.url, authority=a.authority)
            tree.load(a.tree)
        except Exception as e:
            print("Exception loading tree in {0}: {1}".format(a.url, e))
            # Skip it
            continue

        # Yield simple trees for each article sentence
        for ix, stree in tree.simple_trees():
            text = stree.text
            tokens = text.split()
            # Make sure it has enough tokens
            if len(tokens) < MIN_SENT_LENGTH:
                continue

            # Skip sentences containing something in our bag of English words
            wordset = {t.lower() for t in tokens}
            if wordset & ENGLISH_WORDS:
                continue

            # Skip sentences that don't contain enough Icelandic words
            if unicelandic(stree):
                continue

            # Skip uncapitalized sentences
            if text[0].islower():
                continue

            # Skip sentences containing fewer than 3 word, entity or person tokens combined
            if sum(1 for x in stree.leaves() if x.kind in (TOK.WORD, TOK.ENTITY, TOK.PERSON)) < 3:
                continue

            # Skip sentences with only a single NP -- S0→NP
            if stree.match("S0 > [NP $]"):
                continue

            # Skip sentences not containing a VP 
            if not stree.match("S0 >> VP"):
                continue

            # Skip sentences not ending in sentence-ending punctuation
            if text[-1] not in definitions.END_OF_SENTENCE:
                continue

            # Skip sentence if we have seen an equivalent sentence before
            hashnorm = hash(normalize(text))
            if hashnorm in bigset:
                continue
            else:
                bigset.add(hashnorm)

            yield stree, tree.score(ix), tree.length(ix), a.uuid, a.url, ix
Example #16
class Query:

    """ A Query is initialized by parsing a query string using QueryRoot as the
        grammar root nonterminal. The Query can then be executed by processing
        the best parse tree using the nonterminal handlers given above, returning a
        result object if successful. """

    def __init__(self, session):
        self._session = session
        self._error = None
        self._answer = None
        self._tree = None
        self._qtype = None
        self._key = None
        self._toklist = None

    @staticmethod
    def _parse(toklist):
        """ Parse a token list as a query """

        # Parse with the nonterminal 'QueryRoot' as the grammar root
        with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:

            sent_begin = 0
            num_sent = 0
            num_parsed_sent = 0
            rdc = Reducer(bp.grammar)
            trees = dict()
            sent = []

            for ix, t in enumerate(toklist):
                if t[0] == TOK.S_BEGIN:
                    sent = []
                    sent_begin = ix
                elif t[0] == TOK.S_END:
                    slen = len(sent)
                    if not slen:
                        continue
                    num_sent += 1
                    # Parse the accumulated sentence
                    num = 0
                    try:
                        # Parse the sentence
                        forest = bp.go(sent)
                        if forest is not None:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                    except ParseError as e:
                        forest = None
                    if num > 0:
                        num_parsed_sent += 1
                        # Obtain a text representation of the parse tree
                        trees[num_sent] = ParseForestDumper.dump_forest(forest)
                        # ParseForestPrinter.print_forest(forest)

                elif t[0] == TOK.P_BEGIN:
                    pass
                elif t[0] == TOK.P_END:
                    pass
                else:
                    sent.append(t)

        result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
        return result, trees

    def parse(self, toklist, result):
        """ Parse the token list as a query, returning True if valid """

        self._tree = None  # Erase previous tree, if any
        self._error = None  # Erase previous error, if any
        self._qtype = None  # Erase previous query type, if any
        self._key = None
        self._toklist = None

        parse_result, trees = Query._parse(toklist)

        if not trees:
            # No parse at all
            self.set_error("E_NO_TREES")
            return False

        result.update(parse_result)

        if result["num_sent"] != 1:
            # Queries must be one sentence
            self.set_error("E_MULTIPLE_SENTENCES")
            return False
        if result["num_parsed_sent"] != 1:
            # Unable to parse the single sentence
            self.set_error("E_NO_PARSE")
            return False
        if 1 not in trees:
            # No sentence number 1
            self.set_error("E_NO_FIRST_SENTENCE")
            return False
        # Looks good
        # Store the resulting parsed query as a tree
        tree_string = "S1\n" + trees[1]
        # print("Query tree:\n{0}".format(tree_string))
        self._tree = Tree()
        self._tree.load(tree_string)
        # Store the token list
        self._toklist = toklist
        return True

    def execute(self):
        """ Execute the query contained in the previously parsed tree;
            return True if successful """
        if self._tree is None:
            self.set_error("E_QUERY_NOT_PARSED")
            return False

        self._error = None
        self._qtype = None
        # Process the tree, which has only one sentence
        self._tree.process(self._session, _THIS_MODULE, query=self)

        return self._error is None

    def set_qtype(self, qtype):
        """ Set the query type ('Person', 'Title', 'Company', 'Entity'...) """
        self._qtype = qtype

    def set_answer(self, answer):
        """ Set the answer to the query """
        self._answer = answer

    def set_key(self, key):
        """ Set the query key, i.e. the term or string used to execute the query """
        # This is for instance a person name in nominative case
        self._key = key

    def set_error(self, error):
        """ Set an error result """
        self._error = error

    def qtype(self):
        """ Return the query type """
        return self._qtype

    def answer(self):
        """ Return the query answer """
        return self._answer

    def key(self):
        """ Return the query key """
        return self._key

    def token_list(self):
        """ Return the token list for the query """
        return self._toklist

    def error(self):
        """ Return the query error, if any """
        return self._error
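
# A minimal, hedged usage sketch for the Query class above. The helper name
# process_query and the direct tokenize() call are illustrative assumptions;
# actual call sites in the codebase may look different.
def process_query(session, query_string):
    """ Parse and execute a query string, returning a result dictionary """
    toklist = list(tokenize(query_string))
    q = Query(session)
    result = dict()
    if not q.parse(toklist, result):
        # Parsing failed: q.error() holds a code such as E_NO_PARSE
        return dict(valid=False, error=q.error())
    if not q.execute():
        # The tree parsed but no nonterminal handler produced an answer
        return dict(valid=False, error=q.error())
    result.update(valid=True, qtype=q.qtype(), key=q.key(), answer=q.answer())
    return result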
Beispiel #17
0
class Query:
    """ A Query is initialized by parsing a query string using QueryRoot as the
        grammar root nonterminal. The Query can then be executed by processing
        the best parse tree using the nonterminal handlers given above, returning a
        result object if successful. """

    _parser = None
    _processors = []
    _help_texts = dict()

    def __init__(self, session, query, voice, auto_uppercase, location,
                 client_id):
        q = self._preprocess_query_string(query)
        self._session = session
        self._query = q or ""
        self._location = location
        # Prepare a "beautified query" string that can be
        # shown in a client user interface. By default, this
        # starts with an uppercase letter and ends with a
        # question mark, but this can be modified during the
        # processing of the query.
        self.set_beautified_query(beautify_query(q))
        self._voice = voice
        self._auto_uppercase = auto_uppercase
        self._error = None
        # A detailed answer, which can be a list or a dict
        self._response = None
        # A single "best" displayable text answer
        self._answer = None
        # A version of self._answer that can be
        # fed to a voice synthesizer
        self._voice_answer = None
        self._tree = None
        self._qtype = None
        self._key = None
        self._toklist = None
        # Expiration timestamp, if any
        self._expires = None
        # URL associated with the query; can be set by a query response handler
        # and subsequently provided to the remote client
        self._url = None
        # Client id, if known
        self._client_id = client_id
        # Source of answer to query
        self._source = None
        # Query context, which is None until fetched via self.fetch_context()
        # This should be a dict that can be represented in JSON
        self._context = None

    def _preprocess_query_string(self, q):
        """ Preprocess the query string prior to further analysis """
        if not q:
            return q
        qf = re.sub(_IGNORED_PREFIX_RE, "", q, flags=re.IGNORECASE)
        # If stripping the prefixes results in an empty query,
        # just return original query string unmodified.
        return qf or q

    @classmethod
    def init_class(cls):
        """ Initialize singleton data, i.e. the list of query
            processor modules and the query parser instance """
        procs = []
        # Load the query processor modules found in the
        # queries directory
        modnames = modules_in_dir("queries")
        for modname in sorted(modnames):
            try:
                m = importlib.import_module(modname)
                procs.append(m)
            except ImportError as e:
                logging.error(
                    "Error importing query processor module {0}: {1}".format(
                        modname, e))
        cls._processors = procs

        # Obtain query grammar fragments from those processors
        # that handle parse trees. Also collect topic lemmas that
        # can be used to provide context-sensitive help texts
        # when queries cannot be parsed.
        grammar_fragments = []
        help_texts = defaultdict(list)
        for processor in procs:
            handle_tree = getattr(processor, "HANDLE_TREE", None)
            if handle_tree:
                # Check whether this processor supplies
                # a query grammar fragment
                fragment = getattr(processor, "GRAMMAR", None)
                if fragment and isinstance(fragment, str):
                    # Looks legit: add it to our list
                    grammar_fragments.append(fragment)
            # Collect topic lemmas and corresponding help text functions
            topic_lemmas = getattr(processor, "TOPIC_LEMMAS", None)
            if topic_lemmas:
                help_text_func = getattr(processor, "help_text", None)
                # If topic lemmas are given, a help_text function
                # should also be present
                assert help_text_func is not None
                if help_text_func is not None:
                    for lemma in topic_lemmas:
                        help_texts[lemma].append(help_text_func)
        cls._help_texts = help_texts

        # Coalesce the grammar additions from the fragments
        grammar_additions = "\n".join(grammar_fragments)
        # Initialize a singleton parser instance for queries,
        # with the nonterminal 'QueryRoot' as the grammar root
        cls._parser = QueryParser(grammar_additions)

    @staticmethod
    def _parse(toklist):
        """ Parse a token list as a query """
        bp = Query._parser
        num_sent = 0
        num_parsed_sent = 0
        rdc = Reducer(bp.grammar)
        trees = dict()
        sent = []

        for t in toklist:
            if t[0] == TOK.S_BEGIN:
                sent = []
            elif t[0] == TOK.S_END:
                slen = len(sent)
                if not slen:
                    continue
                num_sent += 1
                # Parse the accumulated sentence
                num = 0
                try:
                    # Parse the sentence
                    forest = bp.go(sent)
                    if forest is not None:
                        num = Fast_Parser.num_combinations(forest)
                        if num > 1:
                            # Reduce the resulting forest
                            forest = rdc.go(forest)
                except ParseError:
                    forest = None
                if num > 0:
                    num_parsed_sent += 1
                    # Obtain a text representation of the parse tree
                    trees[num_sent] = ParseForestDumper.dump_forest(forest)

            elif t[0] == TOK.P_BEGIN:
                pass
            elif t[0] == TOK.P_END:
                pass
            else:
                sent.append(t)

        result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
        return result, trees

    def parse(self, result):
        """ Parse the query from its string, returning True if valid """
        self._tree = None  # Erase previous tree, if any
        self._error = None  # Erase previous error, if any
        self._qtype = None  # Erase previous query type, if any
        self._key = None
        self._toklist = None

        q = self._query.strip()
        if not q:
            self.set_error("E_EMPTY_QUERY")
            return False

        toklist = tokenize(q,
                           auto_uppercase=self._auto_uppercase and q.islower())
        toklist = list(toklist)
        # The following seems not to be needed and may complicate things
        # toklist = list(recognize_entities(toklist, enclosing_session=self._session))

        actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
        if actual_q:
            actual_q = actual_q[0].upper() + actual_q[1:]
            if not any(actual_q.endswith(s) for s in ("?", ".", "!")):
                actual_q += "?"

        # Update the beautified query string, as the actual_q string
        # probably has more correct capitalization
        self.set_beautified_query(actual_q)

        if Settings.DEBUG:
            # Log the query string as seen by the parser
            print("Query is: '{0}'".format(actual_q))

        parse_result, trees = Query._parse(toklist)

        if not trees:
            # No parse at all
            self.set_error("E_NO_PARSE_TREES")
            return False

        result.update(parse_result)

        if result["num_sent"] != 1:
            # Queries must be one sentence
            self.set_error("E_MULTIPLE_SENTENCES")
            return False
        if result["num_parsed_sent"] != 1:
            # Unable to parse the single sentence
            self.set_error("E_NO_PARSE")
            return False
        if 1 not in trees:
            # No sentence number 1
            self.set_error("E_NO_FIRST_SENTENCE")
            return False
        # Looks good
        # Store the resulting parsed query as a tree
        tree_string = "S1\n" + trees[1]
        if Settings.DEBUG:
            print(tree_string)
        self._tree = Tree()
        self._tree.load(tree_string)
        # Store the token list
        self._toklist = toklist
        return True

    def execute_from_plain_text(self):
        """ Attempt to execute a plain text query, without having to parse it """
        if not self._query:
            return False
        for processor in self._processors:
            handle_plain_text = getattr(processor, "handle_plain_text", None)
            if handle_plain_text is not None:
                # This processor has a handle_plain_text function:
                # call it
                if handle_plain_text(self):
                    # Successfully handled: we're done
                    return True
        return False

    def execute_from_tree(self):
        """ Execute the query contained in the previously parsed tree;
            return True if successful """
        if self._tree is None:
            self.set_error("E_QUERY_NOT_PARSED")
            return False
        for processor in self._processors:
            self._error = None
            self._qtype = None
            # If a processor defines HANDLE_TREE and sets it to
            # a truthy value, it wants to handle parse trees
            handle_tree = getattr(processor, "HANDLE_TREE", None)
            if handle_tree:
                # Process the tree, which has only one sentence
                self._tree.process(self._session, processor, query=self)
                if self._answer and self._error is None:
                    # The processor successfully answered the query
                    return True
        # No processor was able to answer the query
        return False

    def last_answer(self, *, within_minutes=5):
        """ Return the last answer given to this client, by default
            within the last 5 minutes (0=forever) """
        if not self._client_id:
            # Can't find the last answer if no client_id given
            return None
        # Find the newest non-error, no-repeat query result for this client
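        # Note: "== None" (rather than "is None") is intentional here,
        # since this is a SQLAlchemy column comparison, not a Python one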
        q = (self._session.query(QueryRow.answer, QueryRow.voice).filter(
            QueryRow.client_id == self._client_id).filter(
                QueryRow.qtype != "Repeat").filter(QueryRow.error == None))
        if within_minutes > 0:
            # Apply a timestamp filter
            since = datetime.utcnow() - timedelta(minutes=within_minutes)
            q = q.filter(QueryRow.timestamp >= since)
        # Sort to get the newest query that fulfills the criteria
        last = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
        return None if last is None else tuple(last)

    def fetch_context(self, *, within_minutes=10):
        """ Return the context from the last answer given to this client,
            by default within the last 10 minutes (0=forever) """
        if not self._client_id:
            # Can't find the last answer if no client_id given
            return None
        # Find the newest non-error, no-repeat query result for this client
        q = (self._session.query(QueryRow.context).filter(
            QueryRow.client_id == self._client_id).filter(
                QueryRow.qtype != "Repeat").filter(QueryRow.error == None))
        if within_minutes > 0:
            # Apply a timestamp filter
            since = datetime.utcnow() - timedelta(minutes=within_minutes)
            q = q.filter(QueryRow.timestamp >= since)
        # Sort to get the newest query that fulfills the criteria
        ctx = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
        if ctx is None:
            return None
        # This function normally returns a dict that has been decoded from JSON
        return ctx[0]

    @property
    def query(self):
        return self._query

    @property
    def query_lower(self):
        return self._query.lower()

    @property
    def beautified_query(self):
        """ Return the query string that will be reflected back to the client """
        return self._beautified_query

    def set_beautified_query(self, q):
        """ Set the query string that will be reflected back to the client """
        self._beautified_query = (
            q.replace("embla", "Embla").replace("miðeind", "Miðeind").replace(
                "Guðni Th ", "Guðni Th. ")  # By presidential request :)
        )

    def lowercase_beautified_query(self):
        """ If we know that no uppercase words occur in the query,
            except the initial capital, this function can be called
            to adjust the beautified query string accordingly. """
        self.set_beautified_query(self._beautified_query.capitalize())

    def query_is_command(self):
        """ Called from a query processor if the query is a command, not a question """
        # Put a period at the end of the beautified query text
        # instead of a question mark
        if self._beautified_query.endswith("?"):
            self._beautified_query = self._beautified_query[:-1] + "."

    @property
    def expires(self):
        """ Expiration time stamp for this query answer, if any """
        return self._expires

    def set_expires(self, ts):
        self._expires = ts

    @property
    def url(self):
        """ URL answer associated with this query """
        return self._url

    def set_url(self, u):
        self._url = u

    @property
    def source(self):
        """ Source of answer to this query """
        return self._source

    def set_source(self, s):
        self._source = s

    @property
    def location(self):
        return self._location

    @property
    def token_list(self):
        return self._toklist

    def set_qtype(self, qtype):
        """ Set the query type ('Person', 'Title', 'Company', 'Entity'...) """
        self._qtype = qtype

    def set_answer(self, response, answer, voice_answer=None):
        """ Set the answer to the query """
        # Detailed response (this is usually a dict)
        self._response = response
        # Single best answer, as a displayable string
        self._answer = answer
        # A voice version of the single best answer
        self._voice_answer = voice_answer

    def set_key(self, key):
        """ Set the query key, i.e. the term or string used to execute the query """
        # This is for instance a person name in nominative case
        self._key = key

    def set_error(self, error):
        """ Set an error result """
        self._error = error

    def qtype(self):
        """ Return the query type """
        return self._qtype

    @property
    def is_voice(self):
        """ Return True if this is a voice query """
        return self._voice

    def response(self):
        """ Return the detailed query answer """
        return self._response

    def answer(self):
        """ Return the 'single best' displayable query answer """
        return self._answer

    def voice_answer(self):
        """ Return a voice version of the 'single best' answer, if any """
        return self._voice_answer

    def key(self):
        """ Return the query key """
        return self._key

    def error(self):
        """ Return the query error, if any """
        return self._error

    def set_context(self, ctx):
        """ Set a query context that will be stored and made available
            to the next query from the same client """
        self._context = ctx

    @property
    def context(self):
        """ Return the context that has been set by self.set_context() """
        return self._context

    @classmethod
    def try_to_help(cls, query, result):
        """ Attempt to help the user in the case of a failed query,
            based on lemmas in the query string """
        # Collect a set of lemmas that occur in the query string
        lemmas = set()
        with BIN_Db.get_db() as db:
            for token in query.lower().split():
                if token.isalpha():
                    m = db.meanings(token)
                    if not m:
                        # Try an uppercase version, just in case (pun intended)
                        m = db.meanings(token.capitalize())
                    if m:
                        lemmas |= set(mm.stofn.lower() for mm in m)
        # Collect a list of potential help text functions from the query modules
        help_text_funcs = []
        for lemma in lemmas:
            help_text_funcs.extend([
                (lemma, help_text_func)
                for help_text_func in cls._help_texts.get(lemma, [])
            ])
        if help_text_funcs:
            # Found at least one help text func matching a lemma in the query
            # Select a function at random and invoke it with the matched
            # lemma as a parameter
            lemma, help_text_func = random.choice(help_text_funcs)
            result["answer"] = result["voice"] = help_text_func(lemma)
            result["valid"] = True

    def execute(self):
        """ Check whether the parse tree is describes a query, and if so,
            execute the query, store the query answer in the result dictionary
            and return True """
        if Query._parser is None:
            Query.init_class()
        # By default, the result object contains the 'raw' query
        # string (the one returned from the speech-to-text processor)
        # as well as the beautified version of that string - which
        # usually starts with an uppercase letter and has a trailing
        # question mark (or other ending punctuation).
        result = dict(q_raw=self.query, q=self.beautified_query)
        # First, try to handle this from plain text, without parsing:
        # shortcut to a successful, plain response
        if not self.execute_from_plain_text():
            if not self.parse(result):
                # Unable to parse the query
                if Settings.DEBUG:
                    print("Unable to parse query, error {0}".format(
                        self.error()))
                result["error"] = self.error()
                result["valid"] = False
                return result
            if not self.execute_from_tree():
                # This is a query, but its execution failed for some reason:
                # return the error
                # if Settings.DEBUG:
                #     print("Unable to execute query, error {0}".format(q.error()))
                result["error"] = self.error() or "E_UNABLE_TO_EXECUTE_QUERY"
                result["valid"] = True
                return result
        # Successful query: return the answer in response
        if self._answer:
            result["answer"] = self._answer
        if self._voice and self._voice_answer:
            # This is a voice query and we have a voice answer to it
            result["voice"] = self._voice_answer
        if self._voice:
            # Optimize the response to voice queries:
            # we don't need detailed information about alternative
            # answers or their sources
            result["response"] = dict(answer=self._answer or "")
        else:
            # Return a detailed response if not a voice query
            result["response"] = self._response
        # Re-assign the beautified query string, in case the query processor modified it
        result["q"] = self.beautified_query
        # ...and the query type, as a string ('Person', 'Entity', 'Title' etc.)
        result["qtype"] = qt = self.qtype()
        # ...and the key used to retrieve the answer, if any
        result["key"] = self.key()
        # ...and a URL, if any has been set by the query processor
        if self.url:
            result["open_url"] = self.url
        # ...and the source, if set by the query processor
        if self.source:
            result["source"] = self.source
        if not self._voice and qt == "Person":
            # For a person query, add an image (if available)
            img = get_image_url(self.key(), enclosing_session=self._session)
            if img is not None:
                result["image"] = dict(
                    src=img.src,
                    width=img.width,
                    height=img.height,
                    link=img.link,
                    origin=img.origin,
                    name=img.name,
                )
        result["valid"] = True
        if Settings.DEBUG:
            # Dump query results to the console
            def converter(o):
                """ Ensure that datetime is output in ISO format to JSON """
                if isinstance(o, datetime):
                    return o.isoformat()[0:16]
                return None

            print("{0}".format(
                json.dumps(result,
                           indent=3,
                           ensure_ascii=False,
                           default=converter)))
        return result
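
# A minimal, hedged usage sketch for this second Query variant. The function
# name answer_query, the use of SessionContext and the default argument values
# are assumptions for illustration; the surrounding web layer is not shown.
def answer_query(query_text, client_id=None, voice=False):
    """ Run a query string through Query.execute() and return its result dict """
    with SessionContext(commit=True) as session:
        q = Query(
            session,
            query_text,
            voice=voice,
            auto_uppercase=False,
            location=None,        # optionally a (lat, lon) tuple
            client_id=client_id,
        )
        # execute() returns a dict with keys such as 'valid', 'q',
        # 'answer' or 'error', 'qtype', 'key' and optionally 'voice'
        return q.execute()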