def make_tree(text: str) -> Tree:
    """Tokenize and parse ``text``, returning a ``Tree`` loaded with all sentences.

    Every sentence is required to parse; an ``AssertionError`` carrying the
    sentence text is raised for the first one that does not.
    """
    toklist = tokenize(text)
    parser = Fast_Parser(verbose=False)
    job = IncrementalParser(parser, toklist, verbose=False)
    # One dump section per sentence, in document order; sentence
    # numbers are 1-based
    sections = []
    for paragraph in job.paragraphs():
        for sent in paragraph.sentences():
            sent_no = len(sections) + 1
            token_count = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Token descriptions that accompany the tree dump
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Verbose text dump of the sentence's parse tree
            dump = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Section layout: S<number>, C<score>, L<token count>, then the dump
            sections.append(
                "S{0}\nC{1}\nL{2}\n{3}\n".format(
                    sent_no, sent.score, token_count, dump
                )
            )
    result = Tree()
    result.load("".join(sections))
    return result
def all_matches(
    cls,
    criteria: Mapping[str, Any],
    pattern: str,
    enclosing_session: Optional[Session] = None,
) -> Iterator[Tuple["Article", int, SimpleTree]]:
    """Yield ``(article, sentence_index, match)`` for every pattern match.

    Articles are selected with ``cls.articles(criteria, ...)``; those whose
    ``tree`` attribute is None are skipped. Each remaining article's tree is
    loaded and every simple tree in it is searched for ``pattern``.
    The session context stays open for as long as the generator is consumed.
    """
    with SessionContext(
        commit=True, read_only=True, session=enclosing_session
    ) as session:
        candidates = cls.articles(criteria, enclosing_session=session)
        for article in (art for art in candidates if art.tree is not None):
            parsed = Tree(url=article.url or "", authority=article.authority)
            parsed.load(article.tree)
            for sent_ix, stree in parsed.simple_trees():
                for hit in stree.all_matches(pattern):
                    yield (article, sent_ix, hit)
def gen_simple_trees(criteria):
    """Yield simplified parse trees from articles matching ``criteria``.

    Each item is a tuple
    ``(simple_tree, score, length, article_uuid, article_url, sentence_index)``.
    Articles without a root domain, or whose root domain contains
    "raduneyti" or "lemurinn", are skipped, as are articles whose tree fails
    to load (the exception is printed). Sentences shorter than
    MIN_SENT_LENGTH whitespace-separated tokens, or sharing any lowercased
    token with ENGLISH_WORDS, are not yielded.
    """
    excluded_markers = ("raduneyti", "lemurinn")
    for article in Article.articles(criteria):
        domain = article.root_domain
        if not domain or any(marker in domain for marker in excluded_markers):
            continue
        try:
            tree = Tree(url=article.url, authority=article.authority)
            tree.load(article.tree)
        except Exception as e:
            print("Exception loading tree in {0}: {1}".format(article.url, e))
            continue
        for ix, stree in tree.simple_trees():
            words = stree.text.split()
            if len(words) < MIN_SENT_LENGTH:
                continue
            if {word.lower() for word in words} & ENGLISH_WORDS:
                # The sentence contains at least one word from the English bag
                continue
            yield stree, tree.score(ix), tree.length(ix), article.uuid, article.url, ix
def go_single(self, url: str) -> None:
    """Process one article; intended to run inside a multiprocessing pool worker.

    Loads the article with the given URL, runs every processor module over
    its tree or token container (chosen by the module's PROCESSOR_TYPE),
    stamps the article as processed and commits. On any exception the
    transaction is rolled back, the error is printed and the exception is
    re-raised.
    """
    assert self._db is not None
    print("Processing article {0}".format(url))
    sys.stdout.flush()
    # The processor modules are imported lazily, once per process
    if self.pmodules is None:
        self.pmodules = [importlib.import_module(name) for name in self.processors]
    with closing(self._db.session) as session:
        try:
            article = session.query(Article).filter_by(url=url).one_or_none()
            if article is not None:
                if article.tree and article.tokens:
                    tree = Tree(url, article.authority)
                    tree.load(article.tree)
                    token_container = TokenContainer(
                        article.tokens, url, article.authority
                    )
                    for module in self.pmodules:
                        ptype = getattr(module, "PROCESSOR_TYPE")  # type: str
                        if ptype == "tree":
                            target = tree
                        elif ptype == "token":
                            target = token_container
                        else:
                            target = None
                        if target is not None:
                            target.process(session, module)
                        else:
                            assert False, (
                                "Unknown processor type '{0}'; should be 'tree' or 'token'"
                                .format(ptype))
                # The timestamp is set whether or not any processor ran
                article.processed = datetime.utcnow()
            else:
                print("Article not found in scraper database")
            session.commit()
        except Exception as e:
            session.rollback()
            print(
                "Exception in article {0}, transaction rolled back\nException: {1}"
                .format(url, e))
            raise
    sys.stdout.flush()
def gen_simple_trees(criteria, stats):
    """Yield ``(simple_tree, score, length)`` for articles matching ``criteria``.

    Articles without a root domain, or whose root domain contains
    "raduneyti" (ministry websites), are skipped. As a side effect,
    ``stats["parsed"]`` is overwritten with the parse timestamp of each
    article just before its tree is loaded.
    """
    for article in Article.articles(criteria):
        domain = article.root_domain
        if domain and "raduneyti" not in domain:
            tree = Tree(url=article.url, authority=article.authority)
            stats["parsed"] = article.parsed
            tree.load(article.tree)
            for ix, stree in tree.simple_trees():
                yield stree, tree.score(ix), tree.length(ix)
def gen_simple_trees(criteria, stats):
    """Generate ``(simple_tree, score, length)`` tuples from matching articles.

    Ministry websites (root domain containing "raduneyti") and articles with
    no root domain are left out. ``stats["parsed"]`` is set to the parse
    timestamp of every article that is visited, before its tree is loaded.
    """
    for art in Article.articles(criteria):
        root = art.root_domain
        usable = bool(root) and "raduneyti" not in root
        if not usable:
            continue
        tree = Tree(url=art.url, authority=art.authority)
        stats["parsed"] = art.parsed
        tree.load(art.tree)
        for sent_ix, simple in tree.simple_trees():
            yield simple, tree.score(sent_ix), tree.length(sent_ix)
def go_single(self, url):
    """Process one article; intended to run inside a multiprocessing pool worker.

    Each processor module is applied to the article's tree when its
    PROCESSOR_TYPE is "tree" and to its token container when it is "token";
    modules of any other type are ignored. The article is then stamped as
    processed and the session committed. Any exception triggers a rollback,
    is printed, and is re-raised.
    """
    print("Processing article {0}".format(url))
    sys.stdout.flush()
    # The processor modules are imported lazily, once per process
    if self.pmodules is None:
        self.pmodules = [importlib.import_module(name) for name in self.processors]
    with closing(self._db.session) as session:
        try:
            article = session.query(Article).filter_by(url=url).one_or_none()
            if article is not None:
                if article.tree and article.tokens:
                    tree = Tree(url, article.authority)
                    tree.load(article.tree)
                    token_container = TokenContainer(article.tokens, url)
                    for module in self.pmodules:
                        if module.PROCESSOR_TYPE == "tree":
                            tree.process(session, module)
                        elif module.PROCESSOR_TYPE == "token":
                            token_container.process(session, module)
                # The timestamp is set whether or not any processor ran
                article.processed = datetime.utcnow()
            else:
                print("Article not found in scraper database")
            session.commit()
        except Exception as e:
            session.rollback()
            message = "Exception in article {0}, transaction rolled back\nException: {1}"
            print(message.format(url, e))
            raise
    sys.stdout.flush()
def go_single(self, url):
    """Process one article; intended to run inside a multiprocessing pool worker.

    If the article has a stored tree, every processor module is run over it.
    The article is then stamped as processed and the session committed.
    Any exception triggers a rollback, is printed, and is re-raised.
    """
    print("Processing article {0}".format(url))
    sys.stdout.flush()
    # The processor modules are imported lazily, once per process
    if self.pmodules is None:
        self.pmodules = [importlib.import_module(name) for name in self.processors]
    with closing(self._db.session) as session:
        try:
            article = session.query(Article).filter_by(url=url).one_or_none()
            if article is not None:
                if article.tree:
                    tree = Tree(url, article.authority)
                    tree.load(article.tree)
                    for module in self.pmodules:
                        tree.process(session, module)
                # The timestamp is set whether or not a tree was present
                article.processed = datetime.utcnow()
            else:
                print("Article not found in scraper database")
            session.commit()
        except Exception as e:
            session.rollback()
            print(
                "Exception in article {0}, transaction rolled back\nException: {1}"
                .format(url, e))
            raise
    sys.stdout.flush()
class ApiFs(LoggingMixIn, Operations):
    """Api filesystem for http://korchasa.host

    Paths are resolved against an in-memory ``Tree``; nodes that are not yet
    present are fetched through the ``loader`` callable given to the
    constructor.
    """

    def __init__(self, loader):
        self.tree = Tree()
        self.loader = loader
        # Counter used to hand out file handle numbers
        self.fd = 0

    def create(self, path, mode=0o644):
        """Register a new file at ``path`` and return a fresh handle number."""
        # NOTE(review): neither `self.files` nor `self._file` is defined in
        # this class; unless a base class provides them this raises
        # AttributeError - confirm the intended behaviour.
        self.files[path] = self._file(mode)
        self.fd += 1
        return self.fd

    getxattr = None

    def getattr(self, path, fh=None):
        """Return the stat dict for ``path`` or raise ``FuseOSError(ENOENT)``."""
        # Paths whose second character is a dot are reported as missing
        # without consulting the loader
        looks_hidden = len(path) > 1 and path[1] == '.'
        if looks_hidden:
            raise FuseOSError(ENOENT)
        node = self.tree.load(path, self.loader)
        if not node:
            raise FuseOSError(ENOENT)
        return self._fs_node(node)

    def open(self, path, flags):
        """Hand out the next handle number; the path itself is not inspected."""
        self.fd += 1
        return self.fd

    def read(self, path, size, offset, fh):
        """Return ``size`` items of the node's ``'data'`` starting at ``offset``."""
        data = self.tree.node(path)['data']
        return data[offset:offset + size]

    def readdir(self, path, fh):
        """List the entries under ``path``, loading them first if necessary."""
        nearest = self.tree.nearest(path)
        if not nearest.get('loaded'):
            self.loader(path, self.tree)
        entries = ['.', '..']
        for url in list(self.tree.children(path).keys()):
            if url != '/':
                entries.append(url.lstrip('/'))
        return entries

    def statfs(self, path):
        """Return fixed filesystem statistics."""
        return {'f_bsize': 512, 'f_blocks': 4096, 'f_bavail': 2048}

    def _fs_node(self, tree_node):
        """Build a stat dict for a tree node; all timestamps are the current time."""
        if tree_node.get('dir'):
            return {
                'st_mode': (S_IFDIR | 0o755),
                'st_ctime': time(),
                'st_mtime': time(),
                'st_atime': time(),
                'st_nlink': 2,
            }
        # NOTE(review): len() raises TypeError if a file node has no 'data'
        # entry - confirm that the loader always sets it.
        return {
            'st_mode': (S_IFREG | 0o644),
            'st_nlink': 1,
            'st_size': len(tree_node.get('data')),
            'st_ctime': time(),
            'st_mtime': time(),
            'st_atime': time(),
        }
def all_matches(cls, criteria, pattern, enclosing_session=None):
    """Generator of SimpleTree matches (see matcher.py) from articles
    matching the given criteria and the pattern.

    Yields ``(article, sentence_index, match)`` tuples. ``enclosing_session``,
    if given, is passed on to ``SessionContext`` as its ``session`` argument;
    the session context stays open for as long as the generator is consumed.

    Articles whose ``tree`` attribute is None are skipped instead of being
    handed to ``Tree.load()``.
    """
    with SessionContext(
        commit=True, read_only=True, session=enclosing_session
    ) as session:
        for a in cls.articles(criteria, enclosing_session=session):
            if a.tree is None:
                # No stored parse tree for this article: nothing to match
                continue
            tree = Tree(url=a.url, authority=a.authority)
            tree.load(a.tree)
            for ix, simple_tree in tree.simple_trees():
                for match in simple_tree.all_matches(pattern):
                    yield (a, ix, match)
def _make_tree(text: str) -> "tuple[Tree, str]":
    """Tokenize and parse text, create a tree representation string from all
    the parse trees, and return the Tree object together with the token JSON.

    Returns a ``(tree, tokens_json)`` pair: ``tree`` is a ``Tree`` loaded with
    every sentence, and ``tokens_json`` is a compact JSON string holding one
    list per paragraph, each containing the token dicts of its sentences.

    Raises ``AssertionError`` if a sentence does not parse.
    """
    toklist = tokenize(text)
    fp = Fast_Parser(verbose=False)
    ip = IncrementalParser(fp, toklist, verbose=False)
    # Token dicts, grouped by paragraph and then by sentence
    pgs = []
    # Dict of parse trees in string dump format,
    # stored by sentence index (1-based)
    trees = OrderedDict()
    num_sent = 0
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            num_sent += 1
            num_tokens = len(sent)
            assert sent.parse(), "Sentence does not parse: " + sent.text
            # Obtain a text representation of the parse tree
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            # Create a verbose text representation of
            # the highest scoring parse tree
            assert sent.tree is not None
            tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Add information about the sentence tree's score
            # and the number of tokens
            trees[num_sent] = "\n".join(
                ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
            )
            pgs[-1].append(token_dicts)
    # Create a tree representation string out of
    # all the accumulated parse trees
    tree_string = "".join(
        "S{0}\n{1}\n".format(key, val) for key, val in trees.items()
    )
    tokens_json = json.dumps(pgs, separators=(",", ":"), ensure_ascii=False)
    tree = Tree()
    tree.load(tree_string)
    return tree, tokens_json
class Query:
    """A Query is initialized by parsing a query string using QueryRoot
    as the grammar root nonterminal. The Query can then be executed by
    processing the best parse tree using the nonterminal handlers
    of this module, returning a result object if successful.
    """

    def __init__(self, session):
        self._session = session
        self._error = None
        self._answer = None
        self._tree = None
        self._qtype = None
        self._key = None

    @staticmethod
    def _parse(toklist):
        """Parse a token list as a query.

        Returns ``(result, trees)`` where ``result`` holds the counts
        ``num_sent`` and ``num_parsed_sent`` and ``trees`` maps the 1-based
        sentence number to the dumped parse tree of each parsed sentence.
        """
        # Parse with the nonterminal 'QueryRoot' as the grammar root
        with Fast_Parser(verbose=False, root=_QUERY_ROOT) as bp:
            num_sent = 0
            num_parsed_sent = 0
            rdc = Reducer(bp.grammar)
            trees = dict()
            sent = []
            for t in toklist:
                if t[0] == TOK.S_BEGIN:
                    sent = []
                elif t[0] == TOK.S_END:
                    if not sent:
                        # Empty sentence: not counted
                        continue
                    num_sent += 1
                    # Parse the accumulated sentence
                    num = 0
                    try:
                        forest = bp.go(sent)
                        if forest is not None:
                            num = Fast_Parser.num_combinations(forest)
                            if num > 1:
                                # Reduce the resulting forest
                                forest = rdc.go(forest)
                    except ParseError:
                        forest = None
                        # The error may be raised after num was set (during
                        # reduction); reset it so that the sentence is not
                        # counted as parsed and no dump of a missing forest
                        # is attempted
                        num = 0
                    if num > 0:
                        num_parsed_sent += 1
                        # Obtain a text representation of the parse tree
                        trees[num_sent] = ParseForestDumper.dump_forest(forest)
                elif t[0] == TOK.P_BEGIN or t[0] == TOK.P_END:
                    # Paragraph markers carry no query content
                    pass
                else:
                    sent.append(t)
        result = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent)
        return result, trees

    def parse(self, toklist, result):
        """Parse the token list as a query, returning True if valid.

        On success the parsed tree is stored on the instance. On failure an
        error code is set (see ``error()``) and False is returned. ``result``
        is updated with the sentence counts once at least one tree exists.
        """
        self._tree = None  # Erase previous tree, if any
        self._error = None  # Erase previous error, if any
        self._qtype = None  # Erase previous query type, if any
        self._key = None
        parse_result, trees = Query._parse(toklist)
        if not trees:
            # No parse at all
            self.set_error("E_NO_TREES")
            return False
        result.update(parse_result)
        if result["num_sent"] != 1:
            # Queries must be one sentence
            self.set_error("E_MULTIPLE_SENTENCES")
            return False
        if result["num_parsed_sent"] != 1:
            # Unable to parse the single sentence
            self.set_error("E_NO_PARSE")
            return False
        if 1 not in trees:
            # No sentence number 1
            self.set_error("E_NO_FIRST_SENTENCE")
            return False
        # Looks good: store the resulting parsed query as a tree
        tree_string = "S1\n" + trees[1]
        self._tree = Tree()
        self._tree.load(tree_string)
        return True

    def execute(self):
        """Execute the query contained in the previously parsed tree;
        return True if no error was set during processing."""
        if self._tree is None:
            self.set_error("E_QUERY_NOT_PARSED")
            return False
        self._error = None
        self._qtype = None
        with closing(BIN_Db.get_db()) as bin_db:
            # Process the tree, which has only one sentence
            self._tree.process(self._session, _THIS_MODULE, bin_db, query=self)
        return self._error is None

    def set_qtype(self, qtype):
        """Set the query type ('Person', 'Title', 'Company', 'Entity'...)"""
        self._qtype = qtype

    def set_answer(self, answer):
        """Set the answer to the query"""
        self._answer = answer

    def set_key(self, key):
        """Set the query key, i.e. the term or string used to execute the query"""
        # This is for instance a person name in nominative case
        self._key = key

    def set_error(self, error):
        """Set an error result"""
        self._error = error

    def qtype(self):
        """Return the query type"""
        return self._qtype

    def answer(self):
        """Return the query answer"""
        return self._answer

    def key(self):
        """Return the query key"""
        return self._key

    def error(self):
        """Return the query error, if any"""
        return self._error
def test_entities():
    """Parse a fixed text, run the entities processor over the resulting tree
    and check that exactly the expected entity definitions are produced."""
    text = """ Ég skipti við flugfélagið AirBerlin áður en það varð gjaldþrota. Danska byggingavörukeðjan Bygma hefur keypt íslenska verslunarfyrirtækið Húsasmiðjuna. Bandarísku fjárfestingarsjóðirnir Attestor Capital og Goldman Sachs eru hluthafar í Arion banka. Fosshótel, stór hótelkeðja, var rekin með tapi í fyrra. Lax, stór fiskur af ætt laxfiska, er veiddur í íslenskum ám. Silfraður lax, fiskur af ætt laxfiska, er veiddur í íslenskum ám. Ég ræddi við fulltrúa Norðuráls (álverksmiðjunnar í Hvalfirði) í gær. Ég ræddi við fulltrúa Norðuráls (í Hvalfirði) í gær. Primera Air var íslenskt flugfélag. Ef veðrið er gott þá fullyrði ég að Primera Air sé danskt flugfélag. Villeneuve-Loubet er franskt þorp. Það er hægt að fá bragðgóðan ís í ísbúðinni Valdísi úti á Granda. Í miðbæ Reykjavíkur er herrafataverslunin Geysir. Mér er sagt að Geysir sé hættur að gjósa. Geysir er hættur að gjósa. Geysir er gamall goshver. Fyrirtækið Apple-búðin selur Apple Mac tölvur. Fyrirtækið Origo selur IBM tölvur. Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf. 
"""
    toklist = tokenize(text)
    parser = Fast_Parser(verbose=False)
    job = IncrementalParser(parser, toklist, verbose=False)
    # One dump section per sentence, in document order; sentence
    # numbers are 1-based
    sections = []
    for paragraph in job.paragraphs():
        for sent in paragraph.sentences():
            sent_no = len(sections) + 1
            token_count = len(sent)
            assert sent.parse(), "Sentence does not parse"
            token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
            dump = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
            # Section layout: S<number>, C<score>, L<token count>, then the dump
            sections.append(
                "S{0}\nC{1}\nL{2}\n{3}\n".format(
                    sent_no, sent.score, token_count, dump
                )
            )
    tree = Tree()
    tree.load("".join(sections))
    session = SessionShim()
    tree.process(session, entities)
    # (entity, verb, definition) triples, checked in this order
    expected = [
        ("Bygma", "er", "dönsk byggingavörukeðja"),
        ("Húsasmiðjan", "er", "íslenskt verslunarfyrirtæki"),
        ("Goldman Sachs", "er", "bandarískur fjárfestingarsjóður"),
        ("Attestor Capital", "er", "bandarískur fjárfestingarsjóður"),
        ("Primera Air", "var", "íslenskt flugfélag"),
        ("Villeneuve-Loubet", "er", "franskt þorp"),
        ("Valdís", "er", "ísbúð"),
        ("Fosshótel", "var", "rekin með tapi"),
        ("Fosshótel", "er", "stór hótelkeðja"),
        ("Norðurál", "er", "álverksmiðjan í Hvalfirði"),
        ("Lax", "er", "stór fiskur af ætt laxfiska"),
        ("Geysir", "er", "gamall goshver"),
        ("Eimskipafélag Íslands hf", "er", "skipafélag"),
        ("Origo", "er", "fyrirtæki"),
        ("AirBerlin", "er", "flugfélag"),
    ]
    for triple in expected:
        session.check(triple)
    assert session.is_empty()
class Query: """ A Query is initialized by parsing a query string using QueryRoot as the grammar root nonterminal. The Query can then be executed by processing the best parse tree using the nonterminal handlers given above, returning a result object if successful. """ # Processors that handle parse trees _tree_processors: List[ModuleType] = [] # Handler functions within processors that handle plain text _text_processors: List[Callable[["Query"], bool]] = [] # Singleton instance of the query parser _parser: Optional[QueryParser] = None # Help texts associated with lemmas _help_texts: Dict[str, List[Callable]] = dict() def __init__( self, session, # SQLAlchemy session query: str, voice: bool, auto_uppercase: bool, location: Optional[LocationType], client_id: Optional[str], client_type: Optional[str], ) -> None: self._query = q = self._preprocess_query_string(query) self._session = session self._location = location # Prepare a "beautified query" string that can be # shown in a client user interface. By default, this # starts with an uppercase letter and ends with a # question mark, but this can be modified during the # processing of the query. 
self.set_beautified_query(beautify_query(q)) self._voice = voice self._auto_uppercase = auto_uppercase self._error: Optional[str] = None # A detailed answer, which can be a list or a dict self._response: Optional[ResponseMapping] = None # A single "best" displayable text answer self._answer: Optional[str] = None # A version of self._answer that can be # fed to a voice synthesizer self._voice_answer: Optional[str] = None self._tree: Optional[Tree] = None self._qtype: Optional[str] = None self._key: Optional[str] = None self._toklist: Optional[List[Tok]] = None # Expiration timestamp, if any self._expires: Optional[datetime] = None # URL assocated with query, can be set by query response handler # and subsequently provided to the remote client self._url: Optional[str] = None # Command returned by query self._command: Optional[str] = None # Client id, if known self._client_id = client_id # Client type, if known self._client_type = client_type # Source of answer to query self._source: Optional[str] = None # Query context, which is None until fetched via self.fetch_context() # This should be a dict that can be represented in JSON self._context: Optional[ContextDict] = None def _preprocess_query_string(self, q: str) -> str: """ Preprocess the query string prior to further analysis """ if not q: return q qf = re.sub(_IGNORED_PREFIX_RE, "", q, flags=re.IGNORECASE) # Remove " embla" suffix, if present qf = re.sub(r"\s+embla$", "", qf, flags=re.IGNORECASE) # Fix common Google ASR mistake: 'hæ embla' is returned as 'bæjarblað' if not qf and q == "bæjarblað": q = "hæ embla" # If stripping the prefixes results in an empty query, # just return original query string unmodified. return qf or q @classmethod def init_class(cls) -> None: """ Initialize singleton data, i.e. the list of query processor modules and the query parser instance """ all_procs = [] tree_procs = [] text_procs = [] # Load the query processor modules found in the # queries directory. 
The modules can be tree and/or text processors, # and we sort them into two lists, accordingly. modnames = modules_in_dir("queries") for modname in sorted(modnames): try: m = importlib.import_module(modname) all_procs.append(m) if getattr(m, "HANDLE_TREE", False): # This is a tree processor tree_procs.append(m) handle_plain_text = getattr(m, "handle_plain_text", None) if handle_plain_text is not None: # This is a text processor: # store a reference to its handler function text_procs.append(handle_plain_text) except ImportError as e: logging.error( "Error importing query processor module {0}: {1}".format( modname, e)) cls._tree_processors = tree_procs cls._text_processors = text_procs # Obtain query grammar fragments from the tree processors grammar_fragments = [] for processor in tree_procs: # Check whether this tree processor supplies a query grammar fragment fragment = getattr(processor, "GRAMMAR", None) if fragment and isinstance(fragment, str): # Looks legit: add it to our list grammar_fragments.append(fragment) # Collect topic lemmas that can be used to provide # context-sensitive help texts when queries cannot be parsed help_texts = defaultdict(list) for processor in all_procs: # Collect topic lemmas and corresponding help text functions topic_lemmas = getattr(processor, "TOPIC_LEMMAS", None) if topic_lemmas: help_text_func = getattr(processor, "help_text", None) # If topic lemmas are given, a help_text function # should also be present assert help_text_func is not None if help_text_func is not None: for lemma in topic_lemmas: help_texts[lemma].append(help_text_func) cls._help_texts = help_texts # Coalesce the grammar additions from the fragments grammar_additions = "\n".join(grammar_fragments) # Initialize a singleton parser instance for queries, # with the nonterminal 'QueryRoot' as the grammar root cls._parser = QueryParser(grammar_additions) @staticmethod def _parse(toklist: Iterable[Tok]) -> Tuple[ResponseDict, Dict[int, str]]: """ Parse a token list as 
a query """ bp = Query._parser assert bp is not None num_sent = 0 num_parsed_sent = 0 rdc = Reducer(bp.grammar) trees: Dict[int, str] = dict() sent: List[Tok] = [] for t in toklist: if t[0] == TOK.S_BEGIN: sent = [] elif t[0] == TOK.S_END: slen = len(sent) if not slen: continue num_sent += 1 # Parse the accumulated sentence num = 0 try: # Parse the sentence forest = bp.go(sent) if forest is not None: num = Fast_Parser.num_combinations(forest) if num > 1: # Reduce the resulting forest forest = rdc.go(forest) except ParseError: forest = None num = 0 if num > 0: num_parsed_sent += 1 # Obtain a text representation of the parse tree trees[num_sent] = ParseForestDumper.dump_forest(forest) elif t[0] == TOK.P_BEGIN: pass elif t[0] == TOK.P_END: pass else: sent.append(t) result: ResponseDict = dict(num_sent=num_sent, num_parsed_sent=num_parsed_sent) return result, trees @staticmethod def _query_string_from_toklist(toklist: Iterable[Tok]) -> str: """ Re-create a query string from an auto-capitalized token list """ actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt)) if actual_q: # Fix stuff that the auto-capitalization tends to get wrong, # such as 'í Dag' for wrong, correct in _CAPITALIZATION_REPLACEMENTS: actual_q = actual_q.replace(wrong, correct) # Capitalize the first letter of the query actual_q = actual_q[0].upper() + actual_q[1:] # Terminate the query with a question mark, # if not otherwise terminated if not any(actual_q.endswith(s) for s in ("?", ".", "!")): actual_q += "?" 
return actual_q def parse(self, result: ResponseDict) -> bool: """ Parse the query from its string, returning True if valid """ self._tree = None # Erase previous tree, if any self._error = None # Erase previous error, if any self._qtype = None # Erase previous query type, if any self._key = None self._toklist = None q = self._query if not q: self.set_error("E_EMPTY_QUERY") return False # Tokenize and auto-capitalize the query string toklist = list( tokenize(q, auto_uppercase=self._auto_uppercase and q.islower())) actual_q = self._query_string_from_toklist(toklist) # Update the beautified query string, as the actual_q string # probably has more correct capitalization self.set_beautified_query(actual_q) # TODO: We might want to re-tokenize the actual_q string with # auto_uppercase=False, since we may have fixed capitalization # errors in _query_string_from_toklist() if Settings.DEBUG: # Log the query string as seen by the parser print("Query is: '{0}'".format(actual_q)) parse_result, trees = Query._parse(toklist) if not trees: # No parse at all self.set_error("E_NO_PARSE_TREES") return False result.update(parse_result) if result["num_sent"] != 1: # Queries must be one sentence self.set_error("E_MULTIPLE_SENTENCES") return False if result["num_parsed_sent"] != 1: # Unable to parse the single sentence self.set_error("E_NO_PARSE") return False if 1 not in trees: # No sentence number 1 self.set_error("E_NO_FIRST_SENTENCE") return False # Looks good # Store the resulting parsed query as a tree tree_string = "S1\n" + trees[1] if Settings.DEBUG: print(tree_string) self._tree = Tree() self._tree.load(tree_string) # Store the token list self._toklist = toklist return True def execute_from_plain_text(self) -> bool: """ Attempt to execute a plain text query, without having to parse it """ if not self._query: return False # Call the handle_plain_text() function in each text processor, # until we find one that returns True, or return False otherwise return any( 
handle_plain_text(self) for handle_plain_text in self._text_processors) def execute_from_tree(self) -> bool: """ Execute the query contained in the previously parsed tree; return True if successful """ if self._tree is None: self.set_error("E_QUERY_NOT_PARSED") return False for processor in self._tree_processors: self._error = None self._qtype = None # Process the tree, which has only one sentence try: self._tree.process(self._session, processor, query=self) if self._answer and self._error is None: # The processor successfully answered the query return True except Exception as e: logging.error( f"Exception in execute_from_tree('{processor.__name__}') " f"for query '{self._query}': {e}") # No processor was able to answer the query return False def last_answer(self, *, within_minutes: int = 5) -> Optional[Tuple[str, str]]: """ Return the last answer given to this client, by default within the last 5 minutes (0=forever) """ if not self._client_id: # Can't find the last answer if no client_id given return None # Find the newest non-error, no-repeat query result for this client q = (self._session.query(QueryRow.answer, QueryRow.voice).filter( QueryRow.client_id == self._client_id).filter( QueryRow.qtype != "Repeat").filter(QueryRow.error == None)) if within_minutes > 0: # Apply a timestamp filter since = datetime.utcnow() - timedelta(minutes=within_minutes) q = q.filter(QueryRow.timestamp >= since) # Sort to get the newest query that fulfills the criteria last = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none() return None if last is None else (last[0], last[1]) def fetch_context(self, *, within_minutes: int = 10) -> Optional[ContextDict]: """ Return the context from the last answer given to this client, by default within the last 10 minutes (0=forever) """ if not self._client_id: # Can't find the last answer if no client_id given return None # Find the newest non-error, no-repeat query result for this client q = (self._session.query(QueryRow.context).filter( 
QueryRow.client_id == self._client_id).filter( QueryRow.qtype != "Repeat").filter(QueryRow.error == None)) if within_minutes > 0: # Apply a timestamp filter since = datetime.utcnow() - timedelta(minutes=within_minutes) q = q.filter(QueryRow.timestamp >= since) # Sort to get the newest query that fulfills the criteria ctx = q.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none() # This function normally returns a dict that has been decoded from JSON return None if ctx is None else ctx[0] @property def query(self) -> str: """ The query text, in its original form """ return self._query @property def query_lower(self) -> str: """ The query text, all lower case """ return self._query.lower() @property def beautified_query(self) -> str: """ Return the query string that will be reflected back to the client """ return self._beautified_query def set_beautified_query(self, q: str) -> None: """ Set the query string that will be reflected back to the client """ self._beautified_query = ( q.replace("embla", "Embla").replace("miðeind", "Miðeind").replace( "Guðni Th ", "Guðni Th. ") # By presidential request :) ) def lowercase_beautified_query(self) -> None: """ If we know that no uppercase words occur in the query, except the initial capital, this function can be called to adjust the beautified query string accordingly. """ self.set_beautified_query(self._beautified_query.capitalize()) def query_is_command(self) -> None: """ Called from a query processor if the query is a command, not a question """ # Put a period at the end of the beautified query text # instead of a question mark if self._beautified_query.endswith("?"): self._beautified_query = self._beautified_query[:-1] + "." 
@property def expires(self) -> Optional[datetime]: """ Expiration time stamp for this query answer, if any """ return self._expires def set_expires(self, ts: datetime) -> None: """ Set an expiration time stamp for this query answer """ self._expires = ts @property def url(self) -> Optional[str]: """ URL answer associated with this query """ return self._url def set_url(self, u: str) -> None: """ Set the URL answer associated with this query """ self._url = u @property def command(self) -> Optional[str]: """ JavaScript command associated with this query """ return self._command def set_command(self, c: str) -> None: """ Set the JavaScript command associated with this query """ self._command = c @property def source(self) -> Optional[str]: """ Return the source of the answer to this query """ return self._source def set_source(self, s: str) -> None: """ Set the source for the answer to this query """ self._source = s @property def location(self) -> Optional[LocationType]: """ The client location, if known, as a (lat, lon) tuple """ return self._location @property def token_list(self) -> Optional[List[Tok]]: """ The original token list for the query """ return self._toklist def qtype(self) -> Optional[str]: """ Return the query type """ return self._qtype def set_qtype(self, qtype: str) -> None: """ Set the query type ('Person', 'Title', 'Company', 'Entity'...) """ self._qtype = qtype def set_answer(self, response: ResponseMapping, answer: str, voice_answer: Optional[str] = None) -> None: """ Set the answer to the query """ # Detailed response (this is usually a dict) self._response = response # Single best answer, as a displayable string self._answer = answer # A voice version of the single best answer self._voice_answer = voice_answer def set_key(self, key: str) -> None: """ Set the query key, i.e. 
the term or string used to execute the query """ # This is for instance a person name in nominative case self._key = key def set_error(self, error: str) -> None: """ Set an error result """ self._error = error @property def is_voice(self) -> bool: """ Return True if this is a voice query """ return self._voice @property def client_id(self) -> Optional[str]: return self._client_id @property def client_type(self) -> Optional[str]: """ Return client type string, e.g. "ios", "android", "www", etc. """ return self._client_type def response(self) -> Optional[ResponseMapping]: """ Return the detailed query answer """ return self._response def answer(self) -> Optional[str]: """ Return the 'single best' displayable query answer """ return self._answer def voice_answer(self) -> Optional[str]: """ Return a voice version of the 'single best' answer, if any """ return self._voice_answer def key(self) -> Optional[str]: """ Return the query key """ return self._key def error(self) -> Optional[str]: """ Return the query error, if any """ return self._error @property def context(self) -> Optional[ContextDict]: """ Return the context that has been set by self.set_context() """ return self._context def set_context(self, ctx: ContextDict) -> None: """ Set a query context that will be stored and made available to the next query from the same client """ self._context = ctx def client_data(self, key: str) -> Optional[ClientDataDict]: """ Fetch client_id-associated data stored in the querydata table """ if not self.client_id: return None with SessionContext(read_only=True) as session: try: client_data = (session.query(QueryData).filter( QueryData.key == key).filter( QueryData.client_id == self.client_id)).one_or_none() return None if client_data is None else client_data.data except Exception as e: logging.error( "Error fetching client '{0}' query data for key '{1}' from db: {2}" .format(self.client_id, key, e)) return None def set_client_data(self, key: str, data: ClientDataDict) -> None: 
""" Setter for client query data """ if not self.client_id: logging.warning("Couldn't save query data, no client ID") return Query.store_query_data(self.client_id, key, data) @staticmethod def store_query_data(client_id: str, key: str, data: ClientDataDict) -> bool: """ Save client query data in the database, under the given key """ assert client_id and key now = datetime.utcnow() try: with SessionContext(commit=True) as session: row = (session.query(QueryData).filter( QueryData.key == key).filter( QueryData.client_id == client_id)).one_or_none() if row is None: # Not already present: insert row = QueryData( client_id=client_id, key=key, created=now, modified=now, data=data, ) session.add(row) else: # Already present: update row.data = data row.modified = now # The session is auto-committed upon exit from the context manager return True except Exception as e: logging.error("Error storing query data in db: {0}".format(e)) return False @classmethod def try_to_help(cls, query: str, result: ResponseDict) -> None: """ Attempt to help the user in the case of a failed query, based on lemmas in the query string """ # Collect a set of lemmas that occur in the query string lemmas = set() with BIN_Db.get_db() as db: for token in query.lower().split(): if token.isalpha(): m = db.meanings(token) if not m: # Try an uppercase version, just in case (pun intended) m = db.meanings(token.capitalize()) if m: lemmas |= set(mm.stofn.lower().replace("-", "") for mm in m) # Collect a list of potential help text functions from the query modules help_text_funcs = [] for lemma in lemmas: help_text_funcs.extend([ (lemma, help_text_func) for help_text_func in cls._help_texts.get(lemma, []) ]) if help_text_funcs: # Found at least one help text func matching a lemma in the query # Select a function at random and invoke it with the matched # lemma as a parameter lemma, help_text_func = random.choice(help_text_funcs) result["answer"] = result["voice"] = help_text_func(lemma) result["valid"] = True 
def execute(self) -> ResponseDict:
    """ Run the query and return a response dictionary describing the outcome.

        The query is first offered to the plain text handlers; only if none
        of them accepts it is the query parsed and handed to the parse tree
        handlers.

        The returned dictionary always contains 'q_raw', 'q' and 'valid'.
        If the query cannot be parsed, 'valid' is False and 'error' is set
        when an error code is available. If the query parses but cannot be
        executed, 'valid' is True and 'error' holds the error code (or
        "E_UNABLE_TO_EXECUTE_QUERY"). On success, whichever of 'answer',
        'voice', 'response', 'qtype', 'key', 'open_url', 'command', 'source'
        and 'image' are available are filled in. """
    if Query._parser is None:
        # Lazily set up the shared parser and query processor modules
        Query.init_class()
    # By default, the result object contains the 'raw' query
    # string (the one returned from the speech-to-text processor)
    # as well as the beautified version of that string - which
    # usually starts with an uppercase letter and has a trailing
    # question mark (or other ending punctuation).
    result: ResponseDict = dict(q_raw=self.query, q=self.beautified_query)
    # First, try to handle this from plain text, without parsing:
    # shortcut to a successful, plain response
    if not self.execute_from_plain_text():
        if not self.parse(result):
            # Unable to parse the query
            err = self.error()
            if err is not None:
                if Settings.DEBUG:
                    print("Unable to parse query, error {0}".format(err))
                result["error"] = err
            result["valid"] = False
            return result
        if not self.execute_from_tree():
            # This is a query, but its execution failed for some reason:
            # return the error
            result["error"] = self.error() or "E_UNABLE_TO_EXECUTE_QUERY"
            result["valid"] = True
            return result
    # Successful query: return the answer in response
    if self._answer:
        result["answer"] = self._answer
    if self._voice and self._voice_answer:
        # This is a voice query and we have a voice answer to it
        result["voice"] = self._voice_answer
    if self._voice:
        # Optimize the response to voice queries:
        # we don't need detailed information about alternative
        # answers or their sources
        result["response"] = dict(answer=self._answer or "")
    elif self._response:
        # Return a detailed response if not a voice query
        result["response"] = self._response
    # Re-assign the beautified query string, in case the query processor modified it
    result["q"] = self.beautified_query
    # ...and the query type, as a string ('Person', 'Entity', 'Title' etc.)
    qt = self.qtype()
    if qt:
        result["qtype"] = qt
    # ...and the key used to retrieve the answer, if any.
    # The key is fetched once and reused for the image lookup below.
    key = self.key()
    if key:
        result["key"] = key
    # ...and a URL, if any has been set by the query processor
    if self.url:
        result["open_url"] = self.url
    # ...and a command, if any has been set
    if self.command:
        result["command"] = self.command
    # ...and the source, if set by query processor
    if self.source:
        result["source"] = self.source
    if not self._voice and qt == "Person" and key is not None:
        # For a person query, add an image (if available)
        img = get_image_url(key, enclosing_session=self._session)
        if img is not None:
            result["image"] = dict(
                src=img.src,
                width=img.width,
                height=img.height,
                link=img.link,
                origin=img.origin,
                name=img.name,
            )
    result["valid"] = True
    if Settings.DEBUG:
        # Dump query results to the console
        def converter(o):
            """ Ensure that datetime is output in ISO format to JSON """
            if isinstance(o, datetime):
                # Keep 'YYYY-MM-DDTHH:MM' only
                return o.isoformat()[0:16]
            return None

        print("{0}".format(
            json.dumps(result, indent=3, ensure_ascii=False, default=converter)))
    return result
def gen_simple_trees(criteria):
    """ Generate simplified parse trees from articles matching the criteria.

        Articles without a root domain, or from excluded websites, are
        skipped, as are articles whose tree cannot be loaded. Each sentence
        of the remaining articles is passed through a series of quality
        filters; sentences that survive are yielded as tuples of
        (simple tree, tree score, tree length, article uuid, article url,
        sentence index). A sentence whose normalized text has already been
        yielded is not yielded again. """
    # Hashes of the normalized text of sentences already yielded
    bigset = set()
    # Token kinds that count towards the minimum content requirement
    content_kinds = (TOK.WORD, TOK.ENTITY, TOK.PERSON)
    for a in Article.articles(criteria):
        # Skip articles from certain websites
        if (
            not a.root_domain
            or "raduneyti" in a.root_domain
            or "lemurinn" in a.root_domain
        ):
            continue
        # Load tree from article
        try:
            tree = Tree(url=a.url, authority=a.authority)
            tree.load(a.tree)
        except Exception as e:
            # Best effort: report the problem and move on to the next article
            print("Exception loading tree in {0}: {1}".format(a.url, e))
            continue
        # Yield simple trees for each article sentence
        for ix, stree in tree.simple_trees():
            text = stree.text
            tokens = text.split()
            # Make sure it has enough tokens (and is not empty,
            # since the first and last characters are inspected below)
            if not tokens or len(tokens) < MIN_SENT_LENGTH:
                continue
            # Skip sentences containing something in our bag of English words
            wordset = {t.lower() for t in tokens}
            if wordset & ENGLISH_WORDS:
                continue
            # Skip sentences that don't contain enough Icelandic words
            if unicelandic(stree):
                continue
            # Skip uncapitalized sentences
            if text[0].islower():
                continue
            # Skip sentences containing less than 3 word, entity
            # or person tokens combined
            num_content = sum(
                1 for leaf in stree.leaves() if leaf.kind in content_kinds
            )
            if num_content < 3:
                continue
            # Skip sentences with only a single NP -- S0→NP
            if stree.match("S0 > [NP $]"):
                continue
            # Skip sentences not containing a VP
            if not stree.match("S0 >> VP"):
                continue
            # Skip sentences not ending in sentence ending punctuation.
            # (The previous 'not ... not in' double negation discarded
            # exactly the sentences that did end in such punctuation.)
            if text[-1] not in definitions.END_OF_SENTENCE:
                continue
            # Skip sentence if we have seen an equivalent sentence before
            hashnorm = hash(normalize(text))
            if hashnorm in bigset:
                continue
            bigset.add(hashnorm)
            yield stree, tree.score(ix), tree.length(ix), a.uuid, a.url, ix
class Query:

    """ Represents a single query: a token list is parsed with QueryRoot
        as the grammar root nonterminal, and the resulting best parse tree
        is then processed by the nonterminal handlers of this module, which
        record the query type, key, answer or error on the Query object. """

    def __init__(self, session):
        # Session that is passed on to the tree processor
        self._session = session
        # Input and parse state
        self._toklist = None
        self._tree = None
        # Outcome, filled in by the handlers during execution
        self._qtype = None
        self._key = None
        self._answer = None
        self._error = None

    @staticmethod
    def _parse(toklist):
        """ Parse the sentences found in a token list; return a tuple of
            (statistics dict, dict of dumped parse trees keyed by the
            1-based sentence number) """
        dumped = dict()
        total = 0
        parsed = 0
        # The grammar root is the nonterminal 'QueryRoot'
        with Fast_Parser(verbose=False, root=_QUERY_ROOT) as parser:
            reducer = Reducer(parser.grammar)
            current = []
            for tok in toklist:
                kind = tok[0]
                if kind == TOK.S_BEGIN:
                    # A new sentence starts: begin collecting afresh
                    current = []
                elif kind == TOK.S_END:
                    if not current:
                        # Nothing was collected for this sentence
                        continue
                    total += 1
                    combinations = 0
                    try:
                        forest = parser.go(current)
                        if forest is not None:
                            combinations = Fast_Parser.num_combinations(forest)
                            if combinations > 1:
                                # Ambiguous: reduce to the best parse
                                forest = reducer.go(forest)
                    except ParseError:
                        forest = None
                    if combinations > 0:
                        parsed += 1
                        # Keep a text dump of the parse tree
                        dumped[total] = ParseForestDumper.dump_forest(forest)
                elif kind not in (TOK.P_BEGIN, TOK.P_END):
                    # Ordinary token (paragraph markers are ignored)
                    current.append(tok)
        return dict(num_sent=total, num_parsed_sent=parsed), dumped

    def parse(self, toklist, result):
        """ Parse a token list as a query and add the parse statistics to
            the result dict; return True if the query is valid """
        # Forget everything about any previous query
        self._tree = self._error = self._qtype = None
        self._key = self._toklist = None
        stats, trees = Query._parse(toklist)
        if not trees:
            # Nothing could be parsed
            self.set_error("E_NO_TREES")
            return False
        result.update(stats)
        failure = None
        if result["num_sent"] != 1:
            # A query must consist of exactly one sentence
            failure = "E_MULTIPLE_SENTENCES"
        elif result["num_parsed_sent"] != 1:
            # The single sentence did not parse
            failure = "E_NO_PARSE"
        elif 1 not in trees:
            # There is no tree for sentence number 1
            failure = "E_NO_FIRST_SENTENCE"
        if failure is not None:
            self.set_error(failure)
            return False
        # All is well: keep the parsed query as a tree...
        self._tree = Tree()
        self._tree.load("S1\n" + trees[1])
        # ...along with the tokens it was parsed from
        self._toklist = toklist
        return True

    def execute(self):
        """ Process the tree from a preceding successful parse();
            return True if no error was set while doing so """
        if self._tree is None:
            self.set_error("E_QUERY_NOT_PARSED")
            return False
        self._error = self._qtype = None
        # The tree contains a single sentence
        self._tree.process(self._session, _THIS_MODULE, query=self)
        succeeded = self._error is None
        return succeeded

    def set_qtype(self, qtype):
        """ Record the type of the query ('Person', 'Title', 'Company', 'Entity'...) """
        self._qtype = qtype

    def set_answer(self, answer):
        """ Record the answer to the query """
        self._answer = answer

    def set_key(self, key):
        """ Record the query key, i.e. the term or string that the query is
            executed with, for instance a person name in nominative case """
        self._key = key

    def set_error(self, error):
        """ Record an error result """
        self._error = error

    def qtype(self):
        """ The type of the query """
        return self._qtype

    def answer(self):
        """ The answer to the query """
        return self._answer

    def key(self):
        """ The key of the query """
        return self._key

    def token_list(self):
        """ The token list that the query was parsed from """
        return self._toklist

    def error(self):
        """ The error recorded for the query, if any """
        return self._error
class Query:

    """ Represents a single query from a client. The query string is either
        handled directly as plain text by one of the query processor modules,
        or parsed with QueryRoot as the grammar root nonterminal and then
        executed by letting the processors handle the best parse tree.
        The outcome is returned from execute() as a result dictionary. """

    # Singleton parser instance, created by init_class()
    _parser = None
    # Query processor modules, loaded by init_class()
    _processors = []
    # Maps topic lemmas to lists of help text functions
    _help_texts = dict()

    def __init__(self, session, query, voice, auto_uppercase, location, client_id):
        q = self._preprocess_query_string(query)
        self._session = session
        self._query = q if q else ""
        self._location = location
        # The "beautified query" is the string shown in a client user
        # interface. It starts out with an uppercase initial and a trailing
        # question mark, but query processing may alter it later.
        self.set_beautified_query(beautify_query(q))
        self._voice = voice
        self._auto_uppercase = auto_uppercase
        self._error = None
        # Detailed answer (a list or a dict)
        self._response = None
        # The single "best" answer, as displayable text
        self._answer = None
        # A rendition of self._answer suitable for a voice synthesizer
        self._voice_answer = None
        self._tree = None
        self._qtype = None
        self._key = None
        self._toklist = None
        # Expiration timestamp, if any
        self._expires = None
        # URL associated with the query; a query response handler can set
        # it, and it is then provided to the remote client
        self._url = None
        # Client id, if known
        self._client_id = client_id
        # Source of the answer to the query
        self._source = None
        # Query context: None until set via self.set_context().
        # Should be a dict that can be represented in JSON.
        self._context = None

    def _preprocess_query_string(self, q):
        """ Strip ignored prefixes off the query string before analysis """
        if not q:
            return q
        stripped = re.sub(_IGNORED_PREFIX_RE, "", q, flags=re.IGNORECASE)
        # Fall back to the unmodified query string if nothing
        # would remain after the prefixes are removed
        return stripped if stripped else q

    @classmethod
    def init_class(cls):
        """ Set up the class-level singleton data: the list of query
            processor modules, the help texts and the query parser """
        loaded = []
        # The processor modules live in the queries directory
        for modname in sorted(modules_in_dir("queries")):
            try:
                module = importlib.import_module(modname)
                loaded.append(module)
            except ImportError as e:
                logging.error(
                    "Error importing query processor module {0}: {1}".format(
                        modname, e))
        cls._processors = loaded
        # Gather grammar fragments from the processors that handle parse
        # trees, and topic lemmas that allow context-sensitive help texts
        # to be offered when a query cannot be parsed
        fragments = []
        help_texts = defaultdict(list)
        for processor in loaded:
            if getattr(processor, "HANDLE_TREE", None):
                # A tree processor may supply a query grammar fragment
                fragment = getattr(processor, "GRAMMAR", None)
                if fragment and isinstance(fragment, str):
                    fragments.append(fragment)
            topic_lemmas = getattr(processor, "TOPIC_LEMMAS", None)
            if not topic_lemmas:
                continue
            help_text_func = getattr(processor, "help_text", None)
            # A processor that declares topic lemmas
            # is expected to have a help_text function as well
            assert help_text_func is not None
            if help_text_func is not None:
                for lemma in topic_lemmas:
                    help_texts[lemma].append(help_text_func)
        cls._help_texts = help_texts
        # Create the singleton query parser, whose grammar root is the
        # nonterminal 'QueryRoot', from the combined grammar fragments
        cls._parser = QueryParser("\n".join(fragments))

    @staticmethod
    def _parse(toklist):
        """ Parse the sentences found in a token list; return a tuple of
            (statistics dict, dict of dumped parse trees keyed by the
            1-based sentence number) """
        parser = Query._parser
        reducer = Reducer(parser.grammar)
        dumped = dict()
        total = 0
        parsed = 0
        current = []
        for tok in toklist:
            kind = tok[0]
            if kind == TOK.S_BEGIN:
                # A new sentence starts: begin collecting afresh
                current = []
            elif kind == TOK.S_END:
                if not current:
                    # Nothing was collected for this sentence
                    continue
                total += 1
                combinations = 0
                try:
                    forest = parser.go(current)
                    if forest is not None:
                        combinations = Fast_Parser.num_combinations(forest)
                        if combinations > 1:
                            # Ambiguous: reduce to the best parse
                            forest = reducer.go(forest)
                except ParseError:
                    forest = None
                if combinations > 0:
                    parsed += 1
                    # Keep a text dump of the parse tree
                    dumped[total] = ParseForestDumper.dump_forest(forest)
            elif kind not in (TOK.P_BEGIN, TOK.P_END):
                # Ordinary token (paragraph markers are ignored)
                current.append(tok)
        return dict(num_sent=total, num_parsed_sent=parsed), dumped

    def parse(self, result):
        """ Tokenize and parse the query string, adding the parse statistics
            to the result dict; return True if the query is valid """
        # Forget everything about any previous parse
        self._tree = self._error = self._qtype = None
        self._key = self._toklist = None
        q = self._query.strip()
        if not q:
            self.set_error("E_EMPTY_QUERY")
            return False
        # Auto-uppercasing is only applied to all-lowercase queries
        uppercase = self._auto_uppercase and q.islower()
        toklist = list(tokenize(q, auto_uppercase=uppercase))
        actual_q = correct_spaces(" ".join(t.txt for t in toklist if t.txt))
        if actual_q:
            # Uppercase initial, and ending punctuation if missing
            actual_q = actual_q[0].upper() + actual_q[1:]
            if not actual_q.endswith(("?", ".", "!")):
                actual_q += "?"
        # The tokenized string probably has more correct capitalization,
        # so use it as the beautified query string from now on
        self.set_beautified_query(actual_q)
        if Settings.DEBUG:
            # Show the query string as the parser sees it
            print("Query is: '{0}'".format(actual_q))
        stats, trees = Query._parse(toklist)
        if not trees:
            # Nothing could be parsed
            self.set_error("E_NO_PARSE_TREES")
            return False
        result.update(stats)
        failure = None
        if result["num_sent"] != 1:
            # A query must consist of exactly one sentence
            failure = "E_MULTIPLE_SENTENCES"
        elif result["num_parsed_sent"] != 1:
            # The single sentence did not parse
            failure = "E_NO_PARSE"
        elif 1 not in trees:
            # There is no tree for sentence number 1
            failure = "E_NO_FIRST_SENTENCE"
        if failure is not None:
            self.set_error(failure)
            return False
        # All is well: keep the parsed query as a tree...
        tree_string = "S1\n" + trees[1]
        if Settings.DEBUG:
            print(tree_string)
        self._tree = Tree()
        self._tree.load(tree_string)
        # ...along with the tokens it was parsed from
        self._toklist = toklist
        return True

    def execute_from_plain_text(self):
        """ Offer the query to the plain text handlers of the processors,
            so that it need not be parsed; return True if one accepts it """
        if not self._query:
            return False
        for processor in self._processors:
            handler = getattr(processor, "handle_plain_text", None)
            # Processors without a handle_plain_text function are passed over
            if handler is not None and handler(self):
                # This processor took care of the query
                return True
        return False

    def execute_from_tree(self):
        """ Let the tree processors handle the tree from a preceding
            successful parse(); return True if one of them answers """
        if self._tree is None:
            self.set_error("E_QUERY_NOT_PARSED")
            return False
        for processor in self._processors:
            self._error = None
            self._qtype = None
            # Only processors with a truthy HANDLE_TREE
            # attribute are interested in parse trees
            if not getattr(processor, "HANDLE_TREE", None):
                continue
            # The tree contains a single sentence
            self._tree.process(self._session, processor, query=self)
            if self._answer and self._error is None:
                # This processor answered the query
                return True
        # The query went unanswered
        return False

    def last_answer(self, *, within_minutes=5):
        """ Return the (answer, voice) tuple of the newest answer given to
            this client, by default no older than 5 minutes (0=forever) """
        if not self._client_id:
            # Without a client_id, there is nothing to look up
            return None
        # Newest query result for this client
        # that is neither an error nor a repeat
        rows = self._session.query(QueryRow.answer, QueryRow.voice)
        rows = rows.filter(QueryRow.client_id == self._client_id)
        rows = rows.filter(QueryRow.qtype != "Repeat")
        rows = rows.filter(QueryRow.error == None)
        if within_minutes > 0:
            # Restrict to recent enough results
            since = datetime.utcnow() - timedelta(minutes=within_minutes)
            rows = rows.filter(QueryRow.timestamp >= since)
        # Newest first, and only one
        newest = rows.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
        if newest is None:
            return None
        return tuple(newest)

    def fetch_context(self, *, within_minutes=10):
        """ Return the context stored with the newest answer given to this
            client, by default no older than 10 minutes (0=forever) """
        if not self._client_id:
            # Without a client_id, there is nothing to look up
            return None
        # Newest query result for this client
        # that is neither an error nor a repeat
        rows = self._session.query(QueryRow.context)
        rows = rows.filter(QueryRow.client_id == self._client_id)
        rows = rows.filter(QueryRow.qtype != "Repeat")
        rows = rows.filter(QueryRow.error == None)
        if within_minutes > 0:
            # Restrict to recent enough results
            since = datetime.utcnow() - timedelta(minutes=within_minutes)
            rows = rows.filter(QueryRow.timestamp >= since)
        # Newest first, and only one
        newest = rows.order_by(desc(QueryRow.timestamp)).limit(1).one_or_none()
        if newest is None:
            return None
        # Normally a dict that has been decoded from JSON
        return newest[0]

    @property
    def query(self):
        return self._query

    @property
    def query_lower(self):
        return self._query.lower()

    @property
    def beautified_query(self):
        """ The query string that is echoed back to the client """
        return self._beautified_query

    def set_beautified_query(self, q):
        """ Set the query string that is echoed back to the client """
        text = q.replace("embla", "Embla")
        text = text.replace("miðeind", "Miðeind")
        # By presidential request :)
        text = text.replace("Guðni Th ", "Guðni Th. ")
        self._beautified_query = text

    def lowercase_beautified_query(self):
        """ Adjust the beautified query string so that only its initial
            letter is uppercase. To be called when the query is known to
            contain no other uppercase words. """
        self.set_beautified_query(self._beautified_query.capitalize())

    def query_is_command(self):
        """ To be called by a query processor when the
            query is a command rather than a question """
        # A command ends with a period, not a question mark
        shown = self._beautified_query
        if shown.endswith("?"):
            self._beautified_query = shown[:-1] + "."

    @property
    def expires(self):
        """ The expiration time stamp of the query answer, if any """
        return self._expires

    def set_expires(self, ts):
        self._expires = ts

    @property
    def url(self):
        """ The URL answer that goes with the query """
        return self._url

    def set_url(self, u):
        self._url = u

    @property
    def source(self):
        """ The source of the answer to the query """
        return self._source

    def set_source(self, s):
        self._source = s

    @property
    def location(self):
        return self._location

    @property
    def token_list(self):
        return self._toklist

    def set_qtype(self, qtype):
        """ Record the type of the query ('Person', 'Title', 'Company', 'Entity'...) """
        self._qtype = qtype

    def set_answer(self, response, answer, voice_answer=None):
        """ Record the answer to the query, in its three forms """
        # The detailed response (usually a dict)
        self._response = response
        # The single best answer, as a displayable string
        self._answer = answer
        # The single best answer, in a form intended for voice
        self._voice_answer = voice_answer

    def set_key(self, key):
        """ Record the query key, i.e. the term or string that the query is
            executed with, for instance a person name in nominative case """
        self._key = key

    def set_error(self, error):
        """ Record an error result """
        self._error = error

    def qtype(self):
        """ The type of the query """
        return self._qtype

    @property
    def is_voice(self):
        """ True if this is a voice query """
        return self._voice

    def response(self):
        """ The detailed answer to the query """
        return self._response

    def answer(self):
        """ The 'single best' answer to the query, in displayable form """
        return self._answer

    def voice_answer(self):
        """ The voice version of the 'single best' answer, if any """
        return self._voice_answer

    def key(self):
        """ The key of the query """
        return self._key

    def error(self):
        """ The error recorded for the query, if any """
        return self._error

    def set_context(self, ctx):
        """ Set a query context, to be stored and made available
            to the next query from the same client """
        self._context = ctx

    @property
    def context(self):
        """ The context that was set through self.set_context() """
        return self._context

    @classmethod
    def try_to_help(cls, query, result):
        """ For a failed query, try to offer the user a help text that is
            selected by the lemmas of the words in the query string """
        # Gather the lemmas of the words in the query string
        lemmas = set()
        with BIN_Db.get_db() as db:
            for token in query.lower().split():
                if not token.isalpha():
                    continue
                # If the lowercase form is unknown, try a capitalized
                # version, just in case (pun intended)
                meanings = db.meanings(token) or db.meanings(token.capitalize())
                if meanings:
                    lemmas.update(mm.stofn.lower() for mm in meanings)
        # Pair each lemma with the help text functions
        # that the query modules have registered for it
        candidates = []
        for lemma in lemmas:
            for help_text_func in cls._help_texts.get(lemma, []):
                candidates.append((lemma, help_text_func))
        if not candidates:
            return
        # Pick one of the candidates at random and call the
        # function with the lemma that it was found through
        lemma, help_text_func = random.choice(candidates)
        result["answer"] = result["voice"] = help_text_func(lemma)
        result["valid"] = True

    def execute(self):
        """ Execute the query, first as plain text and then, failing that,
            by parsing it and processing the parse tree; return the result
            dictionary, whose 'valid' entry tells whether the query parsed """
        if Query._parser is None:
            Query.init_class()
        # The result starts out with the 'raw' query string (as returned
        # from the speech-to-text processor) and its beautified version,
        # which usually has an uppercase initial and a trailing question
        # mark (or other ending punctuation)
        result = dict(q_raw=self.query, q=self.beautified_query)
        # A plain text handler may provide a successful
        # response right away, without any parsing
        if not self.execute_from_plain_text():
            if not self.parse(result):
                # The query cannot be parsed
                if Settings.DEBUG:
                    print("Unable to parse query, error {0}".format(
                        self.error()))
                result["error"] = self.error()
                result["valid"] = False
                return result
            if not self.execute_from_tree():
                # A proper query, but it could not
                # be executed: report the error
                result["error"] = self.error() or "E_UNABLE_TO_EXECUTE_QUERY"
                result["valid"] = True
                return result
        # The query succeeded: assemble the response
        if self._answer:
            result["answer"] = self._answer
        if self._voice and self._voice_answer:
            # A voice query that has a voice answer
            result["voice"] = self._voice_answer
        if self._voice:
            # Voice queries get a compact response, without details
            # about alternative answers or their sources
            result["response"] = dict(answer=self._answer or "")
        else:
            # Other queries get the detailed response
            result["response"] = self._response
        # The query processor may have modified the beautified query string
        result["q"] = self.beautified_query
        # The query type, as a string ('Person', 'Entity', 'Title' etc.)
        qt = self.qtype()
        result["qtype"] = qt
        # The key that the answer was retrieved with, if any
        result["key"] = self.key()
        # A URL, if the query processor has set one
        if self.url:
            result["open_url"] = self.url
        # The source, if the query processor has set one
        if self.source:
            result["source"] = self.source
        if not self._voice and qt == "Person":
            # Person queries come with an image, if one is available
            img = get_image_url(self.key(), enclosing_session=self._session)
            if img is not None:
                result["image"] = {
                    "src": img.src,
                    "width": img.width,
                    "height": img.height,
                    "link": img.link,
                    "origin": img.origin,
                    "name": img.name,
                }
        result["valid"] = True
        if Settings.DEBUG:
            # Show the query results on the console
            def converter(o):
                """ Output datetime objects in ISO format to JSON """
                if isinstance(o, datetime):
                    return o.isoformat()[0:16]
                return None

            print("{0}".format(
                json.dumps(result, indent=3, ensure_ascii=False, default=converter)))
        return result