def news(): """ Handler for a page with a top news list """ topic = request.args.get("topic") try: offset = max(0, int(request.args.get("offset", 0))) limit = max(0, int(request.args.get("limit", _TOP_NEWS_LENGTH))) except: offset = 0 limit = _TOP_NEWS_LENGTH limit = min(limit, 100) # Cap at max 100 results per page articles = top_news(topic=topic, offset=offset, limit=limit) # If all articles in the list are timestamped within 24 hours of now, # we display their times in HH:MM format. Otherwise, we display date. display_time = True if articles and (datetime.utcnow() - articles[-1].timestamp).days >= 1: display_time = False # Fetch the topics with SessionContext(commit=True) as session: q = session.query(Topic.identifier, Topic.name).order_by(Topic.name).all() d = {t[0]: t[1] for t in q} topics = dict(id=topic, name=d.get(topic, ""), topic_list=q) return render_template( "news.html", articles=articles, topics=topics, display_time=display_time, offset=offset, limit=limit, )
def fetch_url_html(cls, url, enclosing_session=None):
    """ Fetch a URL using the scraping mechanism, returning
        a tuple (html, metadata, helper), or (None, None, None) on error """
    with SessionContext(enclosing_session) as session:
        helper = cls.helper_for(session, url)
        if helper is None or not hasattr(helper, "fetch_url"):
            # Do a straight HTTP fetch
            html_doc = cls._fetch_url(url)
        else:
            # Hand off to the helper
            html_doc = helper.fetch_url(url)
        if not html_doc:
            return (None, None, None)
        # Parse the HTML
        soup = Fetcher.make_soup(html_doc, helper)
        if soup is None:
            print("Fetcher.fetch_url_html(): No soup")
            return (None, None, None)
        # Obtain the metadata from the resulting soup
        metadata = helper.get_metadata(soup) if helper else None
        return (html_doc, metadata, helper)

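# A minimal usage sketch for fetch_url_html() above; the URL is hypothetical.
# Per the docstring, all three tuple elements are None if the fetch fails.
html, metadata, helper = Fetcher.fetch_url_html("https://example.com/article")
if html is not None:
    print("Fetched {0} characters of HTML".format(len(html)))
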
def token_stream(limit=None, skip_errors=True):
    """ Generator of a token stream consisting of `limit` sentences
        (or fewer) from the most recently parsed articles.
        After each sentence, None is yielded. """
    with SessionContext(commit=True, read_only=True) as session:
        q = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(ArticleRow.parsed))
            .yield_per(200)
        )
        count = 0
        for a in q:
            doc = json.loads(a.tokens)
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    # Yield the tokens
                    for t in sent:
                        yield t
                    yield None  # End-of-sentence marker
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return

def sentence_stream(limit=None, skip=None, skip_errors=True):
    """ Generator of a sentence stream consisting of `limit` sentences
        (or fewer) from the most recently parsed articles.
        Each sentence is a list of token dicts. """
    with SessionContext(commit=True, read_only=True) as session:
        q = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(ArticleRow.parsed))
            .yield_per(200)
        )
        count = 0
        skipped = 0
        for a in q:
            doc = json.loads(a.tokens)
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    if skip is not None and skipped < skip:
                        # If requested, skip sentences from the front
                        # (useful for generating a test set)
                        skipped += 1
                        continue
                    # Yield the sentence as a fresh token list
                    yield [t for t in sent]
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return

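# A minimal sketch of consuming sentence_stream() above: compute the average
# length of the 1000 most recent error-free sentences. Everything here follows
# directly from the generator's contract; nothing else is assumed.
sentences = list(sentence_stream(limit=1000))
if sentences:
    avg = sum(len(sent) for sent in sentences) / len(sentences)
    print("Average sentence length: {0:.1f} tokens".format(avg))
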
def __iter__(self):
    """ Iterate through articles (documents) """
    print("Starting iteration through corpus from words table")
    if self._dictionary is not None:
        xform = lambda x: self._dictionary.doc2bow(x)
    else:
        xform = lambda x: x
    with SessionContext(commit=True) as session:
        # Fetch bags of words sorted by articles
        q = (
            session.query(Word.article_id, Word.stem, Word.cat, Word.cnt)
            .order_by(Word.article_id)
            .yield_per(2000)
        )
        bag = []
        last_uuid = None
        for uuid, stem, cat, cnt in q:
            if uuid != last_uuid:
                if bag:
                    # Finishing the last article: yield its bag
                    yield xform(bag)
                    bag = []
                # Beginning a new article with an empty bag
                last_uuid = uuid
            # Map the (stem, category) pair to a single corpus word token
            w = w_from_stem(stem, cat)
            if cnt == 1:
                bag.append(w)
            else:
                bag.extend([w] * cnt)
        if last_uuid is not None:
            yield xform(bag)
    print("Finished iteration through corpus from words table")

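# w_from_stem() is called above but not defined in this excerpt. A plausible
# implementation, mirroring the inline word-key construction in
# assign_article_topics() further below (lowercase the stem, replace spaces
# with underscores, append the category):
def w_from_stem(stem, cat):
    """ Map a (stem, category) pair to a single corpus word token """
    return stem.lower().replace(" ", "_") + "/" + cat
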
def dump_tokens(limit):
    """ Iterate through parsed articles and print a list
        of tokens and their matched terminals """
    dtd = dict()
    with closing(BIN_Db.get_db()) as db:
        with SessionContext(commit=True) as session:
            # Iterate through the articles
            q = (
                session.query(Article)
                .filter(Article.tree != None)
                .order_by(Article.timestamp)
            )
            if limit is None:
                q = q.all()
            else:
                q = q[0:limit]
            for a in q:
                print(
                    "\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}"
                    .format(a)
                )
                tree = TreeTokenList()
                tree.load(a.tree)
                for ix, toklist in tree.sentences():
                    print("\nSentence {0}:".format(ix))
                    at_start = True
                    for t in toklist:
                        if t.tokentype == "WORD":
                            wrd = t.token[1:-1]
                            td = dtd.get(t.terminal)
                            if td is None:
                                td = TerminalDescriptor(t.terminal)
                                dtd[t.terminal] = td
                            stem = td.stem(db, wrd, at_start)
                            at_start = False
                            print("   {0} {1} {2}".format(wrd, stem, t.terminal))
                        else:
                            print("   {0.token} {0.cat} {0.terminal}".format(t))

def refresh_topics(self):
    """ Load any new article topics into the _atopics dict """
    with self._lock:
        with SessionContext(commit=True, read_only=True) as session:
            # Do the next refresh from this time point
            ts = datetime.utcnow()
            q = (
                session.query(Article)
                .join(Root)
                .filter(Root.visible)
                .filter(Article.indexed >= self._timestamp)
                .with_entities(Article.id, Article.topic_vector)
            )
            self._timestamp = ts
            count = 0
            for a in q.yield_per(100):
                if a.topic_vector:
                    # Load the topic vector into a numpy array
                    vec = json.loads(a.topic_vector)
                    if isinstance(vec, list) and len(vec) == self._corpus.dimensions:
                        self._atopics[a.id] = np.array(vec)
                        count += 1
                    else:
                        print("Warning: faulty topic vector for article {0}".format(a.id))
            print("Completed refresh_topics, {0} article vectors added".format(count))

def reparse_api(version=1):
    """ Reparse an already parsed and stored article with a given UUID """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")
    uuid = request.form.get("id", "").strip()[0:_MAX_UUID_LENGTH]
    tokens = None
    register = {}
    stats = {}
    with SessionContext(commit=True) as session:
        # Load the article
        a = ArticleProxy.load_from_uuid(uuid, session)
        if a is not None:
            # Found: parse it (with a fresh parser) and store the updated version
            a.parse(session, verbose=True, reload_parser=True)
            # Save the tokens
            tokens = a.tokens
            # Build a register of person names
            register = a.create_register(session)
            stats = dict(
                num_tokens=a.num_tokens,
                num_sentences=a.num_sentences,
                num_parsed=a.num_parsed,
                ambiguity=a.ambiguity,
            )
    # Return the tokens as a JSON structure to the client,
    # along with a name register and article statistics
    return better_jsonify(valid=True, result=tokens, register=register, stats=stats)

def news(): """ Handler for a page with a top news list """ topic = request.args.get("topic") start = request.args.get("start") if start is not None: try: if '.' in start: # Assume full timestamp with microseconds start = datetime.strptime(start, "%Y-%m-%dT%H:%M:%S.%f") else: # Compact timestamp start = datetime.strptime(start, "%Y-%m-%dT%H:%M:%S") except ValueError: start = None articles = top_news(topic=topic, start=start) now = datetime.utcnow() # If all articles in the list are timestamped within 24 hours of now, # we display their times in HH:MM format. Otherwise, we display their # dates in YYYY-MM-DD format. display_time = True if articles and (now - articles[-1].timestamp).days >= 1: display_time = False # Fetch the topics with SessionContext(commit=True) as session: q = session.query(Topic.identifier, Topic.name).order_by(Topic.name).all() d = {t[0]: t[1] for t in q} topics = dict(identifier=topic, name=d.get(topic, ""), topic_list=q) return render_template("news.html", articles=articles, topics=topics, display_time=display_time)
def _load_topics(self):
    """ Load all article topics into the self._atopics dictionary """
    self._atopics = {}
    with SessionContext(commit=True, read_only=True) as session:
        print("Starting load of all article topic vectors")
        t0 = time.time()
        # Do the next refresh from this time point
        self._timestamp = datetime.utcnow()
        q = (
            session.query(Article)
            .join(Root)
            .filter(Root.visible)
            .with_entities(Article.id, Article.topic_vector)
        )
        for a in q.yield_per(2000):
            if a.topic_vector:
                # Load the topic vector into a numpy array
                vec = json.loads(a.topic_vector)
                if isinstance(vec, list) and len(vec) == self._corpus.dimensions:
                    self._atopics[a.id] = np.array(vec)
                else:
                    print("Warning: faulty topic vector for article {0}".format(a.id))
        t1 = time.time()
        print(
            "Loading of {0} topic vectors completed in {1:.2f} seconds"
            .format(len(self._atopics), t1 - t0)
        )

def query(): """ Respond to a query string """ q = request.form.get("q", "").strip()[0:_MAX_QUERY_LENGTH] # Auto-uppercasing can be turned off by sending autouppercase: false in the query JSON auto_uppercase = get_json_bool(request, "autouppercase", True) result = dict() with SessionContext(commit=True) as session: toklist = list( tokenize(q, enclosing_session=session, auto_uppercase=q.islower() if auto_uppercase else False)) actual_q = correct_spaces(" ".join(t.txt or "" for t in toklist)) if Settings.DEBUG: # Log the query string as seen by the parser print("Query is: '{0}'".format(actual_q)) # Try to parse and process as a query is_query = process_query(session, toklist, result) result["is_query"] = is_query result["q"] = actual_q return jsonify(result=result)
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    bindb = BIN_Db.get_db()
    with SessionContext(commit=True) as session:
        q = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[0:limit * 2]  # Go through up to 2 * N records
        )
        for p in q:
            # Insert the name into the list if it's not already there,
            # or if the new title is longer than the previous one
            if p.name not in toplist or len(p.title) > len(toplist[p.name][0]):
                toplist[p.name] = (
                    correct_spaces(p.title),
                    p.article_url,
                    p.id,
                    bindb.lookup_name_gender(p.name),
                )
                if len(toplist) >= limit:
                    # We now have as many names as we initially wanted: terminate the loop
                    break
    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )

def _get_cached_entry(name, url, enclosing_session=None):
    """ Fetch a cached entry by key and URL """
    with SessionContext(commit=True, session=enclosing_session) as session:
        # TODO: convert the content column from varchar to jsonb
        # to enable faster and smarter queries
        return (
            session.query(Link)
            .filter(Link.key == name)
            .filter(Link.content.like("%" + url + "%"))
            .one_or_none()
        )

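# A hedged usage sketch for _get_cached_entry() above; the key and URL are
# hypothetical. The result is a Link row, or None if nothing matches.
entry = _get_cached_entry("Jon Jonsson", "https://example.com/image.jpg")
if entry is None:
    print("No cached entry for this key/URL combination")
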
def fetch_article(cls, url, enclosing_session=None):
    """ Fetch a previously scraped article, returning a tuple
        (article, metadata, content), or (None, None, None) on error """
    with SessionContext(enclosing_session) as session:
        article = cls.find_article(url, session)
        if article is None:
            return (None, None, None)
        html_doc = article.html
        if not html_doc:
            return (None, None, None)
        helper = cls.helper_for(session, url)
        # Parse the HTML
        soup = Fetcher.make_soup(html_doc, helper)
        if soup is None:
            print("Fetcher.fetch_article(): No soup")
            return (None, None, None)
        # Obtain the metadata and the content from the resulting soup
        metadata = helper.get_metadata(soup) if helper else None
        content = helper.get_content(soup) if helper else soup.html.body
        return (article, metadata, content)

def fetch_url(cls, url, enclosing_session=None):
    """ Fetch a URL using the scraping mechanism, returning
        a tuple (metadata, content) or None if error """
    with SessionContext(enclosing_session) as session:
        helper = cls.helper_for(session, url)
        if helper is None or not hasattr(helper, "fetch_url"):
            # Do a straight HTTP fetch
            html_doc = cls.raw_fetch_url(url)
        else:
            # Hand off to the helper
            html_doc = helper.fetch_url(url)
        if not html_doc:
            return None
        # Parse the HTML
        soup = Fetcher.make_soup(html_doc, helper)
        if soup is None:
            print("Fetcher.fetch_url({0}): No soup or no soup.html".format(url))
            return None
        # Obtain the metadata and the content from the resulting soup
        metadata = helper.get_metadata(soup) if helper else None
        content = helper.get_content(soup) if helper else soup.html.body
        return (metadata, content)

def parse_api(version=1):
    """ API to parse text and return POS-tagged tokens in JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")
    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")
    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.parse_text(session, text, all_names=True)
        # In this case, we should always get a single paragraph back
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: concatenate them all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa
    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)

def _blacklisted_urls_for_key(key, enclosing_session=None):
    """ Fetch blacklisted URLs for a given key """
    with SessionContext(commit=True, session=enclosing_session) as session:
        q = (
            session.query(BlacklistedLink.url)
            .filter(BlacklistedLink.link_type == "image")
            .filter(BlacklistedLink.key == key)
            .all()
        )
        return [r for (r,) in q]

def find_article(cls, url, enclosing_session=None):
    """ Return a scraped article object, if found, else None """
    article = None
    with SessionContext(enclosing_session, commit=True) as session:
        article = (
            session.query(ArticleRow)
            .filter_by(url=url)
            .filter(ArticleRow.scraped != None)
            .one_or_none()
        )
    return article

def reparse(): """ Reparse an already parsed and stored article with a given UUID """ uuid = request.form.get("id", "").strip()[0:_MAX_UUID_LENGTH] tokens = None register = {} stats = {} with SessionContext(commit=True) as session: # Load the article a = ArticleProxy.load_from_uuid(uuid, session) if a is not None: # Found: Parse it (with a fresh parser) and store the updated version a.parse(session, verbose=True, reload_parser=True) # Save the tokens tokens = a.tokens # Build register of person names for name in a.person_names(): add_name_to_register(name, register, session) # Add register of entity names for name in a.entity_names(): add_entity_to_register(name, register, session) stats = dict(num_tokens=a.num_tokens, num_sentences=a.num_sentences, num_parsed=a.num_parsed, ambiguity=a.ambiguity) # Return the tokens as a JSON structure to the client, # along with a name register and article statistics return jsonify(result=tokens, register=register, stats=stats)
def _purge_single(key, ctype=None, enclosing_session=None):
    """ Remove a single cache entry, optionally filtered by content type """
    with SessionContext(commit=True, session=enclosing_session) as session:
        filters = [Link.key == key]
        if ctype:
            filters.append(Link.ctype == ctype)
        session.query(Link).filter(*filters).delete()

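# A hedged usage sketch for _purge_single() above: remove the cached image
# entry for a hypothetical key, reusing an existing session so the deletion
# joins the caller's transaction.
with SessionContext(commit=True) as session:
    _purge_single("Jon Jonsson", ctype="image", enclosing_session=session)
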
def assign_article_topics(self, article_id, heading):
    """ Assign the appropriate topics to the given article in the database """
    if self._dictionary is None:
        self.load_dictionary()
    if self._tfidf is None:
        self.load_tfidf_model()
    if self._model is None:
        self.load_lda_model()
    if self._topics is None:
        self.load_topics()
    with SessionContext(commit=True) as session:
        q = (
            session.query(Word.stem, Word.cat, Word.cnt)
            .filter(Word.article_id == article_id)
            .all()
        )
        wlist = []
        for stem, cat, cnt in q:
            # Convert the stem to lowercase and replace spaces with underscores
            w = stem.lower().replace(" ", "_") + "/" + cat
            if cnt == 1:
                wlist.append(w)
            else:
                wlist.extend([w] * cnt)
        topics = []
        if self._topics and wlist:
            bag = self._dictionary.doc2bow(wlist)
            tfidf = self._tfidf[bag]
            article_vector = self._model[tfidf]
            topic_names = []
            if self._verbose:
                print("{0} : {1}".format(article_id, heading))
            for topic_id, topic_info in self._topics.items():
                topic_name = topic_info["name"]
                topic_vector = topic_info["vector"]
                topic_threshold = topic_info["threshold"]
                # Calculate the cosine similarity between the article and the topic
                similarity = matutils.cossim(article_vector, topic_vector)
                if self._verbose:
                    print("   Similarity to topic {0} is {1:.3f}".format(topic_name, similarity))
                if similarity >= topic_threshold:
                    # Similar enough: this is a topic of the article
                    topics.append(topic_id)
                    topic_names.append((topic_name, similarity))
            if topic_names:
                print("Article '{0}': topics {1}".format(heading, topic_names))
        # Topics found (if any): delete previous ones (if any)
        session.execute(
            ArticleTopic.table().delete().where(ArticleTopic.article_id == article_id)
        )
        # ...and add the new ones
        for topic_id in topics:
            session.add(ArticleTopic(article_id=article_id, topic_id=topic_id))
        # Update the indexed timestamp
        a = session.query(Article).filter(Article.id == article_id).one_or_none()
        if a:
            a.indexed = datetime.utcnow()

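# An illustration of the similarity test above: gensim's matutils.cossim()
# takes two sparse vectors as (id, weight) pairs and returns their cosine
# similarity. The weights below are made up for the example.
from gensim import matutils
article_vec = [(0, 0.8), (3, 0.6)]
topic_vec = [(0, 0.7), (2, 0.1), (3, 0.7)]
print("{0:.3f}".format(matutils.cossim(article_vec, topic_vec)))
# A result at or above the topic's threshold assigns the topic to the article
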
def load_from_url(cls, url, enclosing_session=None):
    """ Load or scrape an article, given its URL """
    with SessionContext(enclosing_session) as session:
        ar = session.query(ArticleRow).filter(ArticleRow.url == url).one_or_none()
        if ar is not None:
            return cls._init_from_row(ar)
        # Not found in database: attempt to fetch
        return cls._init_from_scrape(url, session)

def load_topics(self):
    """ Load the topics into a dict of topic vectors by topic id """
    self._topics = {}
    with SessionContext(commit=True) as session:
        for topic in session.query(Topic).all():
            if topic.vector:
                topic_vector = json.loads(topic.vector)[self._model_name]
                if topic_vector:
                    self._topics[topic.id] = dict(
                        name=topic.name,
                        vector=topic_vector,
                        threshold=topic.threshold,
                    )

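# A hedged illustration of the Topic.vector storage format implied above:
# a JSON object keyed by model name, mapping to a sparse topic vector.
# The key name and weights here are made up for the example.
import json
sample = json.dumps({"lda_model": [[0, 0.12], [5, 0.4]]})
print(json.loads(sample)["lda_model"])  # -> [[0, 0.12], [5, 0.4]]
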
def load_from_uuid(cls, uuid, enclosing_session=None):
    """ Load an article, given its UUID """
    with SessionContext(enclosing_session) as session:
        try:
            ar = session.query(ArticleRow).filter(ArticleRow.id == uuid).one_or_none()
        except DataError:
            # Probably wrong UUID format
            ar = None
        return None if ar is None else cls._init_from_row(ar)

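# A minimal usage sketch for load_from_uuid() above, called through the
# ArticleProxy class as in the route handlers elsewhere in this excerpt;
# the UUID is hypothetical. A malformed UUID raises DataError internally
# and simply yields None.
a = ArticleProxy.load_from_uuid("00000000-0000-0000-0000-000000000000")
print("Found" if a is not None else "Not found or malformed UUID")
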
def scrape_from_url(cls, url, enclosing_session=None):
    """ Force fetch of an article, given its URL """
    with SessionContext(enclosing_session) as session:
        ar = session.query(ArticleRow).filter(ArticleRow.url == url).one_or_none()
        a = cls._init_from_scrape(url, session)
        if a is not None and ar is not None:
            # This article already existed in the database, so note its UUID
            a._uuid = ar.id
        return a

def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news """
    toplist = dict()
    MAX_TITLE_LENGTH = 64
    with SessionContext(commit=True) as session:
        q = (
            session.query(Person.name, Person.title, Person.article_url, Article.id)
            .join(Article)
            .join(Root)
            .filter(Root.visible)
            .order_by(desc(Article.timestamp))[0:limit * 2]  # Go through up to 2 * N records
        )

        def is_better_title(new_title, old_title):
            len_new = len(new_title)
            len_old = len(old_title)
            if len_old >= MAX_TITLE_LENGTH:
                # Too long: we want a shorter one
                return len_new < len_old
            if len_new >= MAX_TITLE_LENGTH:
                # This one is too long: we don't want it
                return False
            # Otherwise, longer is better
            return len_new > len_old

        with BIN_Db.get_db() as bindb:
            for p in q:
                # Insert the name into the list if it's not already there,
                # or if the new title is better than the previous one
                if p.name not in toplist or is_better_title(p.title, toplist[p.name][0]):
                    toplist[p.name] = (
                        correct_spaces(p.title),
                        p.article_url,
                        p.id,
                        bindb.lookup_name_gender(p.name),
                    )
                    if len(toplist) >= limit:
                        # We now have as many names as we initially wanted: terminate the loop
                        break
    with changedlocale() as strxfrm:
        # Convert the dictionary to a sorted list of dicts
        return sorted(
            [
                dict(name=name, title=tu[0], gender=tu[3], url=tu[1], uuid=tu[2])
                for name, tu in toplist.items()
            ],
            key=lambda x: strxfrm(x["name"]),
        )

def page(): """ Handler for a page displaying the parse of an arbitrary web page by URL or an already scraped article by UUID """ url = request.args.get("url", None) uuid = request.args.get("id", None) if url: url = url.strip()[0:_MAX_URL_LENGTH] if uuid: uuid = uuid.strip()[0:_MAX_UUID_LENGTH] if url: # URL has priority, if both are specified uuid = None if not url and not uuid: # !!! TODO: Separate error page return redirect(url_for('main')) with SessionContext(commit=True) as session: if uuid: a = ArticleProxy.load_from_uuid(uuid, session) elif url.startswith("http:") or url.startswith("https:"): # a = ArticleProxy.load_from_url(url, session) a = ArticleProxy.scrape_from_url(url, session) # Forces a new scrape else: a = None if a is None: # !!! TODO: Separate error page return redirect(url_for('main')) # Prepare the article for display (may cause it to be parsed and stored) a.prepare(session, verbose=True, reload_parser=True) register = {} # Build register of person names for name in a.person_names(): add_name_to_register(name, register, session) # Add register of entity names for name in a.entity_names(): add_entity_to_register(name, register, session) # Fetch names of article topics, if any topics = session.query(ArticleTopic) \ .filter(ArticleTopic.article_id == a.uuid).all() topics = [ dict(name=t.topic.name, identifier=t.topic.identifier) for t in topics ] return render_template("page.html", article=a, register=register, topics=topics)
def make_trigrams(limit):
    """ Iterate through parsed articles and extract trigrams from
        successfully parsed sentences """
    with SessionContext(commit=True) as session:
        # Delete existing trigrams
        Trigram.delete_all(session)
        # Iterate through the articles
        q = (
            session.query(Article.url, Article.timestamp, Article.tree)
            .filter(Article.tree != None)
            .order_by(Article.timestamp)
        )
        if limit is None:
            q = q.yield_per(200)
        else:
            q = q[0:limit]

        def tokens(q):
            """ Generator for token stream """
            for a in q:
                print("Processing article from {0.timestamp}: {0.url}".format(a))
                tree = TreeTokenList()
                tree.load(a.tree)
                for ix, toklist in tree.sentences():
                    if toklist:
                        # For each sentence, start and end with empty strings
                        yield ""
                        yield ""
                        for t in toklist:
                            yield t.token[1:-1]
                        yield ""
                        yield ""

        def trigrams(iterable):
            """ Generate tuples of three consecutive tokens """
            return zip(*((islice(seq, i, None) for i, seq in enumerate(tee(iterable, 3)))))

        FLUSH_THRESHOLD = 0  # Flushing disabled; set to e.g. 200 to flush every 200 records
        cnt = 0
        for tg in trigrams(tokens(q)):
            # print("{0}".format(tg))
            if any(w for w in tg):
                try:
                    Trigram.upsert(session, *tg)
                    cnt += 1
                    if cnt == FLUSH_THRESHOLD:
                        session.flush()
                        cnt = 0
                except DatabaseError as ex:
                    print("*** Exception {0} on trigram {1}, skipped".format(ex, tg))

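# A demonstration of the trigrams() sliding window used above: tee() makes
# three independent copies of the token iterator, islice() offsets them by
# 0, 1 and 2 positions, and zip() recombines them into consecutive 3-tuples.
from itertools import islice, tee

def trigrams_demo(iterable):
    return zip(*(islice(seq, i, None) for i, seq in enumerate(tee(iterable, 3))))

print(list(trigrams_demo(["", "", "Halló", "heimur", "", ""])))
# -> [('', '', 'Halló'), ('', 'Halló', 'heimur'),
#     ('Halló', 'heimur', ''), ('heimur', '', '')]
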
def parse(self, enclosing_session=None, verbose=False, reload_parser=False):
    """ Force a parse of the article """
    with SessionContext(enclosing_session, commit=True) as session:
        if reload_parser:
            # We need a parse: make sure we're using the newest grammar
            self.reload_parser()
        self._parse(session, verbose=verbose)
        if self._tree is not None or self._tokens is not None:
            # Store the updated article in the database
            self.store(session)

def page(): """ Handler for a page displaying the parse of an arbitrary web page by URL or an already scraped article by UUID """ url = request.args.get("url", None) uuid = request.args.get("id", None) if url: url = url.strip()[0:_MAX_URL_LENGTH] if uuid: uuid = uuid.strip()[0:_MAX_UUID_LENGTH] if url: # URL has priority, if both are specified uuid = None if not url and not uuid: # !!! TODO: Separate error page return redirect(url_for("main")) with SessionContext(commit=True) as session: if uuid: a = ArticleProxy.load_from_uuid(uuid, session) elif url.startswith("http:") or url.startswith("https:"): # a = ArticleProxy.load_from_url(url, session) a = ArticleProxy.scrape_from_url(url, session) # Forces a new scrape else: a = None if a is None: # !!! TODO: Separate error page return redirect(url_for("main")) # Prepare the article for display (may cause it to be parsed and stored) a.prepare(session, verbose=True, reload_parser=True) register = a.create_register(session) # Fetch names of article topics, if any topics = (session.query(ArticleTopic).filter( ArticleTopic.article_id == a.uuid).all()) topics = [ dict(name=t.topic.name, id=t.topic.identifier) for t in topics ] # Fetch similar (related) articles, if any DISPLAY = 10 # Display at most 10 matches similar = Search.list_similar_to_article(session, a.uuid, n=DISPLAY) return render_template("page.html", article=a, register=register, topics=topics, similar=similar)