def all_matches(
    cls,
    criteria: Mapping[str, Any],
    pattern: str,
    enclosing_session: Optional[Session] = None,
) -> Iterator[Tuple["Article", int, SimpleTree]]:
    """ Generator of SimpleTree objects (see matcher.py) from articles
        matching the given criteria and the pattern """
    with SessionContext(
        commit=True, read_only=True, session=enclosing_session
    ) as session:
        # t0 = time.time()
        mcnt = acnt = tcnt = 0
        # print("Starting article loop")
        for a in cls.articles(criteria, enclosing_session=session):
            if a.tree is None:
                continue
            acnt += 1
            tree = Tree(url=a.url or "", authority=a.authority)
            tree.load(a.tree)
            for ix, simple_tree in tree.simple_trees():
                tcnt += 1
                for match in simple_tree.all_matches(pattern):
                    yield (a, ix, match)
                    mcnt += 1
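
# Usage sketch (not from the original source): stream matches of a simple-tree
# matcher pattern over parsed articles. This assumes all_matches() is exposed as
# a classmethod of Article, that an empty criteria dict selects all articles, and
# that a bare "NP-SUBJ" pattern and the SimpleTree.text property behave as in
# matcher.py; all three are assumptions, not confirmed by this excerpt.
def print_subject_matches(max_matches: int = 10) -> None:
    shown = 0
    for article, sent_ix, subtree in Article.all_matches({}, "NP-SUBJ"):
        print("{0} [sentence {1}]: {2}".format(article.url, sent_ix, subtree.text))
        shown += 1
        if shown >= max_matches:
            break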
def token_stream(limit=None, skip_errors=True):
    """ Generator of a token stream consisting of `limit` sentences
        (or fewer) from the most recently parsed articles.
        After each sentence, None is yielded. """
    with SessionContext(commit=True, read_only=True) as session:
        q = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(ArticleRow.parsed))
            .yield_per(200)
        )
        count = 0
        for a in q:
            doc = json.loads(a.tokens)
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    for t in sent:
                        # Yield the tokens
                        yield t
                    yield None  # End-of-sentence marker
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return
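
# Usage sketch (not from the original source): consume the token stream and
# count sentences by watching for the None end-of-sentence markers. The function
# name and the averaging logic are illustrative only.
def average_sentence_length(num_sentences: int = 100) -> float:
    token_count = 0
    sentence_count = 0
    for t in token_stream(limit=num_sentences):
        if t is None:
            sentence_count += 1
        else:
            token_count += 1
    return token_count / sentence_count if sentence_count else 0.0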
def fetch_url_html(cls, url, enclosing_session=None):
    """ Fetch a URL using the scraping mechanism, returning a tuple
        (html, metadata, helper), or (None, None, None) on error """
    with SessionContext(enclosing_session) as session:
        helper = cls.helper_for(session, url)
        if helper is None or not hasattr(helper, "fetch_url"):
            # Do a straight HTTP fetch
            html_doc = cls.raw_fetch_url(url)
        else:
            # Hand off to the helper
            html_doc = helper.fetch_url(url)
        if not html_doc:
            return (None, None, None)
        # Parse the HTML
        soup = Fetcher.make_soup(html_doc, helper)
        if soup is None:
            logging.warning("Fetcher.fetch_url_html({0}): No soup".format(url))
            return (None, None, None)
        # Obtain the metadata from the resulting soup
        metadata = helper.get_metadata(soup) if helper else None
        return (html_doc, metadata, helper)
def main():
    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        quit()

    with SessionContext(commit=True) as session:
        # Zero sentences
        print("Deleting all articles with zero sentences")
        res = session.execute(
            ArticleModel.table().delete().where(ArticleModel.num_sentences == 0)
        )
        print(str(res.rowcount) + " articles deleted")

        # Non-Icelandic
        # TODO: Implement me!

        # Duplicates
        # For each https article, check whether there is a corresponding
        # article URL with http URI scheme
        dupl = 0
        q = session.query(ArticleModel.url).filter(ArticleModel.url.like("https://%"))
        for r in q.all():
            url = re.sub(r"^https://", r"http://", r.url)
            # c = session.query(ArticleModel.url).filter(ArticleModel.url == url).count()
            res = session.execute(
                ArticleModel.table().delete().where(ArticleModel.url == url)
            )
            dupl += res.rowcount
        print("{0} duplicate URLs w. HTTP scheme removed".format(dupl))
def fetch_url(cls, url, enclosing_session=None):
    """ Fetch a URL using the scraping mechanism, returning a tuple
        (metadata, content) or None if error """
    with SessionContext(enclosing_session) as session:
        helper = cls.helper_for(session, url)
        if helper is None or not hasattr(helper, "fetch_url"):
            # Do a straight HTTP fetch
            html_doc = cls.raw_fetch_url(url)
        else:
            # Hand off to the helper
            html_doc = helper.fetch_url(url)
        if not html_doc:
            return None
        # Parse the HTML
        soup = Fetcher.make_soup(html_doc, helper)
        if soup is None:
            print("Fetcher.fetch_url({0}): No soup or no soup.html".format(url))
            return None
        # Obtain the metadata and the content from the resulting soup
        metadata = helper.get_metadata(soup) if helper else None
        content = helper.get_content(soup) if helper else soup.html.body
        return (metadata, content)
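
# Usage sketch (not from the original source): fetch and scrape a single URL via
# the Fetcher class. The example URL is a placeholder; metadata.heading and
# metadata.author follow the metadata attributes used in _init_from_scrape below.
def show_article_heading(url: str) -> None:
    result = Fetcher.fetch_url(url)
    if result is None:
        print("Could not fetch {0}".format(url))
        return
    metadata, content = result
    if metadata is not None:
        print("{0} - {1}".format(metadata.heading, metadata.author))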
def _load_topics(self):
    """ Load all article topics into the self._atopics dictionary """
    self._atopics = {}
    with SessionContext(commit=True, read_only=True) as session:
        print("Starting load of all article topic vectors")
        t0 = time.time()
        # Do the next refresh from this time point
        self._timestamp = datetime.utcnow()
        q = (
            session.query(Article)
            .join(Root)
            .filter(Root.visible)
            .with_entities(Article.id, Article.topic_vector)
        )
        for a in q.yield_per(2000):
            if a.topic_vector:
                # Load the topic vector into a numpy array
                vec = json.loads(a.topic_vector)
                if isinstance(vec, list) and len(vec) == self._corpus.dimensions:
                    self._atopics[a.id] = np.array(vec)
                else:
                    print("Warning: faulty topic vector for article {0}".format(a.id))
        t1 = time.time()
        print(
            "Loading of {0} topic vectors completed in {1:.2f} seconds".format(
                len(self._atopics), t1 - t0
            )
        )
def store_query_data(client_id: str, key: str, data: ClientDataDict) -> bool:
    """ Save client query data in the database, under the given key """
    if not client_id or not key:
        return False

    now = datetime.utcnow()
    try:
        with SessionContext(commit=True) as session:
            row = (
                session.query(QueryData)
                .filter(QueryData.key == key)
                .filter(QueryData.client_id == client_id)
            ).one_or_none()
            if row is None:
                # Not already present: insert
                row = QueryData(
                    client_id=client_id,
                    key=key,
                    created=now,
                    modified=now,
                    data=data,
                )
                session.add(row)
            else:
                # Already present: update
                row.data = data  # type: ignore
                row.modified = now  # type: ignore
        # The session is auto-committed upon exit from the context manager
        return True
    except Exception as e:
        logging.error("Error storing query data in db: {0}".format(e))
    return False
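
# Usage sketch (not from the original source): persist a small settings dict for
# a client device. The client ID, key and payload are made up, and ClientDataDict
# is assumed to accept any JSON-serializable mapping.
ok = store_query_data(
    "my-device-uuid",
    "preferences",
    {"voice": "Dora", "news_source": "RUV"},
)
if not ok:
    print("Could not store query data")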
def _gen_most_freq_queries_answer(q):
    """ Answer question concerning most frequent queries. """
    with SessionContext(read_only=True) as session:
        now = datetime.utcnow()
        start = now - timedelta(days=_QUERIES_PERIOD)
        end = now
        qr = QueryTypesQuery.period(start=start, end=end, enclosing_session=session)

        if qr:
            top_qtype = qr[0][1]
            desc = _QTYPE_TO_DESC.get(top_qtype) or "óskilgreindum fyrirspurnum"
            answer = "Undanfarið hef ég mest svarað {0}.".format(desc)
        else:
            answer = "Ég hef ekki svarað neinum fyrirspurnum upp á síðkastið."

        response = dict(answer=answer)
        voice = answer
        q.set_expires(now + timedelta(hours=1))
        q.set_answer(response, answer, voice)
        q.set_qtype(_STATS_QTYPE)
        q.set_key("FreqQuery")

        return True
def refresh_topics(self):
    """ Load any new article topics into the _atopics dict """
    with self._lock:
        with SessionContext(commit=True, read_only=True) as session:
            # Do the next refresh from this time point
            ts = datetime.utcnow()
            q = (
                session.query(Article)
                .join(Root)
                .filter(Root.visible)
                .filter(Article.indexed >= self._timestamp)
                .with_entities(Article.id, Article.topic_vector)
            )
            self._timestamp = ts
            count = 0
            for a in q.yield_per(100):
                if a.topic_vector:
                    # Load the topic vector into a numpy array
                    vec = json.loads(a.topic_vector)
                    if isinstance(vec, list) and len(vec) == self._corpus.dimensions:
                        self._atopics[a.id] = np.array(vec)
                        count += 1
                    else:
                        print(
                            "Warning: faulty topic vector for article {0}".format(a.id)
                        )
            print("Completed refresh_topics, {0} article vectors added".format(count))
def fetch_article(cls, url, enclosing_session=None):
    """ Fetch a previously scraped article, returning a tuple
        (article, metadata, content), or (None, None, None) on error """
    with SessionContext(enclosing_session) as session:
        article = cls.find_article(url, session)
        if article is None:
            return (None, None, None)
        html_doc = article.html
        if not html_doc:
            return (None, None, None)
        helper = cls.helper_for(session, url)
        # Parse the HTML
        soup = Fetcher.make_soup(html_doc, helper)
        if soup is None:
            logging.warning("Fetcher.fetch_article({0}): No soup".format(url))
            return (None, None, None)
        # Obtain the metadata and the content from the resulting soup
        metadata = helper.get_metadata(soup) if helper else None
        content = helper.get_content(soup) if helper else soup.html.body
        return (article, metadata, content)
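
# Usage sketch (not from the original source): re-extract metadata and content
# from an article that has already been scraped into the database. The example
# URL is a placeholder.
article, metadata, content = Fetcher.fetch_article("https://www.example.is/frett/123")
if article is not None and metadata is not None:
    print("Heading: {0}".format(metadata.heading))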
def postag_api(version=1):
    """ API to parse text and return POS tagged tokens in a verbose JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.tag_text(session, text, all_names=True)
        # Amalgamate the result into a single list of sentences
        pa: List[List[TokenDict]] = []
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pa = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                for pg in pgs:
                    pa.extend(pg)
        for sent in pa:
            # Transform the token representation into a
            # nice canonical form for outside consumption
            # err = any("err" in t for t in sent)
            for t in sent:
                canonicalize_token(t)

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pa, stats=stats, register=register)
def test_del_query_history(client):
    """ Test query history deletion API. """
    with SessionContext(commit=False) as session:
        # If database contains the logged query "GREYNIR_TESTING" we know the
        # tests are running on the dummy data in tests/test_files/test_queries.csv.
        cnt = session.query(Query).filter(Query.question == "GREYNIR_TESTING").count()
        if cnt != 1:
            return

        # Num queries in dummy test data
        TEST_EXPECTED_NUM_QUERIES = 6
        # We expect one query with this client ID
        TEST_CLIENT_ID = "123456789"

        # Number of queries prior to API call
        pre_numq = session.query(Query).count()
        assert pre_numq == TEST_EXPECTED_NUM_QUERIES, "Malformed dummy test data"

        qstr = urlencode(
            {"action": "clear", "client_type": "some_type", "client_id": TEST_CLIENT_ID}
        )
        _ = client.get("/query_history.api?" + qstr)

        post_numq = session.query(Query).count()
        assert post_numq == pre_numq - 1
def _get_cached_entry(name, url, enclosing_session=None):
    """ Fetch a cached entry by key and URL """
    with SessionContext(commit=True, session=enclosing_session) as session:
        # TODO: The content column should be converted from varchar to jsonb
        # so that it can be queried faster & more intelligently
        return (
            session.query(Link)
            .filter(Link.key == name)
            .filter(Link.content.like("%" + url + "%"))
            .one_or_none()
        )
def _blacklisted_urls_for_key(key, enclosing_session=None):
    """ Fetch blacklisted urls for a given key """
    with SessionContext(commit=True, session=enclosing_session) as session:
        q = (
            session.query(BlacklistedLink.url)
            .filter(BlacklistedLink.link_type == "image")
            .filter(BlacklistedLink.key == key)
            .all()
        )
        return [r for (r,) in q]
def _init_from_scrape(
    cls, url: Optional[str], enclosing_session: Optional[Session] = None
):
    """ Scrape an article from its URL """
    if url is None:
        return None
    a = cls(url=url)
    with SessionContext(enclosing_session) as session:
        # Obtain a helper corresponding to the URL
        html, metadata, helper = Fetcher.fetch_url_html(url, session)
        if html is None:
            return a
        a._html = html
        if metadata is not None:
            a._heading = metadata.heading
            a._author = metadata.author
            a._timestamp = metadata.timestamp
            a._authority = metadata.authority
        a._scraped = datetime.utcnow()
        if helper is not None:
            helper = cast(Any, helper)
            a._scr_module = helper.scr_module
            a._scr_class = helper.scr_class
            a._scr_version = helper.scr_version
            a._root_id = helper.root_id
            a._root_domain = helper.domain
        return a
def reparse_api(version=1):
    """ Reparse an already parsed and stored article with a given UUID """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    uuid = request.form.get("id", "").strip()[0:_MAX_UUID_LENGTH]
    tokens = None
    register = {}
    stats = {}

    with SessionContext(commit=True) as session:
        # Load the article
        a = ArticleProxy.load_from_uuid(uuid, session)
        if a is not None:
            # Found: Parse it (with a fresh parser) and store the updated version
            a.parse(session, verbose=True, reload_parser=True)
            # Save the tokens
            tokens = a.tokens
            # Build register of person names
            register = a.create_register(session)
            stats = dict(
                num_tokens=a.num_tokens,
                num_sentences=a.num_sentences,
                num_parsed=a.num_parsed,
                ambiguity=a.ambiguity,
            )

    # Return the tokens as a JSON structure to the client,
    # along with a name register and article statistics
    return better_jsonify(valid=True, result=tokens, register=register, stats=stats)
def feedback_api(version=1):
    """ Endpoint to accept submitted feedback forms. """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    name = request.values.get("name")
    email = request.values.get("email")
    comment = request.values.get("comment")
    topic = request.values.get("topic")

    if comment:
        with SessionContext(commit=True) as session:
            try:
                qrow = Feedback(
                    timestamp=datetime.utcnow(),
                    topic=topic,
                    name=name,
                    email=email,
                    comment=comment,
                )
                session.add(qrow)
                return better_jsonify(valid=True)
            except Exception as e:
                logging.error("Error saving feedback to db: {0}".format(e))

    return better_jsonify(valid=False)
def parse_api(version=1):
    """ API to parse text and return POS tagged tokens in JSON format """
    if not (1 <= version <= 1):
        # Unsupported version
        return better_jsonify(valid=False, reason="Unsupported version")

    try:
        text = text_from_request(request)
    except Exception:
        return better_jsonify(valid=False, reason="Invalid request")

    with SessionContext(commit=True) as session:
        pgs, stats, register = TreeUtility.parse_text(session, text, all_names=True)
        # In this case, we should always get a single paragraph back
        if pgs:
            # Only process the first paragraph, if there are many of them
            if len(pgs) == 1:
                pgs = pgs[0]
            else:
                # More than one paragraph: gotta concatenate 'em all
                pa = []
                for pg in pgs:
                    pa.extend(pg)
                pgs = pa

    # Return the tokens as a JSON structure to the client
    return better_jsonify(valid=True, result=pgs, stats=stats, register=register)
def dump_tokens(limit):
    """ Iterate through parsed articles and print a list
        of tokens and their matched terminals """
    dtd = dict()
    with BIN_Db.get_db() as db, SessionContext(commit=True) as session:
        # Iterate through the articles
        q = (
            session.query(Article)
            .filter(Article.tree != None)
            .order_by(Article.timestamp)
        )
        if limit is None:
            q = q.all()
        else:
            q = q[0:limit]
        for a in q:
            print(
                "\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}".format(
                    a
                )
            )
            tree = TreeTokenList()
            tree.load(a.tree)
            for ix, toklist in tree.token_lists():
                print("\nSentence {0}:".format(ix))
                at_start = True
                for t in toklist:
                    if t.tokentype == "WORD":
                        wrd = t.token[1:-1]
                        td = dtd.get(t.terminal)
                        if td is None:
                            td = TerminalDescriptor(t.terminal)
                            dtd[t.terminal] = td
                        stem = td.stem(db, wrd, at_start)
                        at_start = False
                        print(" {0} {1} {2}".format(wrd, stem, t.terminal))
                    else:
                        print(" {0.token} {0.cat} {0.terminal}".format(t))
def sentence_stream(limit=None, skip=None, skip_errors=True):
    """ Generator of a sentence stream consisting of `limit` sentences
        (or fewer) from the most recently parsed articles.
        Each sentence is a list of token dicts. """
    with SessionContext(commit=True, read_only=True) as session:
        q = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(ArticleRow.parsed))
            .yield_per(200)
        )
        count = 0
        skipped = 0
        for a in q:
            doc = json.loads(a.tokens)
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    if skip is not None and skipped < skip:
                        # If requested, skip sentences from the front
                        # (useful for test set)
                        skipped += 1
                        continue
                    # Yield the sentence as a fresh token list
                    yield [t for t in sent]
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return
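
# Usage sketch (not from the original source): split recently parsed sentences
# into a small test set and a larger training set by way of the `skip` parameter,
# mirroring the "useful for test set" comment above. The set sizes are arbitrary.
test_sentences = list(sentence_stream(limit=1000))
train_sentences = list(sentence_stream(limit=9000, skip=1000))
print(len(test_sentences), len(train_sentences))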
def token_stream(
    limit: Optional[int] = None, skip_errors: bool = True
) -> Iterator[Optional[TokenDict]]:
    """ Generator of a token stream consisting of `limit` sentences
        (or fewer) from the most recently parsed articles.
        After each sentence, None is yielded. """
    with SessionContext(commit=True, read_only=True) as session:
        q: SqlQuery[ArticleRow] = (
            session.query(ArticleRow.url, ArticleRow.parsed, ArticleRow.tokens)
            .filter(ArticleRow.tokens != None)
            .order_by(desc(cast(Column, ArticleRow.parsed)))
            .yield_per(200)
        )
        count = 0
        for a in q:
            assert a is not None
            if not a.tokens:
                continue
            doc = cast(PgsList, json.loads(a.tokens))
            for pg in doc:
                for sent in pg:
                    if not sent:
                        continue
                    if skip_errors and any("err" in t for t in sent):
                        # Skip error sentences
                        continue
                    for t in sent:
                        # Yield the tokens
                        yield t
                    yield None  # End-of-sentence marker
                    # Are we done?
                    count += 1
                    if limit is not None and count >= limit:
                        return
def __iter__(self):
    """ Iterate through articles (documents) """
    print("Starting iteration through corpus from words table")
    if self._dictionary is not None:
        xform = lambda x: self._dictionary.doc2bow(x)
    else:
        xform = lambda x: x
    with SessionContext(commit=True) as session:
        # Fetch bags of words sorted by articles
        q = (
            session.query(Word.article_id, Word.stem, Word.cat, Word.cnt)
            .order_by(Word.article_id)
            .yield_per(2000)
        )
        bag = []
        last_uuid = None
        for uuid, stem, cat, cnt in q:
            if uuid != last_uuid:
                if bag:
                    # Finishing the last article: yield its bag
                    # print("Yielding bag of {0} words".format(len(bag)))
                    yield xform(bag)
                    bag = []
                # Beginning a new article with an empty bag
                last_uuid = uuid
            # Convert stem to lowercase and replace spaces with underscores
            w = w_from_stem(stem, cat)
            if cnt == 1:
                bag.append(w)
            else:
                bag.extend([w] * cnt)
        if (last_uuid is not None) and bag:
            # print("Yielding bag of {0} words".format(len(bag)))
            yield xform(bag)
    print("Finished iteration through corpus from words table")
def article_api(version=1):
    """ Obtain information about an article, given its URL or id """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    url = request.values.get("url")
    uuid = request.values.get("id")
    if url:
        url = url.strip()[0:_MAX_URL_LENGTH]
    if uuid:
        uuid = uuid.strip()[0:_MAX_UUID_LENGTH]
    if url:
        # URL has priority, if both are specified
        uuid = None
    if not url and not uuid:
        return better_jsonify(valid=False, reason="No url or id specified in query")

    with SessionContext(commit=True) as session:
        if uuid:
            a = ArticleProxy.load_from_uuid(uuid, session)
        elif url.startswith("http:") or url.startswith("https:"):
            a = ArticleProxy.load_from_url(url, session)
        else:
            a = None

        if a is None:
            return better_jsonify(valid=False, reason="Article not found")
        if a.html is None:
            return better_jsonify(valid=False, reason="Unable to fetch article")

        # Prepare the article for display
        a.prepare(session)
        register = a.create_register(session, all_names=True)

        # Fetch names of article topics, if any
        topics = (
            session.query(ArticleTopic).filter(ArticleTopic.article_id == a.uuid).all()
        )
        topics = [dict(name=t.topic.name, id=t.topic.identifier) for t in topics]

        return better_jsonify(
            valid=True,
            url=a.url,
            id=a.uuid,
            heading=a.heading,
            author=a.author,
            ts=a.timestamp.isoformat()[0:19],
            num_sentences=a.num_sentences,
            num_parsed=a.num_parsed,
            ambiguity=a.ambiguity,
            register=register,
            topics=topics,
        )
def _purge_single(key, ctype=None, enclosing_session=None):
    """ Remove cache entry """
    with SessionContext(commit=True, session=enclosing_session) as session:
        filters = [Link.key == key]
        if ctype:
            filters.append(Link.ctype == ctype)
        session.query(Link).filter(*filters).delete()
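
# Usage sketch (not from the original source): drop a cached entry for a given
# key, optionally restricted to one content type. Both the key and the "image"
# ctype value are placeholders, not values confirmed by this excerpt.
_purge_single("Jón Jónsson", ctype="image")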
def load_from_url(cls, url, enclosing_session=None):
    """ Load or scrape an article, given its URL """
    with SessionContext(enclosing_session) as session:
        ar = session.query(ArticleRow).filter(ArticleRow.url == url).one_or_none()
        if ar is not None:
            return cls._init_from_row(ar)
        # Not found in database: attempt to fetch
        return cls._init_from_scrape(url, session)
def top_locations(limit=_TOP_LOC_LENGTH, kind=None, days=_TOP_LOC_PERIOD):
    """ Return a list of recent locations along with the list of
        articles in which they are mentioned """
    with SessionContext(read_only=True) as session:
        q = (
            session.query(
                Location.name,
                Location.kind,
                Location.country,
                Location.article_url,
                Location.latitude,
                Location.longitude,
                Article.id,
                Article.heading,
                Root.domain,
            )
            .join(Article, Article.url == Location.article_url)
            .filter(Article.timestamp > datetime.utcnow() - timedelta(days=days))
            .join(Root)
            .filter(Root.visible)
        )

        # Filter by kind
        if kind:
            q = q.filter(Location.kind == kind)

        q = q.order_by(desc(Article.timestamp))

        # Group articles by unique location
        locs = defaultdict(list)
        for r in q.all():
            article = {
                "url": r.article_url,
                "id": r.id,
                "heading": r.heading,
                "domain": r.domain,
            }
            k = (r.name, r.kind, r.country, r.latitude, r.longitude)
            locs[k].append(article)

        # Create top locations list sorted by article count
        loclist = []
        for k, v in locs.items():
            (name, kind, country, lat, lon) = k  # Unpack tuple key
            # Google map links currently use the placename instead of
            # coordinates. This works well for most Icelandic and
            # international placenames, but fails on some.
            map_url = GMAPS_PLACE_URL.format(name)
            # if lat and lon:
            #     map_url = GMAPS_COORD_URL.format(lat, lon, "7z")
            loclist.append(
                {
                    "name": name,
                    "kind": kind,
                    "country": country,
                    "map_url": map_url,
                    "articles": v,
                }
            )
        loclist.sort(key=lambda x: len(x["articles"]), reverse=True)

        return loclist[:limit]
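
# Usage sketch (not from the original source): list the five most frequently
# mentioned locations over the past week. The "country" kind value is an
# assumption about how Location.kind is encoded; the dict keys match those
# built by top_locations() above.
for loc in top_locations(limit=5, kind="country", days=7):
    print(
        "{0} ({1}): {2} article(s)".format(
            loc["name"], loc["country"], len(loc["articles"])
        )
    )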
def graph_data(num_persons=_DEFAULT_NUM_PERSONS_GRAPH):
    """ Get and prepare data for the people graph """
    with SessionContext(read_only=True) as session:
        # Find all persons mentioned in articles that
        # have at least two names (i.e. the stem contains whitespace)
        q = (
            session.query(Word.stem, Word.article_id, Word.cat)
            .filter(Word.cat.like("person_%"))
            .filter(Word.stem.like("% %"))
        )
        res = q.all()

        # Count the number of occurrences of each name
        cnt = Counter()
        for name, _, _ in res:
            cnt[name] += 1

        # Get the most common names
        names = [name for name, freq in cnt.most_common(num_persons)]

        # Generate a dict mapping article ids to the set of top names mentioned
        articles = defaultdict(set)
        for name, art_id, _ in res:
            if name in names:
                articles[art_id].add(name)

        # Find all links
        nlinks = defaultdict(int)
        for a_id, persons in articles.items():
            if len(persons) < 2:
                # We need at least two names to establish a link
                continue
            # Find all permutations of people mentioned in the article
            perm = list(permutations(persons, 2))
            for a, b in perm:
                # We use a sorted tuple as a hashable dict key when
                # counting the number of connections between any two names
                k = tuple(sorted([names.index(a), names.index(b)]))
                nlinks[k] += 1

        # Create the final link and node data structures
        links = [
            {"source": k[0], "target": k[1], "weight": v} for k, v in nlinks.items()
        ]

        nodes = []
        for idx, n in enumerate(names):
            # print(cnt[n])
            # TODO: Normalize influence
            nodes.append({"name": n, "id": idx, "influence": cnt[n] / 7, "zone": 0})

        dataset = {"nodes": nodes, "links": links}
        return dataset
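
# Usage sketch (not from the original source): dump the person graph as JSON,
# e.g. for a force-directed frontend visualization. The output path is a
# placeholder, and a module-level `json` import is assumed.
with open("people_graph.json", "w", encoding="utf-8") as f:
    json.dump(graph_data(num_persons=50), f, ensure_ascii=False)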
def news():
    """ Handler for a page with a list of articles + pagination """
    topic = request.args.get("topic")
    root = request.args.get("root")
    author = request.args.get("author")

    try:
        offset = max(0, int(request.args.get("offset", 0)))
        limit = max(0, int(request.args.get("limit", _DEFAULT_NUM_ARTICLES)))
    except Exception:
        offset = 0
        limit = _DEFAULT_NUM_ARTICLES
    limit = min(limit, _MAX_NUM_ARTICLES)  # Cap at _MAX_NUM_ARTICLES results per page

    with SessionContext(read_only=True) as session:
        # Fetch articles
        articles = fetch_articles(
            topic=topic,
            offset=offset,
            limit=limit,
            root=root,
            author=author,
            enclosing_session=session,
        )

        # If all articles in the list are timestamped within 24 hours of now,
        # we display their times in HH:MM format. Otherwise, we display the full date.
        display_time = True
        if articles and (datetime.utcnow() - articles[-1].timestamp).days >= 1:
            display_time = False

        # Fetch the list of article topics
        q = session.query(Topic.identifier, Topic.name).order_by(Topic.name).all()
        d = {t[0]: t[1] for t in q}
        topics = dict(id=topic, name=d.get(topic, ""), topic_list=q)

        # Fetch the list of article sources (roots)
        q = (
            session.query(Root.domain, Root.description)
            .filter(Root.visible == True)
            .order_by(Root.description)
        )
        roots = dict(q.all())

        return render_template(
            "news.html",
            title="Fréttir",
            articles=articles,
            topics=topics,
            display_time=display_time,
            offset=offset,
            limit=limit,
            selected_root=root,
            roots=roots,
            author=author,
        )
def stats():
    """ Render a page containing various statistics from the Greynir database. """
    days = _DEFAULT_STATS_PERIOD
    try:
        days = min(
            _MAX_STATS_PERIOD, int(request.args.get("days", _DEFAULT_STATS_PERIOD))
        )
    except Exception:
        pass

    chart_data: Dict[str, Any] = dict()

    with SessionContext(read_only=True) as session:
        # Article stats
        sq = StatsQuery()
        result = sq.execute(session)
        total = dict(art=Decimal(), sent=Decimal(), parsed=Decimal())
        for r in result:
            total["art"] += r.art
            total["sent"] += r.sent
            total["parsed"] += r.parsed

        # Gender stats
        gq = GenderQuery()
        gresult = gq.execute(session)
        gtotal = dict(kvk=Decimal(), kk=Decimal(), hk=Decimal(), total=Decimal())
        for r in gresult:
            gtotal["kvk"] += r.kvk
            gtotal["kk"] += r.kk
            gtotal["hk"] += r.hk
            gtotal["total"] += r.kvk + r.kk + r.hk

        # Author stats
        authresult = top_authors(session=session)

        # Chart stats
        chart_data = chart_stats(session=session, num_days=days)

    return render_template(
        "stats.html",
        title="Tölfræði",
        result=result,
        total=total,
        gresult=gresult,
        gtotal=gtotal,
        authresult=authresult,
        scraped_chart_data=json.dumps(chart_data["scraped"]),
        parsed_chart_data=json.dumps(chart_data["parsed"]),
        queries_chart_data=json.dumps(chart_data["queries"]),
        scraped_avg=int(round(chart_data["scraped"]["avg"])),
        parsed_avg=round(chart_data["parsed"]["avg"], 1),
        queries_avg=round(chart_data["queries"]["avg"], 1),
    )
def scrape_from_url(cls, url, enclosing_session=None):
    """ Force fetch of an article, given its URL """
    with SessionContext(enclosing_session) as session:
        ar = session.query(ArticleRow).filter(ArticleRow.url == url).one_or_none()
        a = cls._init_from_scrape(url, session)
        if a is not None and ar is not None:
            # This article already existed in the database, so note its UUID
            a._uuid = ar.id
        return a
def main(argv=None):
    """ Guido van Rossum's pattern for a Python main function """

    if argv is None:
        argv = sys.argv
    try:
        try:
            opts, args = getopt.getopt(
                argv[1:], "hirl:u:", ["help", "init", "reparse", "limit=", "urls="]
            )
        except getopt.error as msg:
            raise Usage(msg)

        init = False
        # !!! DEBUG default limit on number of articles to parse, unless otherwise specified
        limit = 10
        reparse = False
        urls = None

        # Process options
        for o, a in opts:
            if o in ("-h", "--help"):
                print(__doc__)
                sys.exit(0)
            elif o in ("-i", "--init"):
                init = True
            elif o in ("-r", "--reparse"):
                reparse = True
            elif o in ("-l", "--limit"):
                # Maximum number of articles to parse
                try:
                    limit = int(a)
                except ValueError:
                    pass
            elif o in ("-u", "--urls"):
                urls = a  # Text file with list of URLs

        # Process arguments
        for _ in args:
            pass

        # Set logging format
        logging.basicConfig(
            format="%(asctime)s %(levelname)s:%(message)s", level=logging.INFO
        )

        # Read the configuration settings file
        try:
            Settings.read("config/Reynir.conf")
            # Don't run the scraper in debug mode
            Settings.DEBUG = False
        except ConfigError as e:
            print("Configuration error: {0}".format(e), file=sys.stderr)
            return 2

        if init:
            # Initialize the scraper database
            init_roots()
        else:
            # Run the scraper
            scrape_articles(reparse=reparse, limit=limit, urls=urls)

    except Usage as err:
        print(err.msg, file=sys.stderr)
        print("For help use --help", file=sys.stderr)
        return 2

    finally:
        SessionContext.cleanup()
        Article.cleanup()

    # Completed with no error
    return 0
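
# Usage sketch (not from the original source): typical invocations of this
# scraper entry point, based on the getopt options declared above. The script
# name "scraper.py" is a placeholder.
#
#   python scraper.py --init           # initialize the scraper database roots
#   python scraper.py --limit=50       # scrape and parse up to 50 articles
#   python scraper.py --urls=urls.txt  # scrape the URLs listed in urls.txt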