def backfill_articles_for_ml():
    count = 500
    put_batch = []
    for article in Article.query().order(-Article.added_date).iter(count):
        if not article.ml_service_time and not article.processed_by_ml_service:
            # mark the article as ready for the ml service by giving it an epoch timestamp:
            article.ml_service_time = util.datetime_from_timestamp(0)
            put_batch.append(article)
            # flush writes periodically so each put_multi call stays small:
            if len(put_batch) > 50:
                ndb.put_multi(put_batch)
                put_batch = []
    if len(put_batch):
        ndb.put_multi(put_batch)
def bookmarks(uid, since=None):
    q = Bookmark.query(Bookmark.uid == uid).order(-Bookmark.last_modified)
    if since:
        # incremental sync: only return bookmarks modified at or after `since`
        q = q.filter(Bookmark.last_modified >= util.datetime_from_timestamp(since))
    bookmarks = q.fetch(200)
    articles = ndb.get_multi([b.article for b in bookmarks])

    def to_json(bookmark, article):
        j = bookmark.json()
        j['article'] = article.json() if article else None
        return j

    return {
        "bookmarks": [to_json(bookmark, article) for bookmark, article in zip(bookmarks, articles)],
        "since": util.datetime_to_timestamp(datetime.datetime.now()),
        "partial": since is not None
    }
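# Hypothetical usage sketch (the caller below is an assumption, not part of this code):
# a client syncs by first fetching everything, then passing the returned `since` value
# back to get only later changes; `partial` tells it whether the result is a delta or a
# full snapshot.
#
#   result = bookmarks(uid)                          # full fetch, result['partial'] == False
#   delta = bookmarks(uid, since=result['since'])    # only later changes, delta['partial'] == True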
def article_fetch(article, force_mercury=False):
    if article.content:
        content = article.content.get()
    else:
        content = ArticleContent()
        content.put()
        print 'KEY', content.key
        article.content = content.key

    def make_url_absolute(url):
        return urljoin(article.url, url) if url else None

    FORCE_AMP = False
    if FORCE_AMP:
        url = article.amp_url or article.url
    else:
        url = article.url

    DEFAULT_TO_MERCURY = True

    def fetch_normal():
        response = url_fetch(url, return_response_obj=True)
        # print 'INFO', response.info()
        if response and response.info().getheader('content-type', 'text/html').lower().split(';')[0].strip() == 'text/html':
            markup = response.read()
        else:
            print 'BAD MIME TYPE' if response else 'NO SUCCESSFUL RESPONSE'
            markup = None
        if markup:
            # process markup:
            markup_soup = BeautifulSoup(markup, 'lxml')
            og_title = find_meta_value(markup_soup, 'og:title')
            og_image = find_meta_value(markup_soup, 'og:image')
            og_description = find_meta_value(markup_soup, 'og:description')
            title_field = find_title(markup_soup)
            article.site_name = find_meta_value(markup_soup, 'og:site_name')
            # find author:
            article.author = find_author(markup_soup)
            # parse and process article content:
            content.html = article_extractor.extract(markup, article.url)
            doc_soup = BeautifulSoup(content.html, 'lxml')
            article.title = first_present([og_title, title_field, article.title])
            article.top_image = make_url_absolute(first_present([article.top_image, og_image]))
            populate_article_json(article, content)
            # compute description:
            description = None
            if og_description and len(og_description.strip()):
                description = truncate(og_description.strip(), words=40)
            elif content.text and len(content.text.strip()) > 0:
                description = truncate(content.text, words=40)
            article.description = re.sub(r"[\r\n\t ]+", " ", description).strip() if description else None
            return True
        else:
            return False

    def fetch_mercury():
        merc = mercury.fetch(article.url)
        if merc and len(merc.get('content') or "") >= 50:
            article.title = merc['title']
            article.top_image = merc['lead_image_url']
            if merc['date_published'] and not article.published:
                pass  # TODO
            article.author = merc['author']
            content.html = merc['content']
            if not article.description:
                article.description = merc['excerpt']
            populate_article_json(article, content)
            return True
        else:
            return False

    if (force_mercury or DEFAULT_TO_MERCURY) and fetch_mercury():
        article.fetch_failed = False
        print "Successfully fetched {0} via mercury".format(url)
    else:
        article.fetch_failed = not fetch_normal()

    article.fetch_date = datetime.datetime.now()
    article.ml_service_time = util.datetime_from_timestamp(0)  # mark this article as ready to be consumed by the ml service
    content.put()
    article.put()
    if article.source:
        article.source.get().invalidate_cache()
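# The helpers find_meta_value / find_title / find_author used above are not shown in
# this section. As an illustration only, find_meta_value might be implemented roughly
# like this with BeautifulSoup (an assumed sketch, not the project's actual helper):
def find_meta_value(soup, prop):
    # look for <meta property="..."> first, falling back to <meta name="...">
    tag = soup.find('meta', attrs={'property': prop}) or soup.find('meta', attrs={'name': prop})
    return tag.get('content') if tag else None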
def do_stragglers(rules, unowned_entries):
    rules_by_text = defaultdict(set)
    rules_by_number = defaultdict(set)
    for rule in rules:
        for entry in rule.entries:
            rules_by_text[entry.normalized_text].add(rule)
        for number in rule.numbers:
            rules_by_number[number].add(rule)

    unowned_by_number_and_text = defaultdict(lambda: defaultdict(set))
    for entry in unowned_entries:
        unowned_by_number_and_text[entry.data['number']][
            entry.normalized_text].add(entry)

    for number, by_text in unowned_by_number_and_text.items():
        nrules = rules_by_number[number]
        if len(nrules) == 0 or number in {1741}:
            # no history for this rule at all; just assume they're all one rule
            # rule 1741: the unanchored entry is a different rule from the anchored one
            new_rule = Rule()
            new_rule.numbers.add(number)
            for entries in by_text.values():
                new_rule.entries.extend(entries)
            rules.add(new_rule)
            continue
        for normalized_text, entries in by_text.items():
            trules = nrules
            if len(trules) > 1:
                trules = trules.intersection(rules_by_text[normalized_text])
            for entry in entries:
                drules = trules
                edate = entry.meta.get('date')
                if edate is not None and number not in {430}:
                    # rule 430: zefram_rules_text says "ca. Sep. 13 1993",
                    # but it was published on Sep. 8
                    edate = util.datetime_from_timestamp(edate).date()
                    drules = [
                        rule for rule in trules
                        if not rule.definitely_created_after(edate)
                    ]
                drules = list(drules)
                if len(drules) == 1:
                    rule = next(iter(drules))
                    rule.entries.append(entry)
                else:
                    with warnx():
                        print('could not match entry (and copies) to rule:')
                        print(next(iter(entries)))
                        print('date:', entry.date())
                        for i, rule in enumerate(drules):
                            print('***** candidate %d/%d:' % (i + 1, len(drules)))
                            print(rule)
                            for oentry in rule.entries:
                                print('--')
                                print(oentry)
                                #for an in oentry.ans: print(an)
                        if not drules:
                            print(
                                '***** no candidates! (%d by number alone, but enacted too late)'
                                % (len(nrules), ))
                        print('====')
                    break
def date(self):
    ts = self.meta.get('date')
    if ts is None:
        return None
    else:
        return util.datetime_from_timestamp(ts)
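# util.datetime_from_timestamp / util.datetime_to_timestamp are used throughout the
# snippets above but are not shown here. A minimal sketch, assuming plain Unix-epoch
# seconds (an assumption, not the project's confirmed implementation):
import datetime

def datetime_from_timestamp(ts):
    # seconds since the Unix epoch -> naive UTC datetime
    return datetime.datetime.utcfromtimestamp(ts)

def datetime_to_timestamp(dt):
    # naive UTC datetime -> seconds since the Unix epoch
    return (dt - datetime.datetime(1970, 1, 1)).total_seconds()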