Beispiel #1
0
def backfill_articles_for_ml():
    """Mark recently added articles as pending for the ML service.

    Scans the 500 most recently added articles; any article that has
    neither an ``ml_service_time`` nor the ``processed_by_ml_service``
    flag gets its ``ml_service_time`` reset to the epoch sentinel
    (timestamp 0), which elsewhere marks an article as ready to be
    consumed by the ML service.  Writes are batched through
    ``ndb.put_multi`` to limit datastore round-trips.
    """
    BATCH_SIZE = 50  # max entities per put_multi call
    count = 500
    put_batch = []
    for article in Article.query().order(-Article.added_date).iter(count):
        if not article.ml_service_time and not article.processed_by_ml_service:
            article.ml_service_time = util.datetime_from_timestamp(0)
            put_batch.append(article)
            # Flush when the batch reaches the cap.  The original used
            # `> 50`, which produced 51-item batches (off by one), and
            # checked on every iteration even when nothing was appended.
            if len(put_batch) >= BATCH_SIZE:
                ndb.put_multi(put_batch)
                put_batch = []
    # Flush any trailing partial batch.
    if put_batch:
        ndb.put_multi(put_batch)
Beispiel #2
0
def bookmarks(uid, since=None):
    """Return up to 200 of a user's bookmarks, newest first, each paired
    with its article's JSON (or None if the article no longer exists).

    uid   -- user id to query bookmarks for.
    since -- optional timestamp; when given, only bookmarks modified at
             or after this time are returned and the result is marked
             partial.
    """
    query = Bookmark.query(Bookmark.uid == uid).order(-Bookmark.last_modified)
    if since:
        cutoff = util.datetime_from_timestamp(since)
        query = query.filter(Bookmark.last_modified >= cutoff)
    marks = query.fetch(200)
    # Batch-fetch the referenced articles in one datastore call.
    fetched = ndb.get_multi([mark.article for mark in marks])

    serialized = []
    for mark, art in zip(marks, fetched):
        entry = mark.json()
        entry['article'] = art.json() if art else None
        serialized.append(entry)

    return {
        "bookmarks": serialized,
        "since": util.datetime_to_timestamp(datetime.datetime.now()),
        "partial": since is not None
    }
def article_fetch(article, force_mercury=False):
    """Fetch and parse an article, updating the Article entity and its
    linked ArticleContent entity, then persist both.

    Tries the Mercury parser first (DEFAULT_TO_MERCURY is True below),
    falling back to a direct HTTP fetch + local extraction.  On
    completion the article is stamped with fetch_date and an epoch
    ml_service_time (marking it ready for the ML service), both entities
    are saved, and the article's source cache (if any) is invalidated.

    article       -- Article entity to populate; mutated and put().
    force_mercury -- when True, try the Mercury parser first regardless
                     of the default (NOTE(review): with
                     DEFAULT_TO_MERCURY hard-coded True this flag is
                     currently redundant).
    """
    # Reuse the existing content entity, or create and persist a fresh
    # one immediately so it has a key the article can reference.
    if article.content:
        content = article.content.get()
    else:
        content = ArticleContent()
        content.put()
        print 'KEY', content.key
        article.content = content.key
    
    def make_url_absolute(url):
        # Resolve a possibly-relative URL against the article's URL.
        return urljoin(article.url, url) if url else None
    
    # Feature toggle: when enabled, prefer the AMP variant of the page.
    FORCE_AMP = False
    if FORCE_AMP:
        url = article.amp_url or article.url
    else:
        url = article.url
    DEFAULT_TO_MERCURY = True
    
    def fetch_normal():
        # Direct fetch + local extraction path.  Returns True on success.
        response = url_fetch(url, return_response_obj=True)
        # print 'INFO', response.info()
        # Only accept text/html responses (content-type parameters like
        # charset are stripped before comparing).
        if response and response.info().getheader('content-type', 'text/html').lower().split(';')[0].strip() == 'text/html':
            markup = response.read()
        else:
            print 'BAD MIME TYPE' if response else 'NO SUCCESSFUL RESPONSE'
            markup = None
    
        if markup:
            # process markup:
            markup_soup = BeautifulSoup(markup, 'lxml')
            og_title = find_meta_value(markup_soup, 'og:title')
            og_image = find_meta_value(markup_soup, 'og:image')
            og_description = find_meta_value(markup_soup, 'og:description')
            title_field = find_title(markup_soup)
        
            article.site_name = find_meta_value(markup_soup, 'og:site_name')
        
            # find author:
            article.author = find_author(markup_soup)
        
            # parse and process article content:
            content.html = article_extractor.extract(markup, article.url)
            # NOTE(review): doc_soup is never used below — dead local?
            doc_soup = BeautifulSoup(content.html, 'lxml')
        
            # Prefer og:title, then the <title> tag, then whatever title
            # the article already had (first_present picks the first
            # non-empty candidate — presumably; confirm in its definition).
            article.title = first_present([og_title, title_field, article.title])
            article.top_image = make_url_absolute(first_present([article.top_image, og_image]))
        
            populate_article_json(article, content)
        
            # compute description: prefer og:description, fall back to
            # the first ~40 words of the extracted text.
            description = None
            if og_description and len(og_description.strip()):
                description = truncate(og_description.strip(), words=40)
            elif content.text and len(content.text.strip()) > 0:
                description = truncate(content.text, words=40)
            # Collapse all runs of whitespace to single spaces.
            article.description = re.sub(r"[\r\n\t ]+", " ", description).strip() if description else None
                
            return True
        else:
            return False
    
    def fetch_mercury():
        # Mercury-parser path.  Succeeds only if Mercury returns at
        # least 50 characters of content; returns True on success.
        merc = mercury.fetch(article.url)
        if merc and len(merc.get('content') or "") >= 50:
            article.title = merc['title']
            article.top_image = merc['lead_image_url']
            if merc['date_published'] and not article.published:
                pass # TODO
            article.author = merc['author']
            content.html = merc['content']
            if not article.description:
                article.description = merc['excerpt']
            populate_article_json(article, content)
            return True
        else:
            return False
    
    # Try Mercury first; fall back to the direct fetch on failure.
    if (force_mercury or DEFAULT_TO_MERCURY) and fetch_mercury():
        article.fetch_failed = False
        print "Successfully fetched {0} via mercury".format(url)
    else:
        article.fetch_failed = not fetch_normal()
    
    article.fetch_date = datetime.datetime.now()
    article.ml_service_time = util.datetime_from_timestamp(0) # mark this article as ready to be consumed by the ml service
    content.put()
    article.put()
    # Invalidate the source's cached view so the refreshed article shows up.
    if article.source: article.source.get().invalidate_cache()
Beispiel #4
0
def do_stragglers(rules, unowned_entries):
    """Attach "straggler" entries — entries not yet owned by any rule —
    to the rule they most plausibly belong to.

    Matching is done per rule number: if no known rule carries the
    number, a new Rule is created owning all of that number's entries.
    Otherwise candidates are narrowed first by normalized entry text,
    then by date (discarding rules definitely created after the entry's
    date).  A unique surviving candidate receives the entry; any
    ambiguity prints a diagnostic and abandons the rest of that number's
    entries.

    rules           -- collection of Rule objects supporting .add();
                       mutated in place (new rules may be added, and
                       existing rules gain entries).
    unowned_entries -- iterable of entries with .normalized_text,
                       .data['number'] and .meta (with optional 'date').
    """
    # Index existing rules by the normalized text of their entries and
    # by their rule numbers, for candidate lookups below.
    rules_by_text = defaultdict(set)
    rules_by_number = defaultdict(set)
    for rule in rules:
        for entry in rule.entries:
            rules_by_text[entry.normalized_text].add(rule)
        for number in rule.numbers:
            rules_by_number[number].add(rule)
    # Group the unowned entries: number -> normalized text -> entries.
    unowned_by_number_and_text = defaultdict(lambda: defaultdict(set))
    for entry in unowned_entries:
        unowned_by_number_and_text[entry.data['number']][
            entry.normalized_text].add(entry)
    for number, by_text in unowned_by_number_and_text.items():
        nrules = rules_by_number[number]
        if len(nrules) == 0 or number in {1741}:
            # no history for this rule at all; just assume they're all one rule
            # rule 1741: the unanchored entry is a different rule from the anchored one
            new_rule = Rule()
            new_rule.numbers.add(number)
            for entries in by_text.values():
                new_rule.entries.extend(entries)
            rules.add(new_rule)
            continue
        for normalized_text, entries in by_text.items():
            # Narrow candidates by matching text, but only when the
            # number alone is ambiguous.
            trules = nrules
            if len(trules) > 1:
                trules = trules.intersection(rules_by_text[normalized_text])
            for entry in entries:
                drules = trules
                edate = entry.meta.get('date')
                if edate is not None and number not in {430}:
                    # rule 430: zefram_rules_text says "ca. Sep. 13 1993",
                    # but it was published on Sep. 8
                    edate = util.datetime_from_timestamp(edate).date()
                    # Keep only rules that could already have existed on
                    # the entry's date.
                    drules = [
                        rule for rule in trules
                        if not rule.definitely_created_after(edate)
                    ]
                drules = list(drules)
                if len(drules) == 1:
                    # Exactly one plausible owner: attach the entry.
                    rule = next(iter(drules))
                    rule.entries.append(entry)
                else:
                    # Zero or multiple candidates: report and give up on
                    # the remaining entries for this text group.
                    with warnx():
                        print('could not match entry (and copies) to rule:')
                        print(next(iter(entries)))
                        print('date:', entry.date())
                        for i, rule in enumerate(drules):
                            print('***** candidate %d/%d:' %
                                  (i + 1, len(drules)))
                            print(rule)
                            for oentry in rule.entries:
                                print('--')
                                print(oentry)
                                #for an in oentry.ans: print(an)
                        if not drules:
                            print(
                                '***** no candidates! (%d by number alone, but enacted too late)'
                                % (len(nrules), ))
                        print('====')
                    break
Beispiel #5
0
 def date(self):
     """Return this entry's date as a datetime, or None when no 'date'
     key is present in the metadata."""
     timestamp = self.meta.get('date')
     return None if timestamp is None else util.datetime_from_timestamp(timestamp)