def source_fetch(source): debug("SF: Doing fetch for source: {0}".format(source.url)) result = _source_fetch(source) debug("SF: Done with source fetch for {0}; result type: {1}".format(source.url, (result.method if result else None))) added_any = False now = datetime.datetime.now() to_put = [] tasks_to_enqueue = [] if result: if result.feed_title: source.title = result.feed_title if result.brand: source.brand = result.brand titles = [entry['title'] for entry in result.entries if entry['title']] source.shared_title_suffix = shared_suffix(titles) entries = result.entries[:min(25, len(result.entries))] entry_ids = [Article.id_for_article(entry['url'], source.url) for entry in entries] print "ENTRY IDs:", entry_ids print "ENtry id lens: ", str(map(len, entry_ids)) article_futures = [Article.get_or_insert_async(id) for id in entry_ids] articles = [future.get_result() for future in article_futures] print "ARTICLE_OBJECTS:", articles for i, (entry, article) in enumerate(zip(entries, articles)): if not article.url: added_any = True article.added_date = now article.added_order = i article.source = source.key article.url = canonical_url(entry.get('url')) article.submission_url = canonical_url(entry.get('submission_url')) if entry['published']: article.published = entry['published'] else: article.published = datetime.datetime.now() if not article.title: article.title = entry['title'] to_put.append(article) delay = (i+1) * 4 # wait 5 seconds between each tasks_to_enqueue.append(article.create_fetch_task(delay=delay)) debug("SF: About to put {0} items".format(len(to_put))) if len(to_put): ndb.put_multi(to_put) debug("SF: About to enqueue") if len(tasks_to_enqueue): taskqueue.Queue('articles').add_async(tasks_to_enqueue) debug("SF: done enqueuing") if added_any: source.most_recent_article_added_date = now source_search.add_source_to_index(source) source.last_fetched = now source.put()
def post(self):
    """Delete the source identified by the 'url' request param and all of its
    articles, then respond with 'Done'.
    """
    url = canonical_url(self.request.get('url'))
    source_id = Source.id_for_source(url)
    source = ndb.Key(Source, source_id).get()
    # Delete the source's articles in batches of 100 keys until none remain.
    while True:
        article_keys = Article.query(Article.source == source.key).order(-Article.added_date, Article.added_order).fetch(limit=100, keys_only=True)
        if not article_keys:
            break
        ndb.delete_multi(article_keys)
    source.key.delete()
    self.response.write('Done')
def ensure_source(url, suppress_immediate_fetch=False):
    """Look up (or create) the Source entity for `url` and return it.

    A newly created source is saved, gets a fetch task enqueued, and — unless
    `suppress_immediate_fetch` is set — is fetched synchronously right away.
    """
    canonical = canonical_url(url)
    source, inserted = get_or_insert(Source, Source.id_for_source(canonical))
    if inserted:
        source.url = canonical
        source.put()
        source.enqueue_fetch()
        if not suppress_immediate_fetch:
            source.fetch_now()
    return source
def ensure_article_at_url(url, force_fetch=False, force_mercury=False):
    """Look up (or create) the standalone Article for `url` and return it.

    A newly created article is initialized and saved. Its content is fetched
    now if it has none yet or `force_fetch` is set; `force_mercury` is passed
    through to the fetch.
    """
    # Named `article_id` rather than `id` to avoid shadowing the builtin.
    article_id = Article.id_for_article(url, None)
    article, inserted = get_or_insert(Article, article_id)
    if inserted:
        article.added_date = datetime.datetime.now()
        article.added_order = 0
        article.url = canonical_url(url)
        article.put()
    if not article.content or force_fetch:
        article.fetch_now(force_mercury=force_mercury)
    return article
def create_source_entry_processor(url):
    """Return a post-processing callable for entries of the feed at `url`.

    The returned callable takes (result_entry, feed_entry) and mutates
    result_entry in place; known sources (reddit, longform.org, Designer News)
    get special handling, everything else gets a no-op.
    """
    url = canonical_url(url)
    print "SEARCHING FOR SOURCE ENTRY PROCESSOR FOR:", url
    if url.startswith('http://www.reddit.com') and url.endswith('.rss'):
        # Reddit: the RSS entry link is the comments page. Hit the matching
        # JSON API endpoint once, up front, to map each submission permalink
        # to the externally linked article URL.
        print 'using reddit entry processor'
        json_url = url[:-len('.rss')] + '.json'
        api_resp = json.loads(url_fetch(json_url))
        url_map = {}
        for item_ in api_resp['data']['children']:
            item = item_['data']
            submission_url = 'https://www.reddit.com' + item['permalink']
            actual_url = item['url']
            url_map[submission_url] = actual_url
        print 'url map: {0}'.format(url_map)
        def process_reddit(entry, feed_entry):
            # Rewrite the comments-page URL to the submitted article URL,
            # keeping the original as submission_url.
            print 'entry url: {0}'.format(entry['url'])
            submission_url = entry.get('url', entry.get('link'))
            if submission_url in url_map:
                print 'MATCHING {0} -> {1}'.format(submission_url, url_map[submission_url])
                entry['url'] = url_map[submission_url]
                entry['submission_url'] = submission_url
        return process_reddit
    if url.startswith('http://longform.org/'):
        # Longform: the real article link is the last quoted URL inside the
        # entry's HTML content.
        def longform_override(result_entry, feed_entry):
            if 'content' in feed_entry and len(feed_entry['content']) > 0:
                content = feed_entry['content'][0]['value']
                matches = re.findall(r"\"(.+)\"", content)
                if len(matches):
                    result_entry['url'] = matches[-1]
        return longform_override
    if url == 'http://www.designernews.co/?format=atom':
        # NOTE(review): reads feed_entry['url'], but feedparser entries
        # normally expose 'link' — confirm against a real DN feed.
        def dn_override(result_entry, feed_entry):
            if 'summary' in feed_entry:
                result_entry['url'] = feed_entry['url']
        return dn_override
    # Default: leave the entry untouched.
    def process_vanilla(result_entry, feed_entry):
        pass
    return process_vanilla
def id_for_source(cls, url):
    """Datastore id for a Source: its canonicalized URL."""
    canonical = canonical_url(url)
    return canonical
def id_for_subscription(cls, url, uid):
    """Datastore id for a Subscription: '<canonical url> <uid>'."""
    parts = (canonical_url(url), uid)
    return u" ".join(parts)
def id_for_article(cls, url, source_url):
    """Datastore id for an Article: '<canonical url> <canonical source url>',
    with the literal 'standalone' in place of a missing source."""
    if source_url:
        source_string = canonical_url(source_url)
    else:
        source_string = u"standalone"
    return canonical_url(url) + u" " + source_string