def source_fetch(source):
    debug("SF: Doing fetch for source: {0}".format(source.url))
    result = _source_fetch(source)
    debug("SF: Done with source fetch for {0}; result type: {1}".format(source.url, (result.method if result else None)))
    added_any = False
    now = datetime.datetime.now()
    to_put = []
    tasks_to_enqueue = []
    if result:
        if result.feed_title:
            source.title = result.feed_title
        if result.brand:
            source.brand = result.brand
        
        titles = [entry['title'] for entry in result.entries if entry['title']]
        source.shared_title_suffix = shared_suffix(titles)
        
        entries = result.entries[:25]  # slicing already caps at the available entries
        entry_ids = [Article.id_for_article(entry['url'], source.url) for entry in entries]
        print "ENTRY IDs:", entry_ids
        print "ENtry id lens: ", str(map(len, entry_ids))
        article_futures = [Article.get_or_insert_async(id) for id in entry_ids]
        articles = [future.get_result() for future in article_futures]
        print "ARTICLE_OBJECTS:", articles
        
        for i, (entry, article) in enumerate(zip(entries, articles)):
            if not article.url:
                added_any = True
                article.added_date = now
                article.added_order = i
                article.source = source.key
                article.url = canonical_url(entry.get('url'))
                article.submission_url = canonical_url(entry.get('submission_url'))
                if entry['published']:
                    article.published = entry['published']
                else:
                    article.published = datetime.datetime.now()
                if not article.title:
                    article.title = entry['title']
                to_put.append(article)
                delay = (i+1) * 4 # stagger article fetches 4 seconds apart
                tasks_to_enqueue.append(article.create_fetch_task(delay=delay))
    debug("SF: About to put {0} items".format(len(to_put)))
    if to_put:
        ndb.put_multi(to_put)
    debug("SF: About to enqueue")
    if tasks_to_enqueue:
        taskqueue.Queue('articles').add_async(tasks_to_enqueue)
    debug("SF: done enqueuing")
    if added_any:
        source.most_recent_article_added_date = now
    source_search.add_source_to_index(source)
    source.last_fetched = now
    source.put()
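
shared_suffix() is used above to record the suffix shared by every entry title (e.g. a trailing " - Site Name") but is not defined in this listing; a minimal sketch, assuming it just returns the longest common trailing string of the titles:

def shared_suffix(titles):
    # Longest suffix common to every title; empty string when there are
    # fewer than two titles or nothing is shared.
    if len(titles) < 2:
        return ''
    suffix = titles[0]
    for title in titles[1:]:
        while suffix and not title.endswith(suffix):
            suffix = suffix[1:]
        if not suffix:
            break
    return suffix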
Example #2
 def post(self):
     url = canonical_url(self.request.get('url'))
     source_id = Source.id_for_source(url)
     source = ndb.Key(Source, source_id).get()
     while True:
         articles = Article.query(Article.source == source.key).order(-Article.added_date, Article.added_order).fetch(limit=100, keys_only=True)
         if len(articles) == 0: break
         ndb.delete_multi(articles)
     source.key.delete()
     self.response.write('Done')
Example #3
def ensure_source(url, suppress_immediate_fetch=False):
    url = canonical_url(url)
    source_id = Source.id_for_source(url)
    source, inserted = get_or_insert(Source, source_id)
    if inserted:
        source.url = url
        source.put()
        source.enqueue_fetch()
    if inserted and not suppress_immediate_fetch:
        source.fetch_now()
    return source
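
ensure_source() here and ensure_article_at_url() below both call a module-level get_or_insert() helper that is not part of this listing. A possible sketch, assuming it wraps an ndb transaction and reports whether the entity had to be created:

from google.appengine.ext import ndb

@ndb.transactional
def get_or_insert(model_class, key_id):
    # Like Model.get_or_insert(), but also returns an 'inserted' flag so
    # callers can run one-time setup for brand-new entities.
    entity = ndb.Key(model_class, key_id).get()
    if entity is not None:
        return entity, False
    entity = model_class(id=key_id)
    entity.put()
    return entity, True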
Example #4
def ensure_article_at_url(url, force_fetch=False, force_mercury=False):
    id = Article.id_for_article(url, None)
    article, inserted = get_or_insert(Article, id)
    if inserted:
        article.added_date = datetime.datetime.now()
        article.added_order = 0
    article.url = canonical_url(url)
    # article.published = datetime.datetime.now()
    # article.title = "A test"
    # article.title = None
    article.put()
    if not article.content or force_fetch:
        article.fetch_now(force_mercury=force_mercury)
    return article
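
A hypothetical call (the URL is made up) showing how ensure_article_at_url() is meant to be used:

article = ensure_article_at_url('http://example.com/some-story', force_fetch=True)
# Returns the Article entity, inserting it first and fetching its content
# when it has none yet (or whenever force_fetch is set).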
def create_source_entry_processor(url):
    url = canonical_url(url)
    print "SEARCHING FOR SOURCE ENTRY PROCESSOR FOR:", url
    
    if url.startswith('http://www.reddit.com') and url.endswith('.rss'):
        print 'using reddit entry processor'
        json_url = url[:-len('.rss')] + '.json'
        api_resp = json.loads(url_fetch(json_url))
        url_map = {}
        for item_ in api_resp['data']['children']:
            item = item_['data']
            submission_url = 'https://www.reddit.com' + item['permalink']
            actual_url = item['url']
            url_map[submission_url] = actual_url
        print 'url map: {0}'.format(url_map)
        def process_reddit(entry, feed_entry):
            print 'entry url: {0}'.format(entry['url'])
            submission_url = entry.get('url', entry.get('link'))
            if submission_url in url_map:
                print 'MATCHING {0} -> {1}'.format(submission_url, url_map[submission_url])
                entry['url'] = url_map[submission_url]
                entry['submission_url'] = submission_url
        return process_reddit
    
    if url.startswith('http://longform.org/'):
        def longform_override(result_entry, feed_entry):
            if 'content' in feed_entry and len(feed_entry['content']) > 0:
                content = feed_entry['content'][0]['value']
                matches = re.findall(r"\"(.+)\"", content)
                if matches:
                    result_entry['url'] = matches[-1]
        return longform_override
    
    if url == 'http://www.designernews.co/?format=atom':
        def dn_override(result_entry, feed_entry):
            if 'summary' in feed_entry: result_entry['url'] = feed_entry['url']
        return dn_override
    
    def process_vanilla(result_entry, feed_entry):
        pass
    
    return process_vanilla
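
A hypothetical use of the processor returned by create_source_entry_processor(); the fetch pipeline that normally calls it is not shown here, so the entry dict below is an assumption based on how process_reddit reads it:

processor = create_source_entry_processor('http://www.reddit.com/r/programming/.rss')
entry = {'title': 'A post',
         'url': 'https://www.reddit.com/r/programming/comments/abc123/a_post/'}
processor(entry, feed_entry={})
# If that permalink appeared in the subreddit's JSON listing, entry['url'] now
# points at the linked article and entry['submission_url'] at the comments page.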
Example #6
 def id_for_source(cls, url):
     return canonical_url(url)
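
canonical_url() appears throughout these examples but is not defined in any of them; a rough sketch, assuming it only normalizes a URL so equivalent links map to the same entity ID (lowercased scheme and host, fragment dropped, trailing slash trimmed):

import urlparse

def canonical_url(url):
    if not url:
        return url
    parts = urlparse.urlsplit(url.strip())
    path = parts.path.rstrip('/') or '/'
    return urlparse.urlunsplit(
        (parts.scheme.lower(), parts.netloc.lower(), path, parts.query, ''))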
Example #7
 def id_for_subscription(cls, url, uid):
     return canonical_url(url) + u" " + uid
Example #8
 def id_for_article(cls, url, source_url):
     source_string = canonical_url(source_url) if source_url else u"standalone"
     return canonical_url(url) + u" " + source_string
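
Hypothetical key IDs produced by the id_for_* helpers above (the URLs are made up, assuming canonical_url() leaves them essentially unchanged):

# Article.id_for_article('http://example.com/post', 'http://example.com/feed')
#   -> u'http://example.com/post http://example.com/feed'
# Article.id_for_article('http://example.com/post', None)
#   -> u'http://example.com/post standalone'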