import time

from bs4 import BeautifulSoup


@celery.task  # assumption: a Celery app named `celery`; update_db() below calls add_article.delay()
def add_article(source_id, entry):
    """Parse a feed entry and persist it as a FeedArticle."""
    # Skip entries that are already stored for this source.
    if FeedArticle.query.filter_by(link=entry.link, source_id=source_id).first():
        return

    # Keep both the plain-text summary and the raw summary HTML.
    summary = BeautifulSoup(entry.summary, 'lxml').get_text()
    article = FeedArticle(
        link=entry.link,
        title=entry.title,
        summary=summary,
        source_id=source_id,
        html=entry.summary)

    # Thumbnail lookup, in order of preference:
    #   1. the entry's media_thumbnail field,
    #   2. any entry link whose MIME type is an image,
    #   3. the first image found in the summary HTML.
    if 'media_thumbnail' in entry:
        article.thumbnail_url = entry['media_thumbnail'][0]['url']
    if not article.thumbnail_url and 'links' in entry:
        for link in entry['links']:
            if 'type' in link and link['type'].startswith('image'):
                if 'href' in link:
                    article.thumbnail_url = link['href']
                    break
    if article.summary and not article.thumbnail_url:
        article.thumbnail_url = get_thumbnail_url_from_summary(article.html)

    cdb.session.add(article)
    cdb.session.commit()
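# get_thumbnail_url_from_summary() is referenced above but not defined in this
# section. A minimal sketch of what it might look like, assuming it simply
# pulls the src of the first <img> tag out of the summary HTML (an assumption,
# not confirmed here):
def get_thumbnail_url_from_summary(html):
    """Return the src of the first <img> in the given HTML, or None."""
    img = BeautifulSoup(html, 'lxml').find('img')
    if img and img.get('src'):
        return img['src']
    return None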
def update_db():
    """update_db steps:

    1. fetch_entries -> [(src_id, entry)]
    2. fetch_article -> article: [title, link, summary, summary_stemmed,
                                  readable -> html, thumbnail_url]
    3. add_to_db
    """
    # 1. Fetch all entries from every feed source.
    t0 = time.time()
    results = fetch_all_entries()
    t1 = time.time()
    print('Fetching entries took %.3fs' % (t1 - t0))
    print('Num entries: %d' % len(results))

    # 2./3. Parse and persist each entry asynchronously. add_article()
    # does its own duplicate check, so already-stored entries are skipped.
    for source_id, entry in results:
        # chain = fetch_html.s(entry.link) | \
        #     get_readable.s(entry.link) | \
        #     add_article.s(article)
        # chain.apply_async()
        add_article.delay(source_id, entry)
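# fetch_all_entries() is also assumed above. A sketch under the assumption
# that feed sources live in a FeedSource model (hypothetical name) with `id`
# and `url` columns, and that feeds are parsed with feedparser, whose entry
# dicts match the media_thumbnail/links access pattern used in add_article():
import feedparser


def fetch_all_entries():
    """Parse every feed and return a flat [(source_id, entry)] list."""
    results = []
    for source in FeedSource.query.all():  # hypothetical model
        feed = feedparser.parse(source.url)
        for entry in feed.entries:
            results.append((source.id, entry))
    return results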