Beispiel #1
0
def add_article(source_id, entry):
    """
    Store a feed entry as a FeedArticle unless it is already stored.

    An entry counts as already stored when a FeedArticle with the same
    ``link`` and ``source_id`` exists; in that case nothing is written.

    The thumbnail URL is taken from the first source that yields one:
        1. the first item of the entry's ``media_thumbnail`` list,
        2. the first item of ``links`` whose ``type`` starts with ``image``
           and that has an ``href``,
        3. ``get_thumbnail_url_from_summary`` applied to the summary markup.

    Args:
        source_id: id of the feed source the entry belongs to.
        entry: parsed feed entry exposing ``link``, ``title`` and ``summary``
            attributes and supporting ``in`` / ``[]`` lookups for the optional
            ``media_thumbnail`` and ``links`` keys (presumably a feedparser
            entry -- confirm against the caller).

    Returns:
        None.

    Raises:
        Any exception raised by ``cdb.session.commit()``; the session is
        rolled back before the exception propagates.
    """
    duplicate = FeedArticle.query.filter_by(
        link=entry.link, source_id=source_id).first()
    if duplicate:
        return

    # ``summary`` keeps the tag-stripped text, ``html`` the original markup.
    summary = BeautifulSoup(entry.summary, 'lxml').get_text()
    article = FeedArticle(
        link=entry.link,
        title=entry.title,
        summary=summary,
        source_id=source_id,
        html=entry.summary)

    if 'media_thumbnail' in entry:
        thumbnails = entry['media_thumbnail']
        # An empty list or an item without ``url`` must not abort the import;
        # fall through to the other thumbnail sources instead.
        if thumbnails and 'url' in thumbnails[0]:
            article.thumbnail_url = thumbnails[0]['url']

    if not article.thumbnail_url and 'links' in entry:
        for link in entry['links']:
            is_image = 'type' in link and link['type'].startswith('image')
            if is_image and 'href' in link:
                article.thumbnail_url = link['href']
                break

    # Gate on the markup, not on the stripped text: a summary consisting only
    # of an <img> tag has empty text but still carries a usable thumbnail.
    if article.html and not article.thumbnail_url:
        article.thumbnail_url = get_thumbnail_url_from_summary(article.html)

    cdb.session.add(article)
    try:
        cdb.session.commit()
    except Exception:
        # Leave the shared session usable for whoever runs next, then let the
        # caller see the original failure.
        cdb.session.rollback()
        raise
Beispiel #2
0
def update_db():
    """
    Fetch all feed entries and hand the new ones to ``add_article``.

    Steps:
        1. ``fetch_all_entries()`` returns ``(source_id, entry)`` pairs; the
           time it took and the number of pairs are printed.
        2. For each pair with no FeedArticle of the same ``link`` and
           ``source_id``, a FeedArticle is built with:
               link, title
               summary        -> text extracted from ``entry.summary``
               html           -> ``entry.summary`` unchanged
               thumbnail_url  -> from ``media_thumbnail`` or an image link
        3. The article is passed to ``add_article.delay``.
    """

    # 1. Fetch all entries, timing the call.
    started = time.time()
    results = fetch_all_entries()
    finished = time.time()
    print('Fetching entries takes %.3fs' % (finished - started))
    print('Num entries %d' % (len(results)))

    # 2. Build an article for every entry that is not stored yet.
    for source_id, entry in results:
        stored = FeedArticle.query.filter_by(
            link=entry.link, source_id=source_id).first()
        if stored:
            continue

        plain_text = BeautifulSoup(entry.summary, 'lxml').get_text()
        article = FeedArticle(
            link=entry.link,
            title=entry.title,
            summary=plain_text,
            source_id=source_id,
            html=entry.summary
        )

        # Preferred thumbnail: first media_thumbnail item of the entry.
        if 'media_thumbnail' in entry:
            article.thumbnail_url = entry['media_thumbnail'][0]['url']

        # Otherwise: first link typed as an image that also has an href.
        if not article.thumbnail_url and 'links' in entry:
            for candidate in entry['links']:
                typed_as_image = ('type' in candidate and
                                  candidate['type'].startswith('image'))
                if typed_as_image and 'href' in candidate:
                    article.thumbnail_url = candidate['href']
                    break

        # Disabled alternative kept for reference:
        #   fetch_html.s(entry.link) | get_readable.s(entry.link) |
        #   add_article.s(article), started with apply_async().

        # 3. Hand the article over.
        # NOTE(review): ``.delay`` suggests a task-queue call, and the
        # add_article definition visible elsewhere takes (source_id, entry)
        # rather than an article -- confirm which signature applies here.
        add_article.delay(article)