コード例 #1
0
    def test_le_monde_subscription(self):
        """The article behind ``url_vols_americans`` must fail the quality check.

        NOTE(review): judging by the test name this is presumably a
        subscriber-only Le Monde page -- confirm against the URL constant.
        """
        article = newspaper.Article(url_vols_americans)
        article.download()
        article.parse()

        # A fresh, empty dict is handed in as the second argument.
        passes_quality_check = sufficient_quality(article, {})
        assert not passes_quality_check
コード例 #2
0
    def test_new_scientist_overlay(self):
        """The article behind ``url_fish_will_be_gone`` must fail the quality check.

        NOTE(review): the test name suggests a New Scientist page hidden
        behind an overlay -- confirm against the URL constant.
        """
        article = newspaper.Article(url_fish_will_be_gone)
        article.download()
        article.parse()

        # A fresh, empty dict is handed in as the second argument.
        passes_quality_check = sufficient_quality(article, {})
        assert not passes_quality_check
コード例 #3
0
    def test_sufficient_quality(self):
        """The article behind ``url_investing_in_index_funds`` must pass the quality check."""
        article = newspaper.Article(url_investing_in_index_funds)
        article.download()
        article.parse()

        # A fresh, empty dict is handed in as the second argument.
        passes_quality_check = sufficient_quality(article, {})
        assert passes_quality_check
コード例 #4
0
def download_from_feed(feed: RSSFeed, session, limit=1000):
    """
    Crawl the items of `feed` and save the new, good-enough articles.

    For every item returned by ``feed.feed_items()`` the function resolves
    the url redirects and parses the publication time. Items are skipped
    when ``_date_in_the_future`` is true for them, when they are older than
    ``feed.last_crawled_time``, or when ``model.Article.find`` already
    knows the url. Everything else is downloaded and parsed with
    ``newspaper``; articles accepted by ``sufficient_quality`` are stored
    together with their topics and search keywords.

    Session is needed because this saves stuff to the DB.

    last_crawled_time is useful because otherwise there would be a lot of
    time wasted trying to retrieve the same articles, especially the ones
    which can't be retrieved, so they won't be cached.

    :param feed: feed to crawl; its ``title``, ``last_crawled_time``,
        ``language`` and ``feed_items()`` are used, and
        ``last_crawled_time`` is updated whenever an article is saved
    :param session: DB session used for add / commit / rollback
    :param limit: the crawl stops once this many articles have been saved
    :return: None -- progress and final statistics are reported via ``log``
    """
    log(feed.title)

    downloaded = 0
    skipped = 0
    skipped_due_to_low_quality = dict()
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"last retrieval time from DB = {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items()
    except Exception:
        log("Failed to connect to feed")
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        try:
            url = _url_after_redirects(feed_item['url'])
        except requests.exceptions.TooManyRedirects:
            # `url` is not (re)bound when the lookup fails: on the first
            # item it would be undefined and later it would still hold the
            # previous item's address, so report the feed's own url.
            log(f"Too many redirects for: {feed_item['url']}")
            continue

        # Read the raw value inside the try, but keep it in a local so the
        # handler can log it without touching feed_item again (a missing
        # 'published' key must not raise a second time and abort the crawl).
        published_raw = None
        try:
            published_raw = feed_item['published']
            this_article_time = datetime.strptime(published_raw,
                                                  SIMPLE_TIME_FORMAT)
            this_article_time = this_article_time.replace(tzinfo=None)
        except Exception:
            log(f"can't get time from {url}: {published_raw}")
            continue

        if _date_in_the_future(this_article_time):
            log("article from the future...")
            continue

        if (last_retrieval_time_from_DB
                and this_article_time < last_retrieval_time_from_DB):
            skipped += 1
            continue

        title = feed_item['title']
        summary = feed_item['summary']

        log(url)

        try:
            art = model.Article.find(url)
        except Exception as e:
            log(f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(type(e))}"
                )
            continue

        # Track the newest publication time seen so far in this crawl; this
        # also counts items that turn out to be in the DB already.
        if (not last_retrieval_time_seen_this_crawl) or (
                this_article_time > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = this_article_time

        if art:
            skipped_already_in_db += 1
            log("- already in db")
            continue

        try:
            art = newspaper.Article(url)
            art.download()
            art.parse()
            log("- succesfully parsed")

            cleaned_up_text = cleanup_non_content_bits(art.text)

            # sufficient_quality receives the shared dict so that it can
            # record why articles were rejected; it is logged at the end.
            quality_article = sufficient_quality(
                art, skipped_due_to_low_quality)
            if quality_article:
                # NOTE(review): the imported name is not used here; the
                # import is kept in case it is needed for its side effects.
                from zeeguu_core.language.difficulty_estimator_factory import DifficultyEstimatorFactory

                try:
                    # Create new article and save it to DB
                    new_article = zeeguu_core.model.Article(
                        Url.find_or_create(session, url), title,
                        ', '.join(art.authors), cleaned_up_text, summary,
                        this_article_time, feed, feed.language)
                    session.add(new_article)
                    session.commit()
                    downloaded += 1

                    add_topics(new_article, session)
                    log("- added topics")
                    add_searches(title, url, new_article, session)
                    log("- added keywords")

                    # Update the crawl position *before* committing, so it
                    # is persisted with this transaction rather than left
                    # pending (and lost on a later rollback or when this
                    # was the last item of the crawl).
                    if last_retrieval_time_seen_this_crawl:
                        feed.last_crawled_time = last_retrieval_time_seen_this_crawl
                    session.add(feed)
                    session.commit()

                except Exception as e:
                    log(f'Something went wrong when creating article and attaching words/topics: {e}'
                        )
                    log("rolling back the session... ")
                    session.rollback()

        except Exception as e:
            # Best effort: a single broken article must not stop the crawl.
            log(f"Failed to create zeeguu.Article from {url}\n{str(type(e))}")

    log(f'  Skipped due to time: {skipped} ')
    log(f'  Downloaded: {downloaded}')
    log(f'  Low Quality: {skipped_due_to_low_quality}')
    log(f'  Already in DB: {skipped_already_in_db}')
コード例 #5
0
def download_feed_item(session,
                       feed,
                       feed_item):
    """
    Download the article referenced by `feed_item` and save it to the DB.

    :param session: DB session used for add / commit / rollback
    :param feed: feed the item comes from; it and ``feed.language`` are
        passed on to the new article
    :param feed_item: mapping read through the keys 'url', 'title',
        'summary' and 'published_datetime'
    :return: the newly saved article, or None when downloading, parsing or
        saving failed and the session was rolled back
    :raises SkippedAlreadyInDB: if ``model.Article.find`` already knows
        the (redirect-resolved) url
    :raises SkippedForLowQuality: if ``sufficient_quality`` rejects the
        parsed article; the reason it reports is passed to the exception
    :raises Exception: if the url cannot be resolved or the lookup of an
        existing article fails; the original error is attached as cause
    """
    try:
        url = _url_after_redirects(feed_item['url'])
    except requests.exceptions.TooManyRedirects as e:
        raise Exception("- Too many redirects") from e
    except Exception as e:
        raise Exception(f"- Could not get url after redirects for {feed_item['url']}") from e

    log(url)

    title = feed_item['title']
    summary = feed_item['summary']
    published_datetime = feed_item['published_datetime']

    try:
        art = model.Article.find(url)
    except Exception as e:
        # The message keeps reporting the exception class, as before; the
        # full original error stays available through the chained cause.
        raise Exception(
            f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(type(e))}"
        ) from e

    if art:
        raise SkippedAlreadyInDB()

    try:

        art = newspaper.Article(url)
        art.download()
        art.parse()

        debug("- Succesfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)

        is_quality_article, reason = sufficient_quality(art)

        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        # Create new article and save it to DB
        new_article = zeeguu_core.model.Article(
            Url.find_or_create(session, url),
            title,
            ', '.join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language
        )
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality:
        # Let the caller see the low-quality signal untouched.
        raise

    except Exception as e:
        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}")
        session.rollback()
        # Nothing was persisted, so do not hand back the article object
        # that may already have been constructed above.
        return None

    return new_article
コード例 #6
0
def download_feed_item(session, feed, feed_item):
    """
    Download the article referenced by `feed_item` and save it to the DB.

    The summary stored with the article is the text extracted from the
    feed's summary, clipped to ``MAX_CHAR_COUNT_IN_SUMMARY`` characters;
    if fewer than 10 characters remain, the beginning of the cleaned
    article text is used instead.

    :param session: DB session used for add / commit / rollback
    :param feed: feed the item comes from; it and ``feed.language`` are
        passed on to the new article
    :param feed_item: mapping read through the keys 'url', 'title',
        'summary' and 'published_datetime'
    :return: the newly saved article, or None when downloading, parsing or
        saving failed and the session was rolled back
    :raises SkippedAlreadyInDB: if ``model.Article.find`` already knows
        the (redirect-resolved) url
    :raises SkippedForLowQuality: if ``sufficient_quality`` rejects the
        parsed article; the reason it reports is passed to the exception
    :raises Exception: if the url cannot be resolved or the lookup of an
        existing article fails; the original error is attached as cause
    """
    try:
        url = _url_after_redirects(feed_item['url'])
    except requests.exceptions.TooManyRedirects as e:
        raise Exception("- Too many redirects") from e
    except Exception as e:
        raise Exception(
            f"- Could not get url after redirects for {feed_item['url']}"
        ) from e

    log(url)

    title = feed_item['title']

    published_datetime = feed_item['published_datetime']

    try:
        art = model.Article.find(url)
    except Exception as e:
        # The message keeps reporting the exception class, as before; the
        # full original error stays available through the chained cause.
        raise Exception(
            f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(type(e))}"
        ) from e

    if art:
        raise SkippedAlreadyInDB()

    try:

        art = newspaper.Article(url)
        art.download()
        art.parse()

        debug("- Succesfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)

        cleaned_up_text = flatten_composed_unicode_characters(cleaned_up_text)

        is_quality_article, reason = sufficient_quality(art)

        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        # A None summary is treated like an empty one, so that the
        # fallback to the article text below applies to it as well.
        summary = feed_item['summary'] or ""
        # however, this is not so easy... there have been cases where
        # the summary is just malformed HTML... thus we try to extract
        # the text:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(summary, "lxml")
        summary = soup.get_text()
        # then there are cases where the summary is huge... so we clip it
        summary = summary[:MAX_CHAR_COUNT_IN_SUMMARY]
        # and if there is still no summary, we simply use the beginning of
        # the article
        if len(summary) < 10:
            summary = cleaned_up_text[:MAX_CHAR_COUNT_IN_SUMMARY]

        # Create new article and save it to DB
        new_article = zeeguu_core.model.Article(
            Url.find_or_create(session, url), title, ', '.join(art.authors),
            cleaned_up_text, summary, published_datetime, feed, feed.language)
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality:
        # Let the caller see the low-quality signal untouched.
        raise

    except Exception as e:
        from sentry_sdk import capture_exception
        capture_exception(e)

        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}"
            )
        session.rollback()
        # Nothing was persisted, so do not hand back the article object
        # that may already have been constructed above.
        return None

    return new_article