Example #1
    def _create_model_object(self):
        # Build an RSSFeed with a randomized title, description, language,
        # feed URL, image URL, and icon name.
        title = " ".join(self.faker.text().split()[: randint(1, 10)])
        description = " ".join(self.faker.text().split()[: randint(5, 20)])
        language = LanguageRule().random
        url = UrlRule().url
        image_url = UrlRule().url
        icon_name = self.faker.name()

        new_rss_feed = RSSFeed(url, title, description, image_url, icon_name, language)

        # Retry until we get a feed that is not already in the DB.
        if RSSFeed.exists(new_rss_feed):
            return self._create_model_object()

        return new_rss_feed
Example #2
def get_interesting_feeds_for_language_code(language_code):
    """
    Get a list of feeds for the given language

    :return:
    """
    feed_data = []
    for feed in RSSFeed.find_for_language_id(language_code):
        feed_data.append(feed.as_dictionary())
    return json_result(feed_data)
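For orientation, a hypothetical client-side call to this view; the host and route path are assumptions and not taken from the code above:

import requests

# Hypothetical URL: the actual route for this view is defined elsewhere in the API.
response = requests.get("https://api.example.org/interesting_feeds/de")
for feed in response.json():
    print(feed)
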
Example #3
def test_feed(url: str):
    feed = RSSFeed.from_url(url)

    feed_items = feed.feed_items()
    if not feed_items:
        print("Feed seems broken. No items found.")
    else:
        count = len(feed_items)
        print(f"Feed seems healthy: {count} items found. ")

    return feed
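A minimal way to run this check from the command line; the __main__ guard and the example URL are additions for illustration, not part of the original tool:

if __name__ == "__main__":
    import sys

    # e.g. python check_feed.py https://example.com/feed.rss
    test_feed(sys.argv[1])
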
Example #4
    def __init__(self):
        super().__init__()

        self.rss_feed = self._create_model_object()
        self.feed = self.rss_feed
        self.save(self.rss_feed)

        lang1 = Language.find_or_create('de')
        url = Url.find_or_create(self.db.session, url_spiegel_rss)

        self.feed1 = RSSFeed.find_or_create(self.db.session, url, "", "", icon_name_spiegel,
                                            language=lang1)
        self.save(self.feed1)
Example #5
def start_following_feed_with_id():
    """
    :param: feed_id -- the id of the feed to be followed.
    Start following the feed with the given id

    :return: "OK" in case of success
    """
    if request.form.get("source_id", ""):
        feed_id = int(request.form.get("source_id", ""))
    else:
        feed_id = int(request.form.get("feed_id", ""))

    feed_object = RSSFeed.find_by_id(feed_id)
    RSSFeedRegistration.find_or_create(session, flask.g.user, feed_object)

    return "OK"
Example #6
    @staticmethod
    def _exists_in_db(obj):
        return RSSFeed.exists(obj)
Example #7
feed_name = input(
    f"Feed name (Enter for: {test_feed.title}):  ") or test_feed.title
print(f'= {feed_name}')

icon_name = input(
    "Icon name to be found in resources folder (e.g. 20min.png):  ")
print(f'= {icon_name}')

description = input(f'Description (Enter for: {test_feed.description}): '
                    ) or test_feed.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

feed_url = Url.find_or_create(zeeguu.core.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(zeeguu.core.db.session,
                                  feed_url,
                                  feed_name,
                                  description,
                                  icon_name=icon_name,
                                  language=language)

print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
Example #8
def download_from_feed(feed: RSSFeed,
                       session,
                       limit=1000,
                       save_in_elastic=True):
    """

    Session is needed because this saves stuff to the DB.


    last_crawled_time is useful because otherwise there would be a lot of time
    wasted trying to retrieve the same articles, especially the ones which
    can't be retrieved, so they won't be cached.


    """

    downloaded = 0
    skipped_due_to_low_quality = 0
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"LAST CRAWLED::: {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items(last_retrieval_time_from_DB)
    except Exception as e:
        log(f"Failed to download feed ({e})")
        from sentry_sdk import capture_exception

        capture_exception(e)
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        feed_item_timestamp = feed_item["published_datetime"]

        if _date_in_the_future(feed_item_timestamp):
            log("Article from the future!")
            continue

        if (not last_retrieval_time_seen_this_crawl) or (
                feed_item_timestamp > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = feed_item_timestamp

        if (feed.last_crawled_time is None
                or last_retrieval_time_seen_this_crawl > feed.last_crawled_time):
            feed.last_crawled_time = last_retrieval_time_seen_this_crawl
            log(f"+updated feed's last crawled time to {last_retrieval_time_seen_this_crawl}")

        session.add(feed)
        session.commit()

        try:
            new_article = download_feed_item(session, feed, feed_item)
            downloaded += 1
        except SkippedForTooOld:
            log("- Article too old")
            continue
        except SkippedForLowQuality as e:
            log(f" - Low quality: {e.reason}")
            skipped_due_to_low_quality += 1
            continue
        except SkippedAlreadyInDB:
            skipped_already_in_db += 1
            log(" - Already in DB")
            continue

        except Exception as e:
            from sentry_sdk import capture_exception

            capture_exception(e)

            if hasattr(e, "message"):
                log(e.message)
            else:
                log(e)
            continue

        # Save the new article to Elasticsearch as well.
        # Everything should be stored both in SQL and in Elasticsearch,
        # since Elasticsearch is not treated as persistent storage.
        try:
            if save_in_elastic:
                if new_article:
                    es = Elasticsearch(ES_CONN_STRING)
                    doc = document_from_article(new_article, session)
                    res = es.index(index=ES_ZINDEX,
                                   id=new_article.id,
                                   body=doc)
                    print("elastic res: " + res["result"])
        except Exception as e:
            from sentry_sdk import capture_exception

            capture_exception(e)

            log("***OOPS***: ElasticSearch seems down?")
            if hasattr(e, "message"):
                log(e.message)
            else:
                log(e)
            continue

    log(f"*** Downloaded: {downloaded} From: {feed.title}")
    log(f"*** Low Quality: {skipped_due_to_low_quality}")
    log(f"*** Already in DB: {skipped_already_in_db}")
    log(f"*** ")