def _create_model_object(self):
    title = " ".join(self.faker.text().split()[:randint(1, 10)])
    description = " ".join(self.faker.text().split()[:randint(5, 20)])
    language = LanguageRule().random
    url = UrlRule().url
    image_url = UrlRule().url
    icon_name = self.faker.name()

    new_rss_feed = RSSFeed(url, title, description, image_url, icon_name, language)

    # Retry with fresh random data if an identical feed already exists.
    if RSSFeed.exists(new_rss_feed):
        return self._create_model_object()

    return new_rss_feed
def get_interesting_feeds_for_language_code(language_code):
    """
    Get a list of feeds for the given language code.

    :param language_code: the language code to look up feeds for
    :return: a JSON list of feed dictionaries
    """
    feed_data = []
    for feed in RSSFeed.find_for_language_id(language_code):
        feed_data.append(feed.as_dictionary())
    return json_result(feed_data)
def test_feed(url: str):
    feed = RSSFeed.from_url(url)

    feed_items = feed.feed_items()

    if not feed_items:
        print("Feed seems broken. No items found.")
    else:
        count = len(feed_items)
        print(f"Feed seems healthy: {count} items found.")

    return feed
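# Usage sketch (illustrative only; the URL below is a placeholder and not
# taken from the original code): quickly check whether a feed parses and
# exposes items, then inspect the parsed feed's metadata.
checked_feed = test_feed("https://example.org/rss.xml")
print(checked_feed.title)
print(checked_feed.description)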
def __init__(self):
    super().__init__()

    self.rss_feed = self._create_model_object()
    self.feed = self.rss_feed
    self.save(self.rss_feed)

    lang1 = Language.find_or_create('de')
    url = Url.find_or_create(self.db.session, url_spiegel_rss)

    self.feed1 = RSSFeed.find_or_create(self.db.session,
                                        url, "", "",
                                        icon_name_spiegel,
                                        language=lang1)
    self.save(self.feed1)
def start_following_feed_with_id():
    """
    Start following the feed with the given id.

    :param: feed_id -- the id of the feed to be followed
    :return: "OK" in case of success
    """
    if request.form.get("source_id", ""):
        feed_id = int(request.form.get("source_id", ""))
    else:
        feed_id = int(request.form.get("feed_id", ""))

    feed_object = RSSFeed.find_by_id(feed_id)
    RSSFeedRegistration.find_or_create(session, flask.g.user, feed_object)

    return "OK"
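# Hypothetical client-side call for the endpoint above. The host and route
# path are assumptions for illustration, and authentication/session handling
# is omitted because it is not shown in this snippet.
import requests

response = requests.post(
    "https://zeeguu.example.org/start_following_feed_with_id",
    data={"feed_id": 102},
)
print(response.text)  # expected: "OK"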
def _exists_in_db(obj):
    return RSSFeed.exists(obj)
feed_name = input(f"Feed name (Enter for: {test_feed.title}): ") or test_feed.title
print(f'= {feed_name}')

icon_name = input("Icon name to be found in resources folder (e.g. 20min.png): ")
print(f'= {icon_name}')

description = input(f'Description (Enter for: {test_feed.description}): ') or test_feed.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

feed_url = Url.find_or_create(zeeguu.core.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(zeeguu.core.db.session,
                                  feed_url, feed_name, description,
                                  icon_name=icon_name,
                                  language=language)

print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
def download_from_feed(feed: RSSFeed, session, limit=1000, save_in_elastic=True):
    """
    A DB session is needed because this function saves articles to the DB.

    Tracking last_crawled_time avoids wasting time re-fetching the same
    articles, especially the ones that can't be retrieved and thus are
    never cached.
    """
    downloaded = 0
    skipped_due_to_low_quality = 0
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"LAST CRAWLED::: {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items(last_retrieval_time_from_DB)
    except Exception as e:
        log(f"Failed to download feed ({e})")
        from sentry_sdk import capture_exception
        capture_exception(e)
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        feed_item_timestamp = feed_item["published_datetime"]

        if _date_in_the_future(feed_item_timestamp):
            log("Article from the future!")
            continue

        if (not last_retrieval_time_seen_this_crawl) or (
                feed_item_timestamp > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = feed_item_timestamp

        # Guard against feeds that have never been crawled before
        # (last_crawled_time is None).
        if (feed.last_crawled_time is None
                or last_retrieval_time_seen_this_crawl > feed.last_crawled_time):
            feed.last_crawled_time = last_retrieval_time_seen_this_crawl
            log(f"+updated feed's last crawled time to {last_retrieval_time_seen_this_crawl}")
            session.add(feed)
            session.commit()

        try:
            new_article = download_feed_item(session, feed, feed_item)
            downloaded += 1
        except SkippedForTooOld:
            log("- Article too old")
            continue
        except SkippedForLowQuality as e:
            log(f" - Low quality: {e.reason}")
            skipped_due_to_low_quality += 1
            continue
        except SkippedAlreadyInDB:
            skipped_already_in_db += 1
            log(" - Already in DB")
            continue
        except Exception as e:
            from sentry_sdk import capture_exception
            capture_exception(e)
            if hasattr(e, "message"):
                log(e.message)
            else:
                log(e)
            continue

        # Save the new article to Elasticsearch as well.
        # Everything should be stored both in SQL and Elasticsearch,
        # since Elasticsearch is not treated as persistent storage.
        try:
            if save_in_elastic:
                if new_article:
                    es = Elasticsearch(ES_CONN_STRING)
                    doc = document_from_article(new_article, session)
                    res = es.index(index=ES_ZINDEX, id=new_article.id, body=doc)
                    print("elastic res: " + res["result"])
        except Exception as e:
            from sentry_sdk import capture_exception
            capture_exception(e)
            log("***OOPS***: ElasticSearch seems down?")
            if hasattr(e, "message"):
                log(e.message)
            else:
                log(e)
            continue

    log(f"*** Downloaded: {downloaded} From: {feed.title}")
    log(f"*** Low Quality: {skipped_due_to_low_quality}")
    log(f"*** Already in DB: {skipped_already_in_db}")
    log(f"*** ")
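# Usage sketch (an assumption about the surrounding crawler script, not part
# of the original code): crawl every feed with a small per-feed limit,
# reusing the session object shown in the interactive tool above. The RSSFeed
# import path and the flask-sqlalchemy style .query accessor are assumptions.
import zeeguu.core
from zeeguu.core.model import RSSFeed

session = zeeguu.core.db.session
for feed in RSSFeed.query.all():
    download_from_feed(feed, session, limit=25, save_in_elastic=True)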