def test_le_monde_subscription(self):
    art = newspaper.Article(url_vols_americans)
    art.download()
    art.parse()
    assert not sufficient_quality(art, {})

def test_new_scientist_overlay(self):
    art = newspaper.Article(url_fish_will_be_gone)
    art.download()
    art.parse()
    assert not sufficient_quality(art, {})

def test_sufficient_quality(self):
    art = newspaper.Article(url_investing_in_index_funds)
    art.download()
    art.parse()
    assert sufficient_quality(art, {})
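# The tests above exercise sufficient_quality(art, reasons_dict) without showing
# its implementation. Below is a minimal sketch of what such a check might look
# like, assuming it filters on text length and common paywall markers; the
# threshold and marker strings are illustrative assumptions, not the project's
# actual values, hence the _sketch suffix.

def sufficient_quality_sketch(art, reasons):
    """Return False for articles that are too short or look like paywall stubs."""
    word_count = len(art.text.split())
    if word_count < 90:
        # tally the rejection reason, as the reasons dict in the tests suggests
        reasons["too_short"] = reasons.get("too_short", 0) + 1
        return False
    for marker in ("Subscribe to read", "Abonnez-vous", "create a free account"):
        if marker in art.text:
            reasons["paywall"] = reasons.get("paywall", 0) + 1
            return False
    return True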
def download_from_feed(feed: RSSFeed, session, limit=1000):
    """
    Downloads up to `limit` articles from the given feed.

    The session is needed because this function saves articles to the DB.
    Tracking feed.last_crawled_time avoids wasting time re-retrieving the
    same articles on every crawl, especially the ones which can't be
    retrieved and therefore never end up cached.
    """
    log(feed.title)
    downloaded = 0
    skipped = 0
    skipped_due_to_low_quality = dict()
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"last retrieval time from DB = {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items()
    except Exception:
        log("Failed to connect to feed")
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        try:
            url = _url_after_redirects(feed_item['url'])
        except requests.exceptions.TooManyRedirects:
            # `url` is not bound yet in this branch, so log the original URL
            log(f"Too many redirects for: {feed_item['url']}")
            continue

        try:
            this_article_time = datetime.strptime(
                feed_item['published'], SIMPLE_TIME_FORMAT)
            this_article_time = this_article_time.replace(tzinfo=None)
        except Exception:
            log(f"can't get time from {url}: {feed_item['published']}")
            continue

        if _date_in_the_future(this_article_time):
            log("article from the future...")
            continue

        if last_retrieval_time_from_DB and this_article_time < last_retrieval_time_from_DB:
            skipped += 1
            continue

        title = feed_item['title']
        summary = feed_item['summary']

        log(url)

        try:
            art = model.Article.find(url)
        except Exception as e:
            log(f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(e)}")
            continue

        if (not last_retrieval_time_seen_this_crawl) or (
                this_article_time > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = this_article_time

        if art:
            skipped_already_in_db += 1
            log("- already in db")
        else:
            try:
                art = newspaper.Article(url)
                art.download()
                art.parse()
                log("- successfully parsed")

                cleaned_up_text = cleanup_non_content_bits(art.text)

                quality_article = sufficient_quality(
                    art, skipped_due_to_low_quality)

                if quality_article:
                    try:
                        # Create the new article and save it to the DB
                        new_article = zeeguu_core.model.Article(
                            Url.find_or_create(session, url),
                            title,
                            ', '.join(art.authors),
                            cleaned_up_text,
                            summary,
                            this_article_time,
                            feed,
                            feed.language)
                        session.add(new_article)
                        session.commit()
                        downloaded += 1

                        add_topics(new_article, session)
                        log("- added topics")

                        add_searches(title, url, new_article, session)
                        log("- added keywords")

                        session.commit()

                        if last_retrieval_time_seen_this_crawl:
                            feed.last_crawled_time = last_retrieval_time_seen_this_crawl
                            session.add(feed)

                    except Exception as e:
                        log(f'Something went wrong when creating article and attaching words/topics: {e}')
                        log("rolling back the session... ")
                        session.rollback()

            except Exception as e:
                log(f"Failed to create zeeguu.Article from {url}\n{str(e)}")

    log(f' Skipped due to time: {skipped} ')
    log(f' Downloaded: {downloaded}')
    log(f' Low Quality: {skipped_due_to_low_quality}')
    log(f' Already in DB: {skipped_already_in_db}')
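# download_from_feed relies on two small helpers that are not shown in this
# section: _url_after_redirects and _date_in_the_future. Plausible sketches
# follow, assuming the former resolves redirects with a HEAD request via
# requests and the latter compares against naive UTC time (both are
# assumptions about the repo's code, hence the _sketch suffix).

import requests
from datetime import datetime

def _url_after_redirects_sketch(url):
    """Follow any redirect chain and return the final URL."""
    response = requests.head(url, allow_redirects=True)
    return response.url

def _date_in_the_future_sketch(time):
    """True if the (naive) article timestamp lies after the current UTC time."""
    return time > datetime.utcnow()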
def download_feed_item(session, feed, feed_item):
    new_article = None

    try:
        url = _url_after_redirects(feed_item['url'])
        log(url)
    except requests.exceptions.TooManyRedirects:
        raise Exception("- Too many redirects")
    except Exception:
        raise Exception(f"- Could not get url after redirects for {feed_item['url']}")

    title = feed_item['title']
    summary = feed_item['summary']
    published_datetime = feed_item['published_datetime']

    try:
        art = model.Article.find(url)
    except Exception as e:
        raise Exception(f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(e)}")

    if art:
        raise SkippedAlreadyInDB()

    try:
        art = newspaper.Article(url)
        art.download()
        art.parse()
        debug("- Successfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)

        is_quality_article, reason = sufficient_quality(art)
        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        # Create the new article and save it to the DB
        new_article = zeeguu_core.model.Article(
            Url.find_or_create(session, url),
            title,
            ', '.join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language)
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality as e:
        raise e
    except Exception as e:
        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}")
        session.rollback()

    return new_article
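# download_feed_item signals its skip conditions with dedicated exception types
# defined elsewhere in the crawler. Minimal definitions consistent with how they
# are raised and caught above; the real classes may carry more state, so treat
# this as a sketch.

class SkippedAlreadyInDB(Exception):
    """Raised when the article URL is already present in the database."""
    pass

class SkippedForLowQuality(Exception):
    """Raised when sufficient_quality() rejects the parsed article."""
    def __init__(self, reason):
        super().__init__(reason)
        self.reason = reason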
def download_feed_item(session, feed, feed_item):
    new_article = None

    try:
        url = _url_after_redirects(feed_item['url'])
        log(url)
    except requests.exceptions.TooManyRedirects:
        raise Exception("- Too many redirects")
    except Exception:
        raise Exception(f"- Could not get url after redirects for {feed_item['url']}")

    title = feed_item['title']
    published_datetime = feed_item['published_datetime']

    try:
        art = model.Article.find(url)
    except Exception as e:
        raise Exception(f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(e)}")

    if art:
        raise SkippedAlreadyInDB()

    try:
        art = newspaper.Article(url)
        art.download()
        art.parse()
        debug("- Successfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)
        cleaned_up_text = flatten_composed_unicode_characters(cleaned_up_text)

        is_quality_article, reason = sufficient_quality(art)
        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        summary = feed_item['summary']

        # However, this is not so easy... there have been cases where the
        # summary is just malformed HTML... thus we try to extract the text:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(summary, "lxml")
        summary = soup.get_text()

        # Then there are cases where the summary is huge... so we clip it
        summary = summary[:MAX_CHAR_COUNT_IN_SUMMARY]

        # And if there is still no summary, we simply use the beginning of
        # the article
        if len(summary) < 10:
            summary = cleaned_up_text[:MAX_CHAR_COUNT_IN_SUMMARY]

        # Create the new article and save it to the DB
        new_article = zeeguu_core.model.Article(
            Url.find_or_create(session, url),
            title,
            ', '.join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language)
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality as e:
        raise e
    except Exception as e:
        from sentry_sdk import capture_exception
        capture_exception(e)
        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}")
        session.rollback()

    return new_article
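# The second version of download_feed_item additionally normalizes the article
# text with flatten_composed_unicode_characters. A one-line sketch, assuming it
# performs Unicode NFC normalization so that combining sequences (e.g. 'e'
# followed by U+0301) collapse into single composed code points; this is an
# assumption about the helper's intent, not the repo's actual code.

import unicodedata

def flatten_composed_unicode_characters_sketch(text):
    """Normalize combining character sequences into composed (NFC) form."""
    return unicodedata.normalize("NFC", text)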