def distill_article_interactions(session, user, data):
    """
    Extracts interaction info from user activity data and dispatches
    it to the appropriate handler.

    :param session: SQLAlchemy session used for persisting changes
    :param user: the User who generated the activity
    :param data: dict with at least 'event', 'value', and 'article_id' keys
    """

    event = data["event"]
    value = data["value"]
    article_id = int(data["article_id"])

    log(f"event is: {event}")

    if "UMR - OPEN ARTICLE" in event:
        article_opened(session, article_id, user)
    elif "UMR - LIKE ARTICLE" in event:
        article_liked(session, article_id, user, True)
    elif "UMR - UNLIKE ARTICLE" in event:
        article_liked(session, article_id, user, False)
    elif "UMR - USER FEEDBACK" in event:
        article_feedback(session, article_id, user, value)
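
# A minimal sketch of the payload this dispatcher expects, assuming the
# UMR front-end sends events of this shape (the values below are
# illustrative, not taken from real traffic):
#
#   data = {
#       "event": "UMR - LIKE ARTICLE",
#       "value": "",            # only used by the USER FEEDBACK event
#       "article_id": "42",     # arrives as a string; cast to int above
#   }
#   distill_article_interactions(db.session, user, data)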

def article_liked(session, article_id, user, like_value):
    from zeeguu.core.emailer.user_activity import send_notification_article_feedback

    article = Article.query.filter_by(id=article_id).one()

    ua = UserArticle.find(user, article)
    ua.liked = like_value
    session.add(ua)
    session.commit()
    log(f"{ua}")

    # Report the actual action taken, rather than always "Liked".
    feedback = "Liked" if like_value else "Unliked"
    send_notification_article_feedback(
        feedback, user, article.title, article.url.as_string(), article.id
    )

def article_search_for_user(user, count, search_terms):
    try:
        return elastic_article_search_for_user(user, count, search_terms)
    except elasticsearch.exceptions.ConnectionError:
        # Elasticsearch is unreachable; log the traceback and fall back
        # to the SQL-based mixed search.
        log(ES_DOWN_MESSAGE)
        log(traceback.format_exc())
        return mixed_article_search_for_user(user, count, search_terms)

def article_opened(session, article_id, user):
    article = Article.query.filter_by(id=article_id).one()

    ua = UserArticle.find(user, article)
    if not ua:
        ua = UserArticle.find_or_create(session, user, article, opened=datetime.now())
    # Refresh the timestamp on repeat opens of the same article.
    ua.opened = datetime.now()
    session.add(ua)
    session.commit()
    log(f"{ua}")

def retrieve_articles_from_all_feeds():
    counter = 0
    all_feeds = RSSFeed.query.all()
    all_feeds_count = len(all_feeds)

    for feed in all_feeds:
        counter += 1
        try:
            msg = f"*** >>>>>>>>> {feed.title} ({counter}/{all_feeds_count}) <<<<<<<<<< "
            log("")
            log(f"{msg}")
            download_from_feed(feed, zeeguu.core.db.session)
        except Exception:
            # Keep crawling the remaining feeds even if one of them fails.
            traceback.print_exc()
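
# retrieve_articles_from_all_feeds is the crawler's entry point and is
# typically run periodically. A minimal, assumed invocation (the actual
# scheduling wiring lives outside this module):
#
#   if __name__ == "__main__":
#       retrieve_articles_from_all_feeds()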

def download_from_feed(feed: RSSFeed, session, limit=1000, save_in_elastic=True):
    """
    Session is needed because this saves stuff to the DB.

    last_crawled_time is useful because otherwise there would be a lot
    of time wasted trying to retrieve the same articles, especially the
    ones which can't be retrieved, so they won't be cached.
    """
    downloaded = 0
    skipped_due_to_low_quality = 0
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"LAST CRAWLED::: {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items(last_retrieval_time_from_DB)
    except Exception as e:
        log(f"Failed to download feed ({e})")
        from sentry_sdk import capture_exception

        capture_exception(e)
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        feed_item_timestamp = feed_item["published_datetime"]

        if _date_in_the_future(feed_item_timestamp):
            log("Article from the future!")
            continue

        if (not last_retrieval_time_seen_this_crawl) or (
            feed_item_timestamp > last_retrieval_time_seen_this_crawl
        ):
            last_retrieval_time_seen_this_crawl = feed_item_timestamp

        # feed.last_crawled_time can be None on the very first crawl;
        # guard against comparing a datetime with None.
        if (
            feed.last_crawled_time is None
            or last_retrieval_time_seen_this_crawl > feed.last_crawled_time
        ):
            feed.last_crawled_time = last_retrieval_time_seen_this_crawl
            log(
                f"+updated feed's last crawled time to {last_retrieval_time_seen_this_crawl}"
            )

        session.add(feed)
        session.commit()

        try:
            new_article = download_feed_item(session, feed, feed_item)
            downloaded += 1
        except SkippedForTooOld:
            log("- Article too old")
            continue
        except SkippedForLowQuality as e:
            log(f" - Low quality: {e.reason}")
            skipped_due_to_low_quality += 1
            continue
        except SkippedAlreadyInDB:
            skipped_already_in_db += 1
            log(" - Already in DB")
            continue
        except Exception as e:
            from sentry_sdk import capture_exception

            capture_exception(e)
            if hasattr(e, "message"):
                log(e.message)
            else:
                log(e)
            continue

        # Save the new article to Elasticsearch as well.
        # Everything should be stored both in SQL and Elasticsearch,
        # since Elasticsearch is not treated as persistent storage.
        try:
            if save_in_elastic:
                if new_article:
                    es = Elasticsearch(ES_CONN_STRING)
                    doc = document_from_article(new_article, session)
                    res = es.index(index=ES_ZINDEX, id=new_article.id, body=doc)
                    print("elastic res: " + res["result"])
        except Exception as e:
            from sentry_sdk import capture_exception

            capture_exception(e)
            log("***OOPS***: ElasticSearch seems down?")
            if hasattr(e, "message"):
                log(e.message)
            else:
                log(e)
            continue

    log(f"*** Downloaded: {downloaded} From: {feed.title}")
    log(f"*** Low Quality: {skipped_due_to_low_quality}")
    log(f"*** Already in DB: {skipped_already_in_db}")
    log("*** ")
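
# `_date_in_the_future` is referenced above but not shown in this section.
# A minimal sketch of what it presumably does, assuming feed timestamps are
# naive datetimes (illustrative, not necessarily the actual Zeeguu
# implementation):
#
#   from datetime import datetime
#
#   def _date_in_the_future(time: datetime) -> bool:
#       # Feeds occasionally publish items timestamped ahead of the
#       # current time; such items are skipped by the crawl loop above.
#       return time > datetime.now()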

def download_feed_item(session, feed, feed_item):
    new_article = None

    try:
        url = _url_after_redirects(feed_item["url"])
        log(url)
    except requests.exceptions.TooManyRedirects:
        raise Exception("- Too many redirects")
    except Exception:
        raise Exception(
            f"- Could not get url after redirects for {feed_item['url']}"
        )

    title = feed_item["title"]
    published_datetime = feed_item["published_datetime"]

    try:
        art = model.Article.find(url)
    except Exception:
        import sys

        ex = sys.exc_info()[0]
        raise Exception(
            f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(ex)}"
        )

    if art:
        raise SkippedAlreadyInDB()

    try:
        art = newspaper.Article(url)
        art.download()
        art.parse()

        debug("- Successfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)
        cleaned_up_text = flatten_composed_unicode_characters(cleaned_up_text)

        is_quality_article, reason = sufficient_quality(art)
        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        summary = feed_item["summary"]
        # However, this is not so easy... there have been cases where the
        # summary is just malformed HTML... thus we try to extract the text:
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(summary, "lxml")
        summary = soup.get_text()
        # Then there are cases where the summary is huge... so we clip it:
        summary = summary[:MAX_CHAR_COUNT_IN_SUMMARY]
        # And if there is still no summary, we simply use the beginning of
        # the article:
        if len(summary) < 10:
            summary = cleaned_up_text[:MAX_CHAR_COUNT_IN_SUMMARY]

        # Create the new article and save it to the DB
        new_article = zeeguu.core.model.Article(
            Url.find_or_create(session, url),
            title,
            ", ".join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language,
        )
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality as e:
        raise e

    except Exception as e:
        from sentry_sdk import capture_exception

        capture_exception(e)
        log(
            f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}"
        )
        session.rollback()

    return new_article
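
# The Skipped* exceptions used by the two functions above are defined
# elsewhere in the crawler package. A minimal sketch, assuming they are
# plain Exception subclasses and that SkippedForLowQuality carries a
# human-readable reason (illustrative; the actual Zeeguu definitions
# may differ):
#
#   class SkippedForTooOld(Exception):
#       # Raised when a feed item falls outside the crawl window.
#       pass
#
#   class SkippedForLowQuality(Exception):
#       # Raised when sufficient_quality() rejects a parsed article;
#       # `reason` is what download_from_feed logs.
#       def __init__(self, reason):
#           super().__init__(reason)
#           self.reason = reason
#
#   class SkippedAlreadyInDB(Exception):
#       # Raised when the article URL is already stored in the DB.
#       pass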