def find_url_in_extra_data(self):
    """
    The DB structure is a mess! There is no convention for where the url
    associated with an event is stored, so we need to look for it in
    different places.

    NOTE: This could be solved by creating a new `url` column and writing
    the url only there.

    returns: url if found or None otherwise
    """
    if self.extra_data and self.extra_data != "{}" and self.extra_data != "null":
        try:
            extra_event_data = json.loads(self.extra_data)

            if "articleURL" in extra_event_data:
                url = extra_event_data["articleURL"]
            elif "url" in extra_event_data:
                url = extra_event_data["url"]
            else:
                # There is no url
                return None
            return Url.extract_canonical_url(url)

        except Exception:
            # Some json strings are truncated, and sometimes extra_data is an int,
            # so it cannot be parsed correctly and throws an exception
            return None
    else:
        # The extra_data field is empty
        return None
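# A minimal, standalone sketch of the same fallback lookup, useful for seeing how
# the three shapes of extra_data (an "articleURL" key, a "url" key, or neither)
# are handled. The helper name and the sample payloads below are illustrative only
# and are not part of zeeguu_core.
import json


def url_from_extra_data(extra_data: str):
    if not extra_data or extra_data in ("{}", "null"):
        return None
    try:
        data = json.loads(extra_data)
        return data.get("articleURL") or data.get("url")
    except Exception:
        # truncated json, or a bare int such as "5"
        return None


print(url_from_extra_data('{"articleURL": "https://example.com/a"}'))  # https://example.com/a
print(url_from_extra_data('{"url": "https://example.com/b"}'))         # https://example.com/b
print(url_from_extra_data('5'))                                        # None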
@classmethod
def find_or_create(cls, session, user: User, _url, _title: str, _language):
    """
    Create a new object and add it to the db if it's not already there;
    otherwise retrieve the existing object.

    In case of creation, the created object is incomplete.
    """
    language = Language.find(_language)
    url = Url.find_or_create(session, _url, _title)

    try:
        return cls.query.filter_by(user=user, url=url).one()
    except NoResultFound:
        try:
            new = cls(user, url, _title, language)
            session.add(new)
            session.commit()
            return new
        except Exception as e:
            from sentry_sdk import capture_exception
            capture_exception(e)

            # a concurrent request inserted the same row in the meantime:
            # roll back and return the row that won the race
            print("seems we avoided a race condition")
            session.rollback()
            return cls.query.filter_by(user=user, url=url).one()
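# A self-contained sketch of the same find-or-create pattern with race-condition
# recovery, using plain SQLAlchemy and an in-memory SQLite database. Everything
# here (the Bookmark model, the unique constraint) is made up for illustration
# and is not part of zeeguu_core.
from sqlalchemy import Column, Integer, String, UniqueConstraint, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker
from sqlalchemy.orm.exc import NoResultFound

Base = declarative_base()


class Bookmark(Base):
    __tablename__ = "bookmark"
    id = Column(Integer, primary_key=True)
    user = Column(String)
    url = Column(String)
    __table_args__ = (UniqueConstraint("user", "url"),)

    @classmethod
    def find_or_create(cls, session, user, url):
        try:
            return session.query(cls).filter_by(user=user, url=url).one()
        except NoResultFound:
            try:
                new = cls(user=user, url=url)
                session.add(new)
                session.commit()
                return new
            except Exception:
                # a concurrent insert beat us to it: roll back and re-query
                session.rollback()
                return session.query(cls).filter_by(user=user, url=url).one()


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

a = Bookmark.find_or_create(session, "some_user", "https://example.com")
b = Bookmark.find_or_create(session, "some_user", "https://example.com")
assert a.id == b.id  # the second call returns the existing row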
def test_find_or_create_works(self):
    _url = self.url_rule.url.as_string()
    _title = self.url_rule.url.title

    url = Url.find_or_create(session, _url, _title)
    self.assertEqual(url.title, _title)
@classmethod
def delete(cls, session, user, _url):
    try:
        url = Url.find(_url)
        item = cls.query.filter_by(user=user, url=url).one()
        session.delete(item)
        session.commit()
    except Exception as e:
        from sentry_sdk import capture_exception
        capture_exception(e)
def _localized_topic_keyword_in_url(self, topic: str, localized: str, keyword: str, url: str):
    topic = Topic(topic)
    localized_topic = LocalizedTopic(topic, self.user.learned_language, localized)
    localized_topic.keywords = keyword

    article = ArticleRule().article
    url = Url.find_or_create(self.db.session, url)
    article.url = url

    assert localized_topic.matches_article(article)
@classmethod
def delete(cls, session, user, _url):
    try:
        url = Url.find(_url)
        item = cls.query.filter_by(user=user, url=url).one()
        session.delete(item)
        session.commit()
    except Exception as e:
        print(e)
def test_domain_plus_path_must_be_unique(self):
    _url = self.url_rule.url.as_string()
    _title = self.url_rule.url.title
    _domain = DomainName.get_domain(_url)

    with self.assertRaises(Exception) as context:
        domain = DomainName.find(_domain)
        url = Url(_url, _title, domain)
        session.add(url)
        session.commit()

    self.assertTrue(
        'Duplicate entry' in str(context.exception)
        or 'IntegrityError' in str(context.exception)
    )
def __init__(self):
    super().__init__()
    self.rss_feed = self._create_model_object()
    self.feed = self.rss_feed
    self.save(self.rss_feed)

    lang1 = Language.find_or_create('de')
    url = Url.find_or_create(self.db.session, url_spiegel_rss)
    self.feed1 = RSSFeed.find_or_create(
        self.db.session, url, "", "", icon_name_spiegel, language=lang1
    )
    self.save(self.feed1)
def upload_articles(cohort_id): """ uploads articles for a cohort with input from a POST request """ if not has_permission_for_cohort(cohort_id): flask.abort(401) try: for article_data in json.loads(request.data): url = Url("userarticle/{}".format(uuid.uuid4().hex)) title = article_data["title"] authors = article_data["authors"] content = article_data["content"] summary = article_data["summary"] published_time = datetime.now() language_code = article_data["language_code"] language = Language.find(language_code) new_article = Article( url, title, authors, content, summary, published_time, None, # rss feed language, ) db.session.add(new_article) db.session.flush() db.session.refresh(new_article) cohort = Cohort.find(cohort_id) new_cohort_article_map = CohortArticleMap(cohort, new_article) db.session.add(new_cohort_article_map) db.session.commit() return "OK" except ValueError: flask.abort(400) return "ValueError"
def test_try_to_get_race_condition(self):
    _url = self.url_rule.url.as_string()
    _title = self.url_rule.url.title

    def threaded_create_url():
        url = Url.find_or_create(session, _url, _title)

    threads = []

    for i in range(0):
        # multithreaded connections freeze on mysqldb,
        # so this is here to be tested manually and killed for now...
        t = Thread(target=threaded_create_url, args=())
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    url = Url.find_or_create(session, _url, _title)
    self.assertEqual(url.title, _title)
def upload_articles(cohort_id):
    '''
    uploads articles for a cohort with input from a POST request
    '''
    if not has_permission_for_cohort(cohort_id):
        flask.abort(401)

    try:
        for article_data in json.loads(request.data):
            url = Url('userarticle/{}'.format(uuid.uuid4().hex))
            title = article_data['title']
            authors = article_data['authors']
            content = article_data['content']
            summary = article_data['summary']
            published_time = datetime.now()
            language_code = article_data['language_code']
            language = Language.find(language_code)

            new_article = Article(
                url,
                title,
                authors,
                content,
                summary,
                published_time,
                None,  # rss feed
                language)

            db.session.add(new_article)
            db.session.flush()
            db.session.refresh(new_article)

            cohort = Cohort.find(cohort_id)
            new_cohort_article_map = CohortArticleMap(cohort, new_article)
            db.session.add(new_cohort_article_map)

        db.session.commit()
        return 'OK'
    except ValueError:
        flask.abort(400)
        return 'ValueError'
def download_from_feed(feed: RSSFeed, session, limit=1000):
    """
    Session is needed because this saves stuff to the DB.

    last_crawled_time is useful because otherwise a lot of time would be
    wasted trying to retrieve the same articles again and again, especially
    the ones that can't be retrieved and are thus never cached.
    """
    log(feed.title)
    downloaded = 0
    skipped = 0
    skipped_due_to_low_quality = dict()
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"last retrieval time from DB = {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items()
    except Exception:
        log("Failed to connect to feed")
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        try:
            url = _url_after_redirects(feed_item['url'])
        except requests.exceptions.TooManyRedirects:
            log(f"Too many redirects for: {feed_item['url']}")
            continue

        try:
            this_article_time = datetime.strptime(feed_item['published'], SIMPLE_TIME_FORMAT)
            this_article_time = this_article_time.replace(tzinfo=None)
        except Exception:
            log(f"can't get time from {url}: {feed_item['published']}")
            continue

        if _date_in_the_future(this_article_time):
            log("article from the future...")
            continue

        if last_retrieval_time_from_DB:
            if this_article_time < last_retrieval_time_from_DB:
                skipped += 1
                continue

        title = feed_item['title']
        summary = feed_item['summary']

        log(url)

        try:
            art = model.Article.find(url)
        except Exception:
            import sys
            ex = sys.exc_info()[0]
            log(f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(ex)}")
            continue

        if (not last_retrieval_time_seen_this_crawl) or (
                this_article_time > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = this_article_time

        if art:
            skipped_already_in_db += 1
            log("- already in db")
        else:
            try:
                art = newspaper.Article(url)
                art.download()
                art.parse()
                log("- successfully parsed")

                cleaned_up_text = cleanup_non_content_bits(art.text)

                quality_article = sufficient_quality(art, skipped_due_to_low_quality)

                if quality_article:
                    from zeeguu_core.language.difficulty_estimator_factory import DifficultyEstimatorFactory

                    try:
                        # Create the new article and save it to the DB
                        new_article = zeeguu_core.model.Article(
                            Url.find_or_create(session, url),
                            title,
                            ', '.join(art.authors),
                            cleaned_up_text,
                            summary,
                            this_article_time,
                            feed,
                            feed.language)
                        session.add(new_article)
                        session.commit()
                        downloaded += 1

                        add_topics(new_article, session)
                        log("- added topics")

                        add_searches(title, url, new_article, session)
                        log("- added keywords")

                        session.commit()

                        if last_retrieval_time_seen_this_crawl:
                            feed.last_crawled_time = last_retrieval_time_seen_this_crawl
                        session.add(feed)

                    except Exception as e:
                        log(f'Something went wrong when creating article and attaching words/topics: {e}')
                        log("rolling back the session... ")
                        session.rollback()

            except Exception as e:
                # raise e
                import sys
                ex = sys.exc_info()[0]
                log(f"Failed to create zeeguu.Article from {url}\n{str(ex)}")

    log(f' Skipped due to time: {skipped} ')
    log(f' Downloaded: {downloaded}')
    log(f' Low Quality: {skipped_due_to_low_quality}')
    log(f' Already in DB: {skipped_already_in_db}')
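# A hedged sketch of how download_from_feed could be driven from a small crawl
# script, assuming a configured zeeguu_core installation. The module that exports
# download_from_feed is an assumption (adjust the commented import), and the
# limit value is arbitrary.
import zeeguu_core
from zeeguu_core.model import RSSFeed

# from the_crawler_module import download_from_feed  # adjust to the real location

session = zeeguu_core.db.session

for feed in RSSFeed.query.all():
    download_from_feed(feed, session, limit=5)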
def download_feed_item(session, feed, feed_item):
    new_article = None

    try:
        url = _url_after_redirects(feed_item['url'])
        log(url)
    except requests.exceptions.TooManyRedirects:
        raise Exception("- Too many redirects")
    except Exception:
        raise Exception(f"- Could not get url after redirects for {feed_item['url']}")

    title = feed_item['title']
    summary = feed_item['summary']
    published_datetime = feed_item['published_datetime']

    try:
        art = model.Article.find(url)
    except Exception:
        import sys
        ex = sys.exc_info()[0]
        raise Exception(f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(ex)}")

    if art:
        raise SkippedAlreadyInDB()

    try:
        art = newspaper.Article(url)
        art.download()
        art.parse()
        debug("- Successfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)

        is_quality_article, reason = sufficient_quality(art)
        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        # Create the new article and save it to the DB
        new_article = zeeguu_core.model.Article(
            Url.find_or_create(session, url),
            title,
            ', '.join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language
        )
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality as e:
        raise e

    except Exception as e:
        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}")
        session.rollback()

    return new_article
test_feed = test_feed(_feed_url)

feed_name = input(f"Feed name (Enter for: {test_feed.title}): ") or test_feed.title
print(f'= {feed_name}')

icon_name = input("Icon name to be found in resources folder (e.g. 20min.png): ")
print(f'= {icon_name}')

description = input(f'Description (Enter for: {test_feed.description}): ') or test_feed.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

feed_url = Url.find_or_create(zeeguu_core.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(zeeguu_core.db.session,
                                  feed_url,
                                  feed_name,
                                  description,
                                  icon_name=icon_name,
                                  language=language)

print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
def threaded_create_url():
    url = Url.find_or_create(session, _url, _title)
#!/usr/bin/env python

from zeeguu_core.model import RSSFeed, Url, Language
import zeeguu_core

RESOURCES_FOLDER = "https://zeeguu.unibe.ch/api/resources/"

name = input("Name of feed to update: ")

session = zeeguu_core.db.session

all_feeds = RSSFeed.query.all()
for feed in all_feeds:
    if feed.title == name:
        print("Updating ... " + name)

        feed.title = input(f'Title ({feed.title}): ') or feed.title
        print(f'new title is: {feed.title}')

        _image_url = input('Icon file: ')
        feed.image_url = Url.find_or_create(session, RESOURCES_FOLDER + _image_url)
        print('new image url: ' + feed.image_url.as_string())

        session.add(feed)
        session.commit()
feed_name = input(f"Feed name (Enter for: {test_feed.title}): ") or test_feed.title
print(f'= {feed_name}')

icon_name = input("Icon name to be found in resources folder (e.g. 20min.png): ")
print(f'= {icon_name}')

description = input(f'Description (Enter for: {test_feed.description}): ') or test_feed.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

feed_url = Url.find_or_create(zeeguu_core.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(zeeguu_core.db.session,
                                  feed_url,
                                  feed_name,
                                  description,
                                  icon_name=icon_name,
                                  language=language)

print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
def download_feed_item(session, feed, feed_item):
    new_article = None

    try:
        url = _url_after_redirects(feed_item['url'])
        log(url)
    except requests.exceptions.TooManyRedirects:
        raise Exception("- Too many redirects")
    except Exception:
        raise Exception(f"- Could not get url after redirects for {feed_item['url']}")

    title = feed_item['title']
    published_datetime = feed_item['published_datetime']

    try:
        art = model.Article.find(url)
    except Exception:
        import sys
        ex = sys.exc_info()[0]
        raise Exception(f" {LOG_CONTEXT}: For some reason excepted during Article.find \n{str(ex)}")

    if art:
        raise SkippedAlreadyInDB()

    try:
        art = newspaper.Article(url)
        art.download()
        art.parse()
        debug("- Successfully parsed")

        cleaned_up_text = cleanup_non_content_bits(art.text)
        cleaned_up_text = flatten_composed_unicode_characters(cleaned_up_text)

        is_quality_article, reason = sufficient_quality(art)
        if not is_quality_article:
            raise SkippedForLowQuality(reason)

        summary = feed_item['summary']
        # however, this is not so easy... there have been cases where
        # the summary is just malformed HTML... thus we try to extract
        # the text:
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(summary, "lxml")
        summary = soup.get_text()

        # then there are cases where the summary is huge... so we clip it
        summary = summary[:MAX_CHAR_COUNT_IN_SUMMARY]

        # and if there is still no summary, we simply use the beginning of
        # the article
        if len(summary) < 10:
            summary = cleaned_up_text[:MAX_CHAR_COUNT_IN_SUMMARY]

        # Create the new article and save it to the DB
        new_article = zeeguu_core.model.Article(
            Url.find_or_create(session, url),
            title,
            ', '.join(art.authors),
            cleaned_up_text,
            summary,
            published_datetime,
            feed,
            feed.language)
        session.add(new_article)

        topics = add_topics(new_article, session)
        log(f" Topics ({topics})")

        add_searches(title, url, new_article, session)
        debug(" Added keywords")

        session.commit()
        log(f"SUCCESS for: {new_article.title}")

    except SkippedForLowQuality as e:
        raise e

    except Exception as e:
        from sentry_sdk import capture_exception
        capture_exception(e)

        log(f"* Rolling back session due to exception while creating article and attaching words/topics: {str(e)}")
        session.rollback()

    return new_article
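# The summary fallback in download_feed_item (strip possibly-malformed HTML, clip
# to a maximum length, fall back to the beginning of the article text) shown in
# isolation. This is only a sketch: clean_summary is not a real zeeguu helper,
# the MAX_CHAR_COUNT_IN_SUMMARY value is arbitrary here, and it needs bs4 and
# lxml installed.
from bs4 import BeautifulSoup

MAX_CHAR_COUNT_IN_SUMMARY = 300  # arbitrary value for the sketch


def clean_summary(summary: str, article_text: str) -> str:
    # strip any (possibly malformed) HTML from the feed summary
    summary = BeautifulSoup(summary, "lxml").get_text()

    # clip overly long summaries
    summary = summary[:MAX_CHAR_COUNT_IN_SUMMARY]

    # if nothing usable is left, fall back to the beginning of the article
    if len(summary) < 10:
        summary = article_text[:MAX_CHAR_COUNT_IN_SUMMARY]

    return summary


print(clean_summary("<p>Ein kurzer <b>Teaser</b></p>", "Der volle Text ..."))
print(clean_summary("<div></div>", "Der volle Text des Artikels ..."))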