def _create_model_object(self):
    title = " ".join(self.faker.text().split()[:randint(1, 10)])
    description = " ".join(self.faker.text().split()[:randint(5, 20)])
    language = LanguageRule().random
    url = UrlRule().url
    image_url = UrlRule().url
    icon_name = self.faker.name()

    new_rss_feed = RSSFeed(url, title, description, image_url, icon_name,
                           language)

    if RSSFeed.exists(new_rss_feed):
        return self._create_model_object()

    return new_rss_feed
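# Usage sketch (an assumption, not part of the original listing): this factory
# method backs a model "rule" class used in tests; instantiating the rule yields
# a unique, already-saved RSSFeed (see the __init__ in Example #6 below), e.g.:
#
#     rule = RSSFeedRule()            # class name assumed for illustration
#     print(rule.rss_feed.title)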
Example #2
def get_interesting_feeds_for_language_code(language_code):
    """
    Get a list of feeds for the given language

    :return:
    """
    feed_data = []
    for feed in RSSFeed.find_for_language_id(language_code):
        feed_data.append(feed.as_dictionary())
    return json_result(feed_data)
Example #3
def test_feed(url: str):
    feed = RSSFeed.from_url(url)

    feed_items = feed.feed_items()
    if not feed_items:
        print("Feed seems broken. No items found.")
    else:
        count = len(feed_items)
        print(f"Feed seems healthy: {count} items found. ")

    return feed
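# Usage sketch (not from the original listing); the URL is just a placeholder:
checked_feed = test_feed("https://example.com/feed.xml")  # replace with a real feed URL
print(checked_feed.title)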
Example #5
def start_following_feed_with_id():
    """
    :param: feed_id -- the id of the feed to be followed.
    Start following the feed with the given id

    :return: "OK" in case of success
    """
    if request.form.get("source_id", ""):
        feed_id = int(request.form.get("source_id", ""))
    else:
        feed_id = int(request.form.get("feed_id", ""))

    feed_object = RSSFeed.find_by_id(feed_id)
    RSSFeedRegistration.find_or_create(session, flask.g.user, feed_object)

    return "OK"
Example #6
    def __init__(self):
        super().__init__()

        self.rss_feed = self._create_model_object()
        self.feed = self.rss_feed
        self.save(self.rss_feed)

        lang1 = Language.find_or_create('de')
        url = Url.find_or_create(self.db.session, url_spiegel_rss)

        self.feed1 = RSSFeed.find_or_create(self.db.session,
                                            url,
                                            "",
                                            "",
                                            icon_name_spiegel,
                                            language=lang1)
        self.save(self.feed1)
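        # Note (assumption): url_spiegel_rss and icon_name_spiegel are module-level
        # test fixtures (a Spiegel RSS feed URL and its icon file name) defined
        # elsewhere in the test suite; they are not part of this snippet.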
Example #7
def get_non_subscribed_feeds(language_code):
    """
    Get a list of feeds for the given language

    :return:
    """
    feed_data = []
    already_registered = [
        each.rss_feed
        for each in RSSFeedRegistration.feeds_for_user(flask.g.user)
    ]

    all_available_for_language = RSSFeed.find_for_language_id(language_code)

    for feed in all_available_for_language:
        if feed not in already_registered:
            feed_data.append(feed.as_dictionary())

    return json_result(feed_data)
Example #8
def download_from_feed(feed: RSSFeed, session, limit=1000):
    """
    Session is needed because this saves stuff to the DB.

    last_crawled_time is useful because otherwise a lot of time would be
    wasted trying to retrieve the same articles, especially the ones which
    can't be retrieved, so they won't be cached.
    """
    log(feed.title)

    downloaded = 0
    skipped = 0
    skipped_due_to_low_quality = dict()
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"last retrieval time from DB = {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items()
    except Exception as e:
        log(f"Failed to connect to feed ({e})")
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        try:
            url = _url_after_redirects(feed_item['url'])
        except requests.exceptions.TooManyRedirects:
            log(f"Too many redirects for: {feed_item['url']}")
            continue

        try:
            this_article_time = datetime.strptime(feed_item['published'],
                                                  SIMPLE_TIME_FORMAT)
            this_article_time = this_article_time.replace(tzinfo=None)
        except Exception:
            log(f"can't get time from {url}: {feed_item['published']}")
            continue

        if _date_in_the_future(this_article_time):
            log("article from the future...")
            continue

        if last_retrieval_time_from_DB:

            if this_article_time < last_retrieval_time_from_DB:
                skipped += 1
                continue

        title = feed_item['title']
        summary = feed_item['summary']

        log(url)

        try:
            art = model.Article.find(url)
        except Exception as e:
            log(f" {LOG_CONTEXT}: For some reason an exception occurred during Article.find \n{str(e)}")
            continue

        if (not last_retrieval_time_seen_this_crawl) or (
                this_article_time > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = this_article_time

        if art:
            skipped_already_in_db += 1
            log("- already in db")
        else:
            try:

                art = newspaper.Article(url)
                art.download()
                art.parse()
                log("- succesfully parsed")

                cleaned_up_text = cleanup_non_content_bits(art.text)

                quality_article = sufficient_quality(
                    art, skipped_due_to_low_quality)
                if quality_article:
                    from zeeguu_core.language.difficulty_estimator_factory import DifficultyEstimatorFactory

                    try:
                        # Create new article and save it to DB
                        new_article = zeeguu_core.model.Article(
                            Url.find_or_create(session, url), title,
                            ', '.join(art.authors), cleaned_up_text, summary,
                            this_article_time, feed, feed.language)
                        session.add(new_article)
                        session.commit()
                        downloaded += 1

                        add_topics(new_article, session)
                        log("- added topics")
                        add_searches(title, url, new_article, session)
                        log("- added keywords")
                        session.commit()

                        if last_retrieval_time_seen_this_crawl:
                            feed.last_crawled_time = last_retrieval_time_seen_this_crawl
                        session.add(feed)

                    except Exception as e:
                        log(f'Something went wrong when creating article and attaching words/topics: {e}'
                            )
                        log("rolling back the session... ")
                        session.rollback()

            except Exception as e:
                log(f"Failed to create zeeguu.Article from {url}\n{str(e)}")

    log(f'  Skipped due to time: {skipped} ')
    log(f'  Downloaded: {downloaded}')
    log(f'  Low Quality: {skipped_due_to_low_quality}')
    log(f'  Already in DB: {skipped_already_in_db}')
Example #9
def download_from_feed(feed: RSSFeed, session, limit=1000, save_in_elastic=True):
    """
    Session is needed because this saves stuff to the DB.

    last_crawled_time is useful because otherwise a lot of time would be
    wasted trying to retrieve the same articles, especially the ones which
    can't be retrieved, so they won't be cached.
    """

    downloaded = 0
    skipped_due_to_low_quality = 0
    skipped_already_in_db = 0

    last_retrieval_time_from_DB = None
    last_retrieval_time_seen_this_crawl = None

    if feed.last_crawled_time:
        last_retrieval_time_from_DB = feed.last_crawled_time
        log(f"LAST CRAWLED::: {last_retrieval_time_from_DB}")

    try:
        items = feed.feed_items(last_retrieval_time_from_DB)
    except Exception as e:
        log(f"Failed to download feed ({e})")
        return

    for feed_item in items:

        if downloaded >= limit:
            break

        feed_item_timestamp = feed_item['published_datetime']

        if _date_in_the_future(feed_item_timestamp):
            log("Article from the future!")
            continue

        if (not last_retrieval_time_seen_this_crawl) or (feed_item_timestamp > last_retrieval_time_seen_this_crawl):
            last_retrieval_time_seen_this_crawl = feed_item_timestamp

        if feed.last_crawled_time is None or last_retrieval_time_seen_this_crawl > feed.last_crawled_time:
            feed.last_crawled_time = last_retrieval_time_seen_this_crawl
            log(f"+updated feed's last crawled time to {last_retrieval_time_seen_this_crawl}")

        session.add(feed)
        session.commit()

        try:
            new_article = download_feed_item(session,
                                             feed,
                                             feed_item)
            downloaded += 1
        except SkippedForTooOld:
            log("- Article too old")
            continue
        except SkippedForLowQuality as e:
            log(f" - Low quality: {e.reason}")
            skipped_due_to_low_quality += 1
            continue
        except SkippedAlreadyInDB:
            skipped_already_in_db += 1
            log(" - Already in DB")
            continue

        except Exception as e:
            if hasattr(e, 'message'):
                log(e.message)
            else:
                log(e)
            continue

        # Also index the new article in Elasticsearch.
        # We recommend storing everything both in SQL and in Elasticsearch,
        # since Elasticsearch is not meant to be the persistent store.
        try:
            if save_in_elastic:
                if new_article:
                    es = Elasticsearch(ES_CONN_STRING)
                    doc = document_from_article(new_article, session)
                    res = es.index(index=ES_ZINDEX, id=new_article.id, body=doc)
                    print("elastic res: " + res['result'])
        except Exception as e:
            log("***OOPS***: ElasticSearch seems down?")
            if hasattr(e, 'message'):
                log(e.message)
            else:
                log(e)
            continue

    log(f'*** Downloaded: {downloaded} From: {feed.title}')
    log(f'*** Low Quality: {skipped_due_to_low_quality}')
    log(f'*** Already in DB: {skipped_already_in_db}')
    log(f'*** ')
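# Minimal usage sketch (not part of the original listing). Assumes zeeguu_core's
# Flask-SQLAlchemy session and a query-able RSSFeed model; adapt the feed
# selection to your own setup.
import zeeguu_core

crawl_session = zeeguu_core.db.session
for registered_feed in RSSFeed.query.all():   # assumed Flask-SQLAlchemy query API
    download_from_feed(registered_feed, crawl_session, limit=25, save_in_elastic=False)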
Example #10
test_feed = test_feed(_feed_url)

feed_name = input(f"Feed name (Enter for: {test_feed.title}):  ") or test_feed.title
print(f'= {feed_name}')

icon_name = input(
    "Icon name to be found in resources folder (e.g. 20min.png):  ")
print(f'= {icon_name}')

description = input(f'Description (Enter for: {test_feed.description}): ') or test_feed.description
print(f'= {description}')

_language = input("Language code (e.g. en): ")
print(f'= {_language}')

feed_url = Url.find_or_create(zeeguu_core.db.session, _feed_url)
language = Language.find_or_create(_language)

rss_feed = RSSFeed.find_or_create(zeeguu_core.db.session,
                                  feed_url,
                                  feed_name,
                                  description,
                                  icon_name=icon_name,
                                  language=language)

print("Done: ")
print(rss_feed.title)
print(rss_feed.description)
print(rss_feed.language_id)
print(rss_feed.url.as_string())
Example #11
def _exists_in_db(obj):
    return RSSFeed.exists(obj)