Ejemplo n.º 1
0
    def sync(self):
        """Pull new articles from the feed's channel and persist them.

        The channel is parsed with the feed's stored ``last_etag_header`` and
        ``last_modified_header``. When the extractor reports the channel as
        modified, every article is processed independently: a failure on one
        article is logged and does not stop the remaining ones. Afterwards the
        feed is activated, its stored headers and ``last_update`` are
        refreshed, and the session is committed.

        An article is skipped when it was published before
        ``feed.last_update`` or when an article with the same hash (SHA-1 of
        title and local publication time) already exists for this feed.

        Returns:
            list: the ``Article`` models successfully committed during this
            call, in the order the extractor yielded them; empty when the
            channel was not modified.
        """
        import hashlib

        feed = self.feed
        extractor = FeedExtractor(feed.channel_url)
        extractor.parse(etag=feed.last_etag_header, modified=feed.last_modified_header)
        article_models = []
        if not extractor.is_modified:
            return article_models

        # Nothing in the loop changes these, so read them once instead of
        # going back to the feed object for every article.
        feed_id = feed.id
        last_update = feed.last_update

        for article in extractor.articles:
            try:
                if not article.published_date:
                    raise Exception('Empty published date')
                # Imported lazily, as in the original code (presumably to
                # avoid a circular import -- TODO confirm).
                from rdr.components.timezone import convert_to_local
                published_date = convert_to_local(article.published_date)
                if last_update and published_date < last_update:
                    continue

                text = article.safe_text
                title = article.title
                check_string = (title + ' | ' + published_date.strftime('%Y-%m-%d %H:%M:%S')) \
                    .encode('utf-8')
                hash_ = hashlib.sha1(check_string).hexdigest()

                saved_article = Article.query.filter(
                    (Article.feed_id == feed_id) & (Article.hash == hash_)).first()
                if saved_article is not None:
                    continue

                article_model = Article(title=title,
                                        article_url=article.url,
                                        feed_id=feed_id,
                                        preview_text=html.nl2br(text),
                                        active=True,
                                        published=published_date,
                                        fetched=DateTime.now(),
                                        hash=hash_)
                image_url = article.primary_image_url
                if image_url and http.check_is_not_local_url(image_url) \
                        and http.check_is_absolute_url(image_url):
                    article_model.preview_image_src = image_url

                try:
                    db.session.add(article_model)
                    db.session.commit()
                except Exception:
                    # A failed flush/commit leaves the session needing an
                    # explicit rollback; without it the following articles
                    # and the final feed commit would fail as well.
                    db.session.rollback()
                    raise

                # Only report articles that were actually stored.
                article_models.append(article_model)

                if self.is_add_to_search_index:
                    from rdr.modules.feeds.search import ArticleSearchIndex
                    ArticleSearchIndex(article_model).create_index()
            except Exception as e:
                # Best effort: one broken article must not abort the sync.
                app.logger.exception(e)

        if not feed.active:
            feed.active = True
        feed.last_etag_header = extractor.etag_header
        feed.last_modified_header = extractor.modified_header
        feed.last_update = DateTime.now()
        db.session.commit()

        return article_models
Ejemplo n.º 2
0
 def fetch_images(self, obj):
     """Collect image descriptors from ``obj['images']``.

     An entry is kept only when it has a truthy ``'url'`` that both
     ``http.check_is_absolute_url`` and ``http.check_is_not_local_url``
     accept.

     Returns:
         list: one ``{'primary': ..., 'src': url}`` dict per kept entry, in
         input order; ``'primary'`` defaults to ``False``. Empty when
         ``obj`` has no ``'images'`` key.
     """
     if 'images' not in obj:
         return []

     collected = []
     for entry in obj['images']:
         if 'url' not in entry:
             continue
         url = entry['url']
         if not url:
             continue
         if not http.check_is_absolute_url(url):
             continue
         if not http.check_is_not_local_url(url):
             continue
         collected.append({
             'primary': entry.get('primary', False),
             'src': url
         })
     return collected
Ejemplo n.º 3
0
    def fetch_article_full_text(self, article):
        """Download and parse the full text of *article* with ``newspaper``.

        The parsing language is the feed's language when ``newspaper`` lists
        it as available. Otherwise a warning is logged and, if the article has
        preview text, ``langdetect`` is tried on it (a missing ``langdetect``
        is ignored, other detection errors are logged). ``'en'`` is used when
        no available language could be determined.

        Returns:
            ArticleFullTextResult: parsed title and text, plus an ``images``
            list holding the top image (marked primary) when it passes
            ``http.check_is_absolute_url`` and
            ``http.check_is_not_local_url``.

        Raises:
            Exception: when the article has no feed.
        """
        from newspaper import Article as NewspaperArticle
        from newspaper.utils import get_available_languages

        source_feed = article.feed
        if not source_feed:
            raise Exception("Can't fetch full text for article without feed")

        language = source_feed.language
        known_languages = get_available_languages()
        if not (language and language in known_languages):
            app.logger.warning('%s not in newspaper languages list' % language)
            language = None
            preview = article.preview_text
            if preview:
                try:
                    from langdetect import detect
                    language = detect(preview)
                    if language not in known_languages:
                        language = None
                except ImportError:
                    # langdetect is optional; fall through to the default.
                    pass
                except Exception as err:
                    app.logger.exception(err)
            language = language or 'en'

        target_url = article.article_url
        user_agent = app.config.get('DEFAULT_USER_AGENT')
        parsed = NewspaperArticle(target_url,
                                  language=language,
                                  browser_user_agent=user_agent,
                                  keep_article_html=True)
        parsed.download()
        parsed.parse()

        top_image = parsed.top_image
        top_image_usable = (top_image
                            and http.check_is_absolute_url(top_image)
                            and http.check_is_not_local_url(top_image))
        images = [{'src': top_image, 'primary': True}] if top_image_usable else []

        return ArticleFullTextResult(title=parsed.title,
                                     text=parsed.text,
                                     images=images)
Ejemplo n.º 4
0
 def is_url(self, query):
     """Check *query* with ``http.check_is_url`` and ``http.check_is_not_local_url``.

     Returns the result of ``check_is_url`` when it is falsy (the second
     check is then not evaluated), otherwise the result of
     ``check_is_not_local_url``.
     """
     url_check = http.check_is_url(query)
     if not url_check:
         return url_check
     return http.check_is_not_local_url(query)