Beispiel #1
0
 def test_evil_html(self):
     """Dangerous markup — script/style/iframe/frame(set) tags and
     event-handler attributes — must be stripped by the sanitizer."""
     dangerous_markup = (
         """<script>rm -Rf /</script><style>color: black</style><b onclick="doEvil()">"""
         """Oh</b><iframe src="ops"><frame><frameset>"""
     )
     self.assertEqual(get_sanitized_html(dangerous_markup), """<b>\n Oh\n</b>""")
Beispiel #2
0
 def test_safe_attrs(self):
     """Safe attributes (href, src) survive sanitizing while event
     handlers such as onload are dropped."""
     markup = """<a href="bla">Hi <i>italic</i></a><img src="ble" onload="muahaha()">"""
     expected = """<a href="bla">\n Hi\n <i>\n  italic\n </i>\n</a>\n<img src="ble"/>\n"""
     self.assertEqual(get_sanitized_html(markup), expected)
Beispiel #3
0
 def test_prettified_html(self):
     """Broken HTML (unclosed <b>) is repaired and pretty-printed."""
     broken = """<p>Hello <b>world!</p>"""
     expected = """<p>\n Hello\n <b>\n  world!\n </b>\n</p>"""
     self.assertEqual(get_sanitized_html(broken), expected)
Beispiel #4
0
def update_site_feed(feed, site_id):
    '''Handle the feed update of one site.

    Updates the site's metadata from the parsed ``feed`` dict, creates a
    Post for each new entry found, and schedules the site's next update.
    The task is effectively recursive: it will eventually re-enqueue
    itself via another ``apply_async``.

    :param feed: parsed feed data (feedparser-style dict), expected to
        carry optional ``feed`` and ``entries`` keys — TODO confirm shape
        against the caller.
    :param site_id: primary key of the Site being updated.
    '''
    from feeds.models import Post, Site
    from feeds.utils import get_sanitized_html

    # Record which task instance owns this site, so two instances don't
    # run for the same site at the same time.
    site = Site.objects.get(id=site_id)
    site.task_id = update_site_feed.request.id
    site.save()

    # Update this site's info from the feed header
    if 'feed' not in feed:
        # NOTE: logger.warn is deprecated; logger.warning is the
        # supported spelling.
        logger.warning(u"Site {} feed did not return feed information".format(site.id))
        if 'feed_error' in feed:
            logger.error('Site {} feed url is broken'.format(site.id))
            # TODO: Create a task to use site.url to discover its new feed location
            site.feed_errors += 1
            site.save()
    else:
        info = feed['feed']
        if 'title' in info:
            site.title = info['title']

        # For some reason, some Google Alerts returns a not valid FQDN info
        # after parsed and then we must check if it starts with "http"
        if 'link' in info and info['link'].startswith('http'):
            site.url = info['link']
        # A successful fetch clears the accumulated error counter
        if site.feed_errors > 0:
            site.feed_errors = 0
        site.save()

    # Create posts
    if 'entries' not in feed:
        logger.warning(u"Site {} feed did not return any post".format(site.id))
    else:
        new_posts_found = 0
        for entry in feed['entries']:
            # Without link we can't save this post
            if 'link' not in entry:
                continue
            url = entry['link']
            title = entry.get('title', '')

            # Try to get the content. Feedparser may deliver it as a list
            # of content parts; take the first non-empty one. The previous
            # code left `content` unbound for an empty list (its
            # IndexError guard could never fire, since iterating an empty
            # list raises nothing), causing a NameError below.
            content = entry.get('content')
            if isinstance(content, list):
                content = next((part for part in content if part), u'')

            if not content and 'description' in entry:
                content = entry['description']

            # Content parts may be dicts wrapping the text in 'value'
            if isinstance(content, dict):
                content = content.get('value')

            # Still no content found, lets try using summary
            if not content and entry.get('summary'):
                content = entry['summary']

            # Parses the content to avoid broken HTML and script tags
            content = get_sanitized_html(content)

            author = entry.get('author')

            # entry.get() already covers both the missing-key and the
            # falsy-value cases, so no separate `in` check is needed.
            if entry.get('published_parsed'):
                created_at = datetime.datetime.fromtimestamp(time.mktime(entry['published_parsed']))
                try:
                    created_at = make_aware(created_at, get_current_timezone())
                except AmbiguousTimeError:
                    # DST transition can make the local timestamp ambiguous
                    logger.error('Failed when trying to make {} aware'.format(created_at))
                    created_at = timezone.now()
            else:
                created_at = timezone.now()

            try:
                post, created = site.posts.get_or_create(
                    url_hash=Post.hashurl(url),
                    defaults={
                        'title': title,
                        'url': url,
                        'content': content,
                        'author': author,
                        'created_at': created_at
                    }
                )
            except IntegrityError:
                # Raised when two posts have the same URL; best-effort, skip it
                logger.warning('Final URL {} is duplicated'.format(url))
            else:
                if created:
                    new_posts_found += 1

        logger.info(
            'Site {site_id} got {new} new posts from {total} in feed'.format(
                site_id=site.id,
                new=new_posts_found,
                total=len(feed['entries'])
            )
        )

    # Updates when is it to run again
    next_update = site.set_next_update(save=False)
    logger.info("Site's {} next update at {}".format(site.id, next_update))
    site.last_update = timezone.now()
    site.save()