Example #1
    def handle(self, *args, **options):
        # Resolve the fixture path relative to the project root
        json_path = join(settings.BASE_DIR, options['file'])
        print(json_path)

        # Parse the whole fixture; the context manager closes the file
        # as soon as parsing is done
        with open(json_path) as json_file:
            dados = json.load(json_file)

        users = dados['users']

        for u in users:
            # Drop the nested 'geo' dict so it is not passed to Address(**...)
            u['address'].pop('geo')
            address = Address(**u['address'])
            address.save()

            password = '******'

            # create_user() hashes the password before storing it
            user = User.objects.create_user(u['username'], u['email'],
                                            password)
            user.address = address
            user.save()

            profile = Profile()
            profile.user = user
            profile.address = address
            profile.save()

        posts = dados['posts']

        for p in posts:
            post = Post()
            post.profile = Profile.objects.get(user=User.objects.get(
                id=p['userId']))
            post.body = p.get('body') or 'sem corpo'
            post.title = p.get('title') or 'sem title'
            post.save()

        comments = dados['comments']
        for c in comments:
            com = Comment()
            com.name = c['name']
            com.email = c['email']
            com.body = c['body']
            com.post = Post.objects.get(pk=c['postId'])
            com.save()
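
The handle() above presumably sits inside a Django management command; a minimal sketch of the assumed wrapper, wired so that options['file'] exists. The module location, command name load_users, and model imports are all assumptions, not from the original snippet:

# management/commands/load_users.py (hypothetical location and name)
import json
from os.path import join

from django.conf import settings
from django.core.management.base import BaseCommand

from core.models import Address, Comment, Post, Profile  # assumed app layout
from django.contrib.auth.models import User  # assumed; may be a custom model


class Command(BaseCommand):
    help = 'Load users, posts and comments from a JSON fixture'

    def add_arguments(self, parser):
        # makes options['file'] available inside handle()
        parser.add_argument('file', type=str)

    def handle(self, *args, **options):
        ...  # body as shown above

Invoked as, for example: python manage.py load_users db.json
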
Example #2
def parse_feed_json(source_feed, feed_content, output):

    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f["items"]
        if entries:
            # in case we start auto unsubscribing long-dead feeds
            source_feed.last_success = timezone.now()
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False

    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:

        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now source_feed.interval to max
            source_feed.interval = 24 * 3 * 60
            source_feed.last_result = "This feed has expired"
            # keep the (ok, changed) contract; a third element would break
            # callers that unpack two values
            return (False, False)

        try:
            source_feed.site_url = f["home_page_url"]
            source_feed.name = update_source_name(source_feed.name, f["title"])
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(parser)
            source_feed.description = parser._sanitizeHTML(
                f["description"], "utf-8", "text/html")

        _customize_sanitizer(parser)
        source_feed.name = update_source_name(
            source_feed.name,
            parser._sanitizeHTML(source_feed.name, "utf-8", "text/html"),
        )

        if "icon" in f:
            source_feed.image_url = f["icon"]

        # Entries typically arrive newest-first; flip them into
        # chronological order before processing
        entries.reverse()
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed, guid=guid)[0]
                output.write("EXISTING " + guid + "\n")

            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(parser)
            body = parser._sanitizeHTML(
                body, "utf-8", "text/html")  # TODO: validate charset ??
            _customize_sanitizer(parser)
            title = parser._sanitizeHTML(
                title, "utf-8", "text/html")  # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]

            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ""

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                output.write("CREATED ERROR")
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:

                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True

                                try:
                                    ee.length = int(pe["size_in_bytes"])
                                except Exception:
                                    ee.length = 0

                                try:
                                    file_type = pe["mime_type"]
                                except Exception:
                                    file_type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe

                                ee.type = file_type
                                ee.save()
                                break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:

                        try:
                            if pe["url"] not in seen_files:

                                try:
                                    length = int(pe["size_in_bytes"])
                                except Exception:
                                    length = 0

                                try:
                                    filetype = pe["mime_type"]
                                except Exception:
                                    filetype = "audio/mpeg"

                                ee = Enclosure(post=p,
                                               href=pe["url"],
                                               length=length,
                                               type=filetype)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))

            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)

            try:
                if "tags" in e:
                    for t in e["tags"]:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                output.write(str(ex))
                # 'tag' may be unbound if get_or_create raised, so don't reference it
                output.write(f"couldn't add tags to post {p}")

    return (ok, changed)
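
A quick way to exercise the function is to feed it a minimal JSON Feed payload covering the fields it reads. Everything below is made up, and source_feed stands for whatever Source model instance the project uses:

import json
import sys

feed_content = json.dumps({
    "version": "https://jsonfeed.org/version/1.1",
    "title": "Example Feed",
    "home_page_url": "https://example.com/",
    "icon": "https://example.com/icon.png",
    "items": [{
        "id": "https://example.com/posts/1",
        "url": "https://example.com/posts/1",
        "title": "First post",
        "content_html": "<p>Hello</p>",
        "date_published": "2020-01-01T00:00:00Z",
        "attachments": [{
            "url": "https://example.com/ep1.mp3",
            "size_in_bytes": 1024,
            "mime_type": "audio/mpeg",
        }],
    }],
})

ok, changed = parse_feed_json(source_feed, feed_content, sys.stdout)
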
Example #3
def parse_feed_xml(source_feed, feed_content, output):

    ok = True
    changed = False

    try:
        _customize_sanitizer(parser)
        # TODO: start checking feedparser's error/bozo flag here
        f = parser.parse(feed_content)
        entries = f["entries"]
        if entries:
            # in case we start auto unsubscribing long-dead feeds
            source_feed.last_success = timezone.now()
        else:
            source_feed.last_result = "Feed is empty"
            ok = False

    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        ok = False

    if ok:
        try:
            source_feed.name = update_source_name(source_feed.name,
                                                  f.feed.title)
        except Exception:
            pass

        try:
            source_feed.site_url = f.feed.link
        except Exception:
            pass

        try:
            source_feed.image_url = f.feed.image.href
        except Exception:
            pass

        # either of these is fine, prefer description over summary
        # also feedparser will give us itunes:summary etc if there
        try:
            source_feed.description = f.feed.summary
        except Exception:
            pass

        try:
            source_feed.description = f.feed.description
        except Exception:
            pass

        # Entries typically arrive newest-first; flip them into
        # chronological order before processing
        entries.reverse()
        for e in entries:
            # we are going to take the longest
            body = ""

            if hasattr(e, "content"):
                for c in e.content:
                    if len(c.value) > len(body):
                        body = c.value

            if hasattr(e, "summary"):
                if len(e.summary) > len(body):
                    body = e.summary

            if hasattr(e, "summary_detail"):
                if len(e.summary_detail.value) > len(body):
                    body = e.summary_detail.value

            if hasattr(e, "description"):
                if len(e.description) > len(body):
                    body = e.description

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e.guid
            except Exception as ex:
                try:
                    guid = e.link
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
            p = Post.objects.filter(source=source_feed, guid=guid)[0]
                output.write("EXISTING " + guid + "\n")

            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e.title
            except Exception:
                title = ""

            try:
                p.link = e.link
            except Exception:
                p.link = ""
            p.title = title

            try:
                p.image_url = e.image.href
            except Exception:
                pass

            try:
                # If there is no published_parsed entry, try updated_parsed
                if "published_parsed" in e:
                    time_struct = e.published_parsed
                else:
                    time_struct = e.updated_parsed

                p.created = datetime.datetime.fromtimestamp(
                    time.mktime(time_struct)).replace(tzinfo=timezone.utc)

            except Exception:
                output.write("CREATED ERROR")
                p.created = timezone.now()  # fall back as the JSON variant does

            p.guid = guid
            try:
                p.author = e.author
            except Exception as ex:
                p.author = ""

            try:
                p.save()
                # output.write(p.body)
            except Exception as ex:
                # import pdb; pdb.set_trace()
                output.write(str(ex))

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    for pe in e["enclosures"]:

                        if pe["href"] == ee.href and ee.href not in seen_files:
                            found_enclosure = True

                            try:
                                ee.length = int(pe["length"])
                            except Exception:
                                ee.length = 0

                            try:
                                file_type = pe["type"]
                            except Exception:
                                file_type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe

                            ee.type = file_type
                            ee.save()
                            break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)

                for pe in e["enclosures"]:
                    try:
                        if pe["href"] not in seen_files:

                            try:
                                length = int(pe["length"])
                            except Exception:
                                length = 0

                            try:
                                file_type = pe["type"]
                            except Exception:
                                file_type = "audio/mpeg"

                            ee = Enclosure(post=p,
                                           href=pe["href"],
                                           length=length,
                                           type=file_type)
                            ee.save()
                    except Exception:
                        pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))

            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)

            try:
                if "tags" in e:
                    for t in e.tags:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                output.write(str(ex))
                # 'tag' may be unbound if get_or_create raised, so don't reference it
                output.write(f"couldn't add tags to post {p}")

    return (ok, changed)
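
The XML variant can be driven the same way with a raw RSS document; source_feed is again an assumed model instance, and the feed content is made up:

import sys

rss = """<?xml version="1.0"?>
<rss version="2.0"><channel>
  <title>Example Feed</title>
  <link>https://example.com/</link>
  <item>
    <guid>https://example.com/posts/1</guid>
    <link>https://example.com/posts/1</link>
    <title>First post</title>
    <description>Hello</description>
    <pubDate>Wed, 01 Jan 2020 00:00:00 GMT</pubDate>
  </item>
</channel></rss>"""

ok, changed = parse_feed_xml(source_feed, rss, sys.stdout)
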
Example #4
def update_site_feed(feed, site_id):
    '''Handles the feed update of a site. Effectively recursive: at the
    end it schedules another apply_async call onto itself.'''
    from feeds.models import Post, Site
    from feeds.utils import get_sanitized_html
    # Avoid running two instances at the same time:
    # update the task_id recorded for this site
    site = Site.objects.get(id=site_id)
    site.task_id = update_site_feed.request.id
    site.save()

    # Update this site info
    if 'feed' not in feed:
        logger.warn(u"Site {} feed did not returned feed information".format(site.id))
        if 'feed_error' in feed:
            logger.error('Site {} is with its feed url broken'.format(site.id))
            # TODO: Create a task to use site.url to discover its new feed location
            site.feed_errors += 1
            site.save()
    else:
        info = feed['feed']
        if 'title' in info:
            site.title = info['title']

        # Some Google Alerts feeds return an invalid FQDN in the parsed
        # link, so only accept it when it starts with "http"
        if 'link' in info and info['link'].startswith('http'):
            site.url = info['link']
        if site.feed_errors > 0:
            site.feed_errors = 0
        site.save()

    # Create posts
    if 'entries' not in feed:
        logger.warn(u"Site {} feed did not returned any post".format(site.id))
    else:
        new_posts_found = 0
        for entry in feed['entries']:
            # Without link we can't save this post
            if 'link' not in entry:
                continue
            url = entry['link']
            title = entry.get('title', '')

            # Try to get content: take the first truthy item when the feed
            # provides a list (an empty list would otherwise leave the
            # variable unset)
            content = u''
            if isinstance(entry.get('content'), list):
                for item in entry['content']:
                    if item:
                        content = item
                        break
            else:
                content = entry.get('content')

            if not content and 'description' in entry:
                content = entry['description']

            if isinstance(content, dict):
                content = content.get('value')

            # Still no content found, lets try using summary
            if not content and entry.get('summary'):
                content = entry['summary']

            # Parses the content to avoid broken HTML and script tags
            content = get_sanitized_html(content)

            author = entry.get('author')

            if 'published_parsed' in entry and entry.get('published_parsed'):
                created_at = datetime.datetime.fromtimestamp(time.mktime(entry['published_parsed']))
                try:
                    created_at = make_aware(created_at, get_current_timezone())
                except AmbiguousTimeError:
                    logger.error('Failed when trying to make {} aware'.format(created_at))
                    created_at = timezone.now()
            else:
                created_at = timezone.now()

            try:
                post, created = site.posts.get_or_create(
                    url_hash=Post.hashurl(url),
                    defaults={
                        'title': title,
                        'url': url,
                        'content': content,
                        'author': author,
                        'created_at': created_at
                    }
                )
            except IntegrityError:
                # Raised when two posts have the same URL
                logger.warn('Final URL {} is duplicated'.format(url))
            else:
                if created:
                    new_posts_found += 1

        logger.info(
            'Site {site_id} got {new} new posts from {total} in feed'.format(
                site_id=site.id,
                new=new_posts_found,
                total=len(feed['entries'])
            )
        )

    # Updates when is it to run again
    next_update = site.set_next_update(save=False)
    logger.info("Site's {} next update at {}".format(site.id, next_update))
    site.last_update = timezone.now()
    site.save()
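
Since the body reads update_site_feed.request.id, the function is presumably registered as a Celery task. A sketch of the assumed wiring; the decorator choice and the re-scheduling hinted at in the docstring are both assumptions:

from celery import shared_task

@shared_task
def update_site_feed(feed, site_id):
    ...  # body as above; the task's .request.id is the current run's id
    # the docstring implies the task re-schedules itself, roughly:
    # update_site_feed.apply_async(args=(feed, site_id), countdown=interval)
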
Example #5
def parse_feed_json(source_feed, feed_content, output):

    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f['items']
        if entries:
            # in case we start auto unsubscribing long-dead feeds
            source_feed.last_success = timezone.now()
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False

    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:

        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now source_feed.interval to max
            source_feed.interval = 24 * 3 * 60
            source_feed.last_result = "This feed has expired"
            # keep the (ok, changed) contract; a third element would break
            # callers that unpack two values
            return (False, False)

        try:
            source_feed.site_url = f["home_page_url"]
            if not source_feed.name:
                source_feed.name = f["title"]
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(feedparser)
            source_feed.description = feedparser._sanitizeHTML(
                f["description"], "utf-8", 'text/html')

        _customize_sanitizer(feedparser)
        if source_feed.name:
            # sanitize whatever name we ended up with (sanitizing only an
            # empty name, as written originally, would be a no-op)
            source_feed.name = feedparser._sanitizeHTML(
                source_feed.name, "utf-8", 'text/html')

        if "icon" in f:
            source_feed.image_url = f["icon"]

        # Entries typically arrive newest-first; flip them into
        # chronological order before processing
        entries.reverse()
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed, guid=guid)[0]
                logging.info("EXISTING: %s", guid)
            except Exception as ex:
                logging.info("Creating new post %s.", guid)
                p = Post(index=0, body=' ')
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(feedparser)
            body = feedparser._sanitizeHTML(
                body, "utf-8", 'text/html')  # TODO: validate charset ??
            _customize_sanitizer(feedparser)
            title = feedparser._sanitizeHTML(
                title, "utf-8", 'text/html')  # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]

            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ''

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                logging.exception('Unable to parse published date.')
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:
                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True
                                ee.length = int(
                                    pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee.type = typ
                                ee.save()
                                break

                    # DANGEROUS! This deletes everything if a glitch in the feed removes enclosures.
                    # if not found_enclosure:
                    # ee.delete()

                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:
                        try:
                            # Many feeds embed ever-changing trackers in their URLs yet almost always
                            # carry a single enclosure, so we only create a new enclosure for a new
                            # url when no enclosure records exist yet. This is more robust against
                            # tracker-driven duplicates than trying to strip every known tracker prefix.
                            if pe["url"] not in seen_files and not p.enclosures.all(
                            ).exists():
                                length = int(
                                    pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee = Enclosure(post=p,
                                               href=pe["url"],
                                               length=length,
                                               type=typ)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                logging.exception("No enclosures")

            try:
                p.body = body
                p.save()
            except Exception as ex:
                logging.exception('Unable to save body A2.')

    return (ok, changed)
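
A caller is expected to persist the bookkeeping this variant writes onto source_feed (last_result, interval, last_success); a sketch under that assumption, where raw_json is a feed document fetched elsewhere:

import logging
import sys

ok, changed = parse_feed_json(source_feed, raw_json, sys.stdout)
source_feed.save()  # the parser mutates but never saves the model
if not ok:
    logging.warning("poll failed; retry in %s minutes", source_feed.interval)
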
Example #6
def update_site_feed(site):
    '''Handles the feed update of a site. Effectively recursive: at the
    end it schedules another apply_async call onto itself.'''
    # Avoids running two instances at the time
    cachekey = SITE_WORKER_CACHE_KEY.format(id=site.id)
    if cache.get(cachekey):
        logger.warn('Worker for site {} still running'.format(site.id))
        return
    
    cache.add(cachekey, '1', 60) # Will not run again in 60 seconds        
    
    from feeds.models import Post
    # Update task_id for this site
    site.task_id = update_site_feed.request.id
    site.save()
    
    feed = site.getfeed()
    # Update this site info
    if 'feed' not in feed:
        logger.warn(u"Site {} feed did not return feed information".format(site.id))
        if 'feed_error' in feed:
            logger.error('Site {} has a broken feed url'.format(site.id))
            # TODO: Create a task to use site.url to discover its new feed location
            site.feed_errors += 1
            site.save()
    else:
        info = feed['feed']
        if 'title' in info:
            site.title = info['title']
        if 'link' in info:
            site.url = info['link']
        if site.feed_errors > 0:
            site.feed_errors = 0
        site.save()

    # Create posts
    if 'entries' not in feed:
        logger.warn(u"Site {} feed did not return any posts".format(site.id))
    else:
        new_posts_found = 0
        for entry in feed['entries']:
            # Without link we can't save this post
            if 'link' not in entry:
                continue
            url = entry['link']
            title = entry.get('title', '')
            
            # Try to get content: take the first truthy item when the feed
            # provides a list (an empty list would otherwise leave the
            # variable unset)
            content = u''
            if isinstance(entry.get('content'), list):
                for item in entry['content']:
                    if item:
                        content = item
                        break
            else:
                content = entry.get('content')

            if not content and 'description' in entry:
                content = entry['description']

            if isinstance(content, dict):
                content = content.get('value')
                
            author = entry.get('author')
            
            if 'published_parsed' in entry and entry.get('published_parsed'):
                created_at = datetime.datetime.fromtimestamp(time.mktime(entry['published_parsed']))
                try:
                    created_at = make_aware(created_at, get_current_timezone())
                except AmbiguousTimeError:
                    logger.error('Failed when trying to make {} aware'.format(created_at))
                    created_at = timezone.now()
            else:
                created_at = timezone.now()
               
            try: 
                post, created = site.posts.get_or_create(url_hash=Post.hashurl(url),
                    defaults={
                        'title': title,
                        'url': url,
                        'content': content,
                        'author': author,
                    }
                )
            except IntegrityError:
                # Raised when two posts have the same URL
                pass
            else:
                if created:
                    new_posts_found += 1
                post.created_at = created_at
                post.save()
    
        logger.info('Site {site_id} got {new} new posts from {total} in feed'.format(site_id=site.id, new=new_posts_found, total=len(feed['entries'])))
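
The AmbiguousTimeError guard above covers wall-clock times that occur twice when DST ends; a small illustration with pytz (the zone choice is arbitrary, and the is_dst behaviour applies to Django < 4):

import datetime

import pytz
from django.utils.timezone import make_aware

tz = pytz.timezone('Europe/Lisbon')  # any zone with DST works
naive = datetime.datetime(2023, 10, 29, 1, 30)  # occurs twice at DST end
try:
    aware = make_aware(naive, tz)  # is_dst=None raises on ambiguity
except pytz.AmbiguousTimeError:
    aware = make_aware(naive, tz, is_dst=False)  # pick the post-transition hour
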
Example #7
    address.save()

    user = User()
    user.username = u['username']
    user.email = u['email']
    user.password = '******'  # NOTE: assigns the raw value; set_password() would hash it
    user.address = address
    user.save()

    profile = Profile()
    profile.user = user
    profile.address = address
    profile.save()

posts = dados['posts']

for p in posts:
    post = Post()
    post.profile = Profile.objects.get(user=User.objects.get(id=p['userId']))
    post.body = p.get('body') or 'sem corpo'
    post.title = p.get('title') or 'sem title'
    post.save()

comments = dados['comments']
for c in comments:
    com = Comment()
    com.name = c['name']
    com.email = c['email']
    com.body = c['body']
    com.post = Post.objects.get(pk=c['postId'])
    com.save()
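
For reference, the shape of JSON document this loader expects, assuming the Address model mirrors these keys (field names follow JSONPlaceholder's users/posts/comments; all values below are made up):

dados = {
    "users": [{
        "id": 1,
        "username": "alice",
        "email": "alice@example.com",
        "address": {
            "street": "Main St",
            "suite": "Apt 1",
            "city": "Springfield",
            "zipcode": "12345",
            "geo": {"lat": "0.0", "lng": "0.0"},  # popped before Address(**...)
        },
    }],
    "posts": [{"userId": 1, "title": "Hello", "body": "First post"}],
    "comments": [{"postId": 1, "name": "A reader",
                  "email": "reader@example.com", "body": "Nice post"}],
}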