Example #1
def process_feed(feed_url, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """

    def normalize_tag(tag):
        """
        converts things like "-noise-" to "noise" and "- noise -" to "noise"
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]

        ## fix for HTML entities
        tag = unicode(BeautifulStoneSoup(tag,
                        convertEntities=BeautifulStoneSoup.HTML_ENTITIES ))
        tag = tag.strip().lower()
        return tag

    try:
        USER_AGENT = settings.PLANET["USER_AGENT"]
    except (KeyError, AttributeError):
        print """Please set PLANET = {" USER_AGENT": <string>} in your settings.py"""
        exit(0)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print "*" * 20
    print "Feed: %s" % feed_url

    if create and planet_feed:
        # can't create it because it already exists
        print "This feed already exists!"
        exit(1)

    if not create and not planet_feed:
        # can't update it because it does not exist
        print "This feed does not exist!"
        exit(1)

    # retrieve and parse feed using conditional GET method
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT,
                                modified=modified, etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # then create blog, feed, generator, feed links and feed tags
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        guid = unicode(md5(document.feed.get("link")).hexdigest())
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')

        updated_parsed = document.get("updated_parsed")
        if updated_parsed:
            last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))
        else:
            last_modified = datetime.now()

        feed_links = document.feed.get("links", [])
        if not blog_url:
            link = filter(lambda item: item["rel"] == "alternate", feed_links)
            if link:
                blog_url = link[0]["href"]

        blog, created = Blog.objects.get_or_create(
            url=blog_url, defaults={"title": title})

        generator_dict = document.feed.get("generator_detail", {})

        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        if category_title:
            ##TODO: site_objects!
            category = Category.objects.get(title=category_title)
        else:
            category = None

        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
            url=feed_url, rights=rights, info=info, guid=guid,
            image_url=image_url, icon_url=icon_url, language=language,
            etag=etag, last_modified=last_modified, generator=generator,
            is_active=True, last_checked=datetime.now(),
            site=current_site, category=category
        )
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")
            if name:
                print name

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type", "text/html"),
                link=link_dict.get("href", blog_url)
            )

    entries = []

    total_results = int(document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print "No entries to store. status: %s %s" % (document.get("status"), document.get("debug_message"))
    else:
        print "Entries total count: %d" % total_results
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:

            # retrieve and store feed posts
            entries.extend(document.entries)
            print "Processing %d entries" % len(document.entries)

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                guid = unicode(md5(entry.get("link")).hexdigest())
                content = entry.get('description') or entry.get("content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or\
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = None

                try:
                    if Post.objects.filter(url=url, guid=guid).exists():
                        raise PostAlreadyExists
                    post = Post(title=title, url=url, guid=guid, content=content,
                        comments_url=comments_url, date_modified=date_modified,
                        feed=planet_feed)
                    # To have the feed entry in the pre_save signal
                    post.entry = entry
                    post.save()
                except PostAlreadyExists:
                    print "Skipping post %s (%s) because already exists"\
                        % (guid, url)
                    if not create:
                        # if it is in update-mode then stop retrieving when
                        # it finds repeated posts
                        stop_retrieving = True
                else:
                    new_posts_count += 1
                    # create post tags...
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get("label")
                        tag_name = normalize_tag(tag_name)

                        if len(tag_name) > 50: continue

                        try:
                            if "/" in tag_name:
                                # For path based categories
                                for subtag in tag_name.split("/"):
                                    if subtag:
                                        # empty string if starts/ends with slash
                                        Tag.objects.add_tag(post, '"%s"' % subtag)
                            else:
                                Tag.objects.add_tag(post, '"%s"' % tag_name)
                        except AttributeError as e:
                            print "Ignoring tag error: %s" % e
                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type", "text/html"),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--")
                        )

                    # create and store enclosures...
                    if entry.get('media_thumbnail', False):
                        try:
                            media_url = entry.get('media_thumbnail').href
                            media_list = [{"url": media_url}]
                        except AttributeError:
                            media_list = entry.get('media_thumbnail', [{"url": None}])

                        for media in media_list:
                            media_url = media["url"]
                            mime_type, enc = mimetypes.guess_type(urlparse(media_url).path)

                            post_enclosure, created = Enclosure.objects.get_or_create(
                                post=post,
                                length=0,
                                mime_type=mime_type,
                                link=media_url
                            )

                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href")
                        )
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")

                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href")
                        )
                        try:
                            PostAuthorData.objects.get(author=author, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href")
                        )
                        try:
                            PostAuthorData.objects.get(author=contributor, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor, post=post,
                                is_contributor=True)
                            pad.save()

                    # We send a post_created signal
                    print 'post_created.send(sender=post)', post
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "%s?start-index=%d&max-results=%d" %\
                    (feed_url, len(entries) + 1, items_per_page)

                print "retrieving %s..." % opensearch_url
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print "%d posts were created. Done." % new_posts_count
Example #2
def process_feed(feed_url, create=False):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """

    def normalize_tag(tag):
        """
        converts things like "-noise-" to "noise" and "- noise -" to "noise"
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]
        tag = tag.strip()
        return tag

    try:
        USER_AGENT = settings.USER_AGENT
    except AttributeError:
        # print "Please set the variable USER_AGENT = <string> in your settings.py"
        # exit(0)
        raise ValidationError("Please set the variable USER_AGENT = <string> in your settings.py")

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print "*" * 20
    print "Feed: %s" % feed_url

    if create and planet_feed:
        # can't create it because it already exists
        raise ValidationError("This feed already exists!")

    if not create and not planet_feed:
        # can't update it because it does not exist
        raise ValidationError("This feed does not exist!")

    # retrieve and parse feed using conditional GET method
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT, modified=modified, etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # then create blog, feed, generator, feed links and feed tags

        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        guid = document.feed.get("id")
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", "")
        updated_parsed = document.get("updated_parsed")
        if updated_parsed:
            last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))
        else:
            last_modified = datetime.now()

        feed_links = document.feed.get("links", [])
        if not blog_url:
            link = filter(lambda item: item["rel"] == "alternate", feed_links)
            if link:
                blog_url = link[0]["href"]

        try:
            blog, created = Blog.objects.get_or_create(url=blog_url, defaults={"title": title})
        except Exception:
            raise ValidationError(
                "Sorry, it doesn't look like this feed is formatted properly. Are you sure it's a valid RSS feed?"
            )

        generator_dict = document.feed.get("generator_detail", {})

        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"),
            )
        else:
            generator = None

        planet_feed = Feed(
            title=title,
            subtitle=subtitle,
            blog=blog,
            url=feed_url,
            rights=rights,
            info=info,
            guid=guid,
            image_url=image_url,
            icon_url=icon_url,
            language=language,
            etag=etag,
            last_modified=last_modified,
            generator=generator,
            is_active=True,
            last_checked=datetime.now(),
            site=current_site,
        )
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")
            if name:
                print name

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type"),
                link=link_dict.get("href", blog_url),
            )

    entries = []

    total_results = int(document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print "No entries to store. status: %s %s" % (document.get("status"), document.get("debug_message"))
    else:
        print "Entries total count: %d" % total_results
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:

            # retrieve and store feed posts
            entries.extend(document.entries)
            print "Processing %d entries" % len(document.entries)

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                guid = entry.get("link")
                content = entry.get("description") or entry.get("content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(time.mktime(date_modified))
                except Exception:
                    date_modified = None

                try:
                    if Post.objects.filter(url=url, guid=guid).exists():
                        raise PostAlreadyExists
                    post = Post(
                        title=title,
                        url=url,
                        guid=guid,
                        content=content,
                        comments_url=comments_url,
                        date_modified=date_modified,
                        feed=planet_feed,
                    )
                    # To have the feed entry in the pre_save signal
                    post.entry = entry
                    post.save()
                except PostAlreadyExists:
                    print "Skipping post %s (%s) because already exists" % (guid, url)
                    if not create:
                        # if it is in update-mode then stop retrieving when
                        # it finds repeated posts
                        stop_retrieving = True
                else:
                    new_posts_count += 1
                    # create post tags...
                    tag_list = ""
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get("label")
                        tag_name = tag_name[:255]
                        tag_name = normalize_tag(tag_name)
                        try:
                            if "/" in tag_name:
                                # For path based categories
                                for subtag in tag_name.split("/"):
                                    if subtag:
                                        # empty string if starts/ends with slash
                                        Tag.objects.add_tag(post, '"%s"' % subtag)
                                        tag_list = "%s %s" % (tag_list, subtag)
                            else:
                                Tag.objects.add_tag(post, '"%s"' % tag_name)
                                tag_list = "%s %s" % (tag_list, tag_name)
                        except AttributeError, e:
                            print "Ignoring tag error: %s" % e

                        post.tags = tag_list
                        post.save()
                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type"),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--"),
                        )

                    # create and store enclosures...

                    # media:thumbnail has attributes: url, height, width, time;
                    # see: http://www.rssboard.org/media-rss#media-thumbnails
                    # cast to a list if it isn't one, then add each thumbnail
                    # as an enclosure, keeping height/width/time in extra_info

                    media_thumbnails = entry.get("media_thumbnail", False)
                    if media_thumbnails:
                        if not isinstance(media_thumbnails, list):
                            media_thumbnails = [media_thumbnails]

                        for media_thumbnail in media_thumbnails:
                            mime_type, enc = mimetypes.guess_type(urlparse(media_thumbnail.get("url")).path)

                            extra_info = {}
                            extra_info["width"] = media_thumbnail.get("width", None)
                            extra_info["height"] = media_thumbnail.get("height", None)
                            extra_info["time"] = media_thumbnail.get("time", None)

                            post_enclosure, created = Enclosure.objects.get_or_create(
                                post=post,
                                length=0,
                                mime_type=mime_type,
                                link=media_thumbnail.get("url"),
                                extra_info=extra_info,
                            )

                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href"),
                        )
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")

                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href"),
                        )
                        try:
                            PostAuthorData.objects.get(author=author, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href"),
                        )
                        try:
                            PostAuthorData.objects.get(author=contributor, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor, post=post, is_contributor=True)
                            pad.save()

                    # We send a post_created signal
                    print "post_created.send(sender=post)", post
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "%s?start-index=%d&max-results=%d" % (feed_url, len(entries) + 1, items_per_page)

                print "retrieving %s..." % opensearch_url
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print "%d posts were created. Done." % new_posts_count
Example #3
def process_feed(feed_url, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """
    def normalize_tag(tag):
        """
        converts things like "-noise-" to "noise" and "- noise -" to "noise"
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]

        ## fix for HTML entities
        tag = BeautifulSoup(tag, "html.parser").get_text()
        tag = tag.strip().lower()
        return tag

    try:
        USER_AGENT = settings.PLANET["USER_AGENT"]
    except (KeyError, AttributeError):
        print(
            """Please set PLANET = {"USER_AGENT": <string>} in your settings.py"""
        )
        exit(1)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print("*" * 20)
    print("Feed: {}".format(feed_url))

    if create and planet_feed:
        # can't create it because it already exists
        print("This feed already exists!")
        exit(1)

    if not create and not planet_feed:
        # can't update it because it does not exist
        print("This feed does not exist!")
        exit(1)

    # retrieve and parse feed using conditional GET method
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url,
                                agent=USER_AGENT,
                                modified=modified,
                                etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # then create blog, feed, generator, feed links and feed tags
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        try:
            guid = unicode(md5(document.feed.get("link")).hexdigest())
        except NameError:
            guid = md5(document.feed.get("link").encode('utf-8')).hexdigest()
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')

        updated_parsed = document.get("updated_parsed")
        if updated_parsed:
            last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))
        else:
            last_modified = datetime.now()

        feed_links = document.feed.get("links", [])
        if not blog_url:
            link = [item for item in feed_links if item["rel"] == "alternate"]
            if link:
                blog_url = link[0]["href"]

        blog, created = Blog.objects.get_or_create(url=blog_url,
                                                   defaults={"title": title})

        generator_dict = document.feed.get("generator_detail", {})

        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        if category_title:
            ##TODO: site_objects!
            category = Category.objects.get(title=category_title)
        else:
            category = None

        planet_feed = Feed(title=title,
                           subtitle=subtitle,
                           blog=blog,
                           url=feed_url,
                           rights=rights,
                           info=info,
                           guid=guid,
                           image_url=image_url,
                           icon_url=icon_url,
                           language=language,
                           etag=etag,
                           last_modified=last_modified,
                           generator=generator,
                           is_active=True,
                           last_checked=datetime.now(),
                           site=current_site,
                           category=category)
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")  # read but not stored in this example

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type", "text/html"),
                link=link_dict.get("href", blog_url))

    entries = []

    total_results = int(
        document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print("No entries to store. status: {} {}".format(
            document.get("status"), document.get("debug_message")))
    else:
        print("Entries total count: {}".format(total_results))
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:

            # retrieve and store feed posts
            entries.extend(document.entries)
            print("Processing {} entries".format(len(document.entries)))

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                try:
                    guid = unicode(md5(entry.get("link")).hexdigest())
                except NameError:
                    guid = md5(entry.get("link").encode('utf-8')).hexdigest()
                content = entry.get('description') or entry.get(
                    "content", [{
                        "value": ""
                    }])[0]["value"]
                comments_url = entry.get("comments")
                try:
                    image_url = entry["media_thumbnail"][0]["url"]
                except Exception:
                    image_url = None
                date_modified = entry.get("updated_parsed") or\
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = None

                try:
                    if Post.objects.filter(url=url, guid=guid).exists():
                        raise PostAlreadyExists
                    # build the term-frequency vector (myvector) for this post
                    WORD = re.compile(r'\w+')
                    soup = BeautifulSoup(content, "html.parser")
                    combo = []  # desc and title
                    words = nltk.wordpunct_tokenize(soup.get_text())
                    titlez = nltk.wordpunct_tokenize(title)
                    words.extend(titlez)
                    # filter into a new list: calling remove() while iterating
                    # the same list skips elements
                    words = [word for word in words
                             if word not in stopwords.words('english')]
                    for word in words:
                        combo.append(stemmer.stem(word))

                    lowerwords = [x.lower() for x in combo if len(x) > 1]

                    def text_to_vector(text):
                        words = WORD.findall(text.lower())
                        return Counter(words)

                    # Making vectors
                    vector = text_to_vector(str(lowerwords))
                    del vector['u']  # drop the "u" unicode-prefix artifacts
                    vec = str(vector).replace("Counter", "")
                    print(vec)
                    c = Cluster.objects.get(id=1)
                    clus_id = c.cluster_id
                    i = 0
                    rank = 1
                    for posts in Post.objects.all():
                        threshold = 0
                        if posts.feed != planet_feed:
                            vec1 = ast.literal_eval(posts.myvector)
                            vec2 = ast.literal_eval(vec)
                            threshold = get_cosine(vec1, vec2)
                        if threshold > 0.28:
                            posts.rank += 1
                            posts.save()
                            i += 1
                            clus_id = posts.cluster_id
                    if i == 0:
                        c.cluster_id += 1
                        c.save()
                        clus_id = c.cluster_id
                        rank = 1
                    elif i > 0:
                        rank = i

                    post = Post(title=title,
                                url=url,
                                guid=guid,
                                content=content,
                                comments_url=comments_url,
                                image_url=image_url,
                                date_modified=date_modified,
                                feed=planet_feed,
                                cluster_id=clus_id,
                                rank=rank,
                                myvector=vec,
                                # the local "category" only exists when
                                # create=True, so read it off the saved feed
                                category=planet_feed.category)

                    # To have the feed entry in the pre_save signal
                    post.entry = entry
                    post.save()

                except PostAlreadyExists:
                    print("Skipping post {} ({}) because already exists"\
                        .format(guid, url))
                    if not create:
                        # if it is in update-mode then stop retrieving when
                        # it finds repeated posts
                        stop_retrieving = True
                else:
                    new_posts_count += 1
                    # create post tags...
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get(
                            "label")
                        tag_name = normalize_tag(tag_name)

                        if len(tag_name) > 50: continue

                        try:
                            if "/" in tag_name:
                                # For path based categories
                                for subtag in tag_name.split("/"):
                                    if subtag:
                                        # empty string if starts/ends with slash
                                        Tag.objects.add_tag(
                                            post, '"%s"' % subtag)
                            else:
                                Tag.objects.add_tag(post, '"%s"' % tag_name)
                        except AttributeError as e:
                            print("Ignoring tag error: {}".format(e))
                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type", "text/html"),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--"))

                    # create and store enclosures...
                    if entry.get('media_thumbnail', False):
                        try:
                            media_url = entry.get('media_thumbnail').href
                            media_list = [{"url": media_url}]
                        except AttributeError:
                            media_list = entry.get('media_thumbnail',
                                                   [{
                                                       "url": None
                                                   }])

                        for media in media_list:
                            media_url = media["url"]
                            mime_type, enc = mimetypes.guess_type(
                                urlparse(media_url).path)

                            post_enclosure, created = Enclosure.objects.get_or_create(
                                post=post,
                                length=0,
                                mime_type=mime_type,
                                link=media_url)

                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href"))
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")

                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=author,
                                                       post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=contributor,
                                                       post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor,
                                                 post=post,
                                                 is_contributor=True)
                            pad.save()

                    # We send a post_created signal
                    print('post_created.send(sender=post)', post)
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "{}?start-index={}&max-results={}".format(\
                    feed_url, len(entries) + 1, items_per_page)

                print("retrieving {}...".format(opensearch_url))
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print("{} posts were created. Done.".format(new_posts_count))

    print()
    return new_posts_count
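Example #3 calls a get_cosine helper that is not shown in the snippet. Assuming the vectors are the {token: count} dicts produced by text_to_vector above, a conventional implementation would be cosine similarity over sparse counts:

import math

def get_cosine(vec1, vec2):
    """Cosine similarity between two {token: count} dicts."""
    common = set(vec1) & set(vec2)
    numerator = sum(vec1[tok] * vec2[tok] for tok in common)
    denominator = (math.sqrt(sum(c * c for c in vec1.values())) *
                   math.sqrt(sum(c * c for c in vec2.values())))
    # identical token distributions give 1.0; disjoint ones give 0.0
    return (numerator / denominator) if denominator else 0.0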
Example #4
def process_feed(feed_url, owner_id=None, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """

    print("[process_feed] URL={}".format(feed_url))

    try:
        USER_AGENT = settings.PLANET["USER_AGENT"]
    except (KeyError, AttributeError):
        print(
            """Please set PLANET = {"USER_AGENT": <string>} in your settings.py""")
        exit(1)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print("*" * 20)
    print("Feed: {}".format(feed_url))

    if create and planet_feed:
        # can't create it because it already exists
        print("This feed already exists!")
        exit(1)

    if not create and not planet_feed:
        # can't update it because it does not exist
        print("This feed does not exist!")
        exit(1)

    # retrieve and parse feed using conditional GET method
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT,
                                modified=modified, etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # then create blog, feed, generator, feed links and feed tags
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")

        try:
            guid = str(md5(document.feed.get("link")).hexdigest())
        except TypeError:
            # Python 3: md5() needs bytes, not str
            guid = md5(document.feed.get("link").encode('utf-8')).hexdigest()
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')

        updated_parsed = document.get("updated_parsed")
        if updated_parsed:
            last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))
        else:
            last_modified = datetime.now()

        feed_links = document.feed.get("links", [])
        if not blog_url:
            link = [item for item in feed_links if item["rel"] == "alternate"]
            if link:
                blog_url = link[0]["href"]

        User = get_user_model()
        try:
            owner = User.objects.get(pk=owner_id)
        except User.DoesNotExist:
            owner = None

        blog, created = Blog.objects.get_or_create(
            url=blog_url, defaults={"title": title},
            owner=owner, short_name=urlparse(blog_url).netloc)

        generator_dict = document.feed.get("generator_detail", {})

        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        if category_title:
            # TODO: site_objects!
            category = Category.objects.get(title=category_title)
        else:
            category = None

        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
                           url=feed_url, rights=rights, info=info, guid=guid,
                           image_url=image_url, icon_url=icon_url, language=language,
                           etag=etag, last_modified=last_modified, generator=generator,
                           is_active=True, last_checked=datetime.now(),
                           site=current_site, category=category
                           )

        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")  # read but not stored in this example

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type", "text/html"),
                link=link_dict.get("href", blog_url)
            )

    entries = []

    total_results = int(
        document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print("No entries to store. status: {} {}".format(
            document.get("status"), document.get("debug_message")))
    else:
        print("Entries total count: {}".format(total_results))
        stop_retrieving = False

        while (total_results > len(entries)) and not stop_retrieving:

            # retrieve and store feed posts
            entries.extend(document.entries)
            print("Processing {} entries".format(len(document.entries)))

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                try:
                    guid = unicode(md5(entry.get("link")).hexdigest())
                except NameError:
                    guid = md5(entry.get("link").encode('utf-8')).hexdigest()
                content = entry.get('description') or entry.get(
                    "content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or\
                     entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = planet_feed.last_modified or datetime.now()

                try:
                    if Post.objects.filter(url=url, guid=guid).exists():
                        raise PostAlreadyExists
                    post = Post(title=title, url=url, guid=guid, content=content,
                                comments_url=comments_url, date_modified=date_modified,
                                feed=planet_feed)

                    select_matches = post.selectors()

                except PostAlreadyExists:
                    print("Skipping post {} ({}) because already exists"
                          .format(guid, url))
                    if not create:
                        # if it is in update-mode then stop retrieving when
                        # it finds repeated posts
                        stop_retrieving = True
                else:
                    if select_matches or not PLANET_CONFIG["FILTER_WITH_SELECTORS"]:
                        print("matches!", select_matches, post.title)

                        post.entry = entry
                        post.save()

                        print(" -" * 20)
                        print(post)
                        print(" -" * 20)

                        new_posts_count += 1

                        make_entry_tags(entry, post)
                        make_selector_tags(select_matches, post)
                        make_links(entry, post)
                        make_enclosures(entry, post)
                        check_content_images(post)
                        make_author_contributors(entry, post)

                        # We send a post_created signal
                        print('post_created.send(sender=post)', post)
                        post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "{}?start-index={}&max-results={}".format(
                    feed_url, len(entries) + 1, items_per_page)

                print("retrieving {}...".format(opensearch_url))
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print("{} posts were created. Done.".format(new_posts_count))
    return new_posts_count
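Example #4 moves the per-entry bookkeeping into helpers (make_entry_tags, make_links, make_enclosures, and so on) whose bodies are not shown. Judging from the inline loops in the other examples, make_links is presumably close to this sketch:

def make_links(entry, post):
    """Create one PostLink per link dict on the parsed entry."""
    for link_dict in entry.get("links", []):
        # get_or_create keeps re-runs idempotent, as in the other examples
        PostLink.objects.get_or_create(
            post=post,
            rel=link_dict.get("rel", "--"),
            mime_type=link_dict.get("type", "text/html"),
            link=link_dict.get("href", "--"),
            title=link_dict.get("title", "--"))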
Example #5
def process_feed(feed_url, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """
    def normalize_tag(tag):
        """
        converts things like "-noise-" to "noise" and "- noise -" to "noise"
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]

        ## fix for HTML entities
        tag = unicode(
            BeautifulStoneSoup(
                tag, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        tag = tag.strip().lower()
        return tag

    try:
        USER_AGENT = settings.USER_AGENT
    except AttributeError:
        print "Please set the variable USER_AGENT = <string> in your settings.py"
        exit(0)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    print "*" * 20
    print "Feed: %s" % feed_url

    if create and planet_feed:
        # can't create it because it already exists
        print "This feed already exists!"
        exit(1)

    if not create and not planet_feed:
        # can't update it because it does not exist
        print "This feed does not exist!"
        exit(1)

    # retrieve and parse feed using conditional GET method
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url,
                                agent=USER_AGENT,
                                modified=modified,
                                etag=etag)

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # then create blog, feed, generator, feed links and feed tags

        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        guid = document.feed.get("id")
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')
        updated_parsed = document.get("updated_parsed")
        if updated_parsed:
            last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))
        else:
            last_modified = datetime.now()

        feed_links = document.feed.get("links", [])
        if not blog_url:
            link = filter(lambda item: item["rel"] == "self", feed_links)
            if link:
                blog_url = link[0]["href"]
            else:
                link = filter(lambda item: item["rel"] == "alternate",
                              feed_links)
                if link:
                    blog_url = link[0]["href"]

        blog, created = Blog.objects.get_or_create(url=blog_url,
                                                   defaults={"title": title})

        generator_dict = document.feed.get("generator_detail", {})

        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        if category_title:
            ##TODO: site_objects!
            category = Category.objects.get(title=category_title)
        else:
            category = None

        planet_feed = Feed(title=title,
                           subtitle=subtitle,
                           blog=blog,
                           url=feed_url,
                           rights=rights,
                           info=info,
                           guid=guid,
                           image_url=image_url,
                           icon_url=icon_url,
                           language=language,
                           etag=etag,
                           last_modified=last_modified,
                           generator=generator,
                           is_active=True,
                           last_checked=datetime.now(),
                           site=current_site,
                           category=category)
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")
            if name:
                print name

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type") or '',
                link=link_dict.get("href", blog_url))

    entries = []

    total_results = int(
        document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        print "No entries to store. status: %s %s" % (
            document.get("status"), document.get("debug_message"))
    else:
        print "Entries total count: %d" % total_results
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:

            # retrieve and store feed posts
            entries.extend(document.entries)
            print "Processing %d entries" % len(document.entries)

            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                guid = entry.get("id")
                content = entry.get('description') or entry.get(
                    "content", [{
                        "value": ""
                    }])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or\
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = None

                try:
                    if Post.objects.filter(feed=planet_feed, guid=guid).exists():
                        raise PostAlreadyExists
                    post = Post(title=title,
                                url=url,
                                guid=guid,
                                content=content,
                                comments_url=comments_url,
                                date_modified=date_modified,
                                feed=planet_feed)
                    # To have the feed entry in the pre_save signal
                    post.entry = entry
                    post.save()
                except PostAlreadyExists:
                    print "Skipping post %s (%s) because already exists"\
                        % (guid, url)
                    if not create:
                        # if it is in update-mode then stop retrieving when
                        # it finds repeated posts
                        stop_retrieving = True
                else:
                    new_posts_count += 1
                    # create post tags...
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get(
                            "label")
                        tag_name = normalize_tag(tag_name)

                        if len(tag_name) > 50: continue

                        try:
                            if "/" in tag_name:
                                # For path based categories
                                for subtag in tag_name.split("/"):
                                    if subtag:
                                        # empty string if starts/ends with slash
                                        Tag.objects.add_tag(
                                            post, '"%s"' % subtag)
                            else:
                                Tag.objects.add_tag(post, '"%s"' % tag_name)
                        except AttributeError, e:
                            print "Ignoring tag error: %s" % e
                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type", ""),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--"))

                    # create and store enclosures...
                    for media_thumbnail in entry.get('media_thumbnail', []):
                        # don't shadow the entry url computed above
                        thumb_url = media_thumbnail.get('url')
                        if not thumb_url:
                            continue
                        mime_type, enc = mimetypes.guess_type(
                            urlparse(thumb_url).path)
                        post_enclosure, created = Enclosure.objects.get_or_create(
                            post=post,
                            length=0,
                            mime_type=mime_type or '',
                            link=thumb_url)
                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href"))
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")

                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=author,
                                                       post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href"))
                        try:
                            PostAuthorData.objects.get(author=contributor,
                                                       post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor,
                                                 post=post,
                                                 is_contributor=True)
                            pad.save()

                    # We send a post_created signal
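                    # (sender here is the Post instance itself, not a class)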
                    print 'post_created.send(sender=post)', post
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
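                # request the next result page; OpenSearch start-index
                # is 1-based, hence len(entries) + 1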
                opensearch_url = "%s?start-index=%d&max-results=%d" %\
                    (feed_url, len(entries) + 1, items_per_page)

                print "retrieving %s..." % opensearch_url
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print "%d posts were created. Done." % new_posts_count
Example #6
0
def process_feed(feed_url, create=False, category_title=None):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """


    try:
        USER_AGENT = settings.USER_AGENT
    except AttributeError:
        plogger.error("Please set the variable USER_AGENT = <string> in your settings.py")
        exit(0)

    feed_url = str(feed_url).strip()

    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    plogger.debug("*" * 20)
    plogger.debug("Feed: %s" % feed_url)

    if create and planet_feed:
        # can't create it because it already exists
        plogger.error("This feed already exists!")
        exit(0)

    if not create and not planet_feed:
        # can't update it because it does not exist
        plogger.error("This feed does not exist!")
        exit(0)

    # retrieve and parse feed using conditional GET method
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None

    document = feedparser.parse(feed_url, agent=USER_AGENT,
                                modified=modified, etag=etag)
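    # feedparser sends the etag/modified values as If-None-Match and
    # If-Modified-Since request headers, so an unchanged feed costs a
    # single 304 round-trip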

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # then create blog, feed, generator, feed links and feed tags

        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        guid = document.feed.get("id")
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')
        updated_parsed = document.get("updated_parsed", time.localtime())
        last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))

        feed_links = document.feed.get("links", [])
        if not blog_url:
            link = filter(lambda item: item["rel"]=="alternate", feed_links)
            if link:
                blog_url = link[0]["href"]

        blog, created = Blog.objects.get_or_create(
            url=blog_url, defaults={"title": title})

        generator_dict = document.feed.get("generator_detail", {})

        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None

        if category_title:
            ##TODO: site_objects!
            category = Category.objects.get(title=category_title)
        else:
            category = None

        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
            url=feed_url, rights=rights, info=info, guid=guid,
            image_url=image_url, icon_url=icon_url, language=language,
            etag=etag, last_modified=last_modified, generator=generator,
            is_active=True, last_checked=datetime.now(),
            site=current_site, category=category
        )
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")
            if name:
                plogger.debug(name)

        for link_dict in feed_links:
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type"),
                link=link_dict.get("href", blog_url)
            )

    entries = []

    total_results = int(document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    new_posts_count = 0

    if total_results == 0:
        plogger.debug("No entries to store. status: %s %s" % (document.get("status"), document.get("debug_message")))
    else:
        plogger.debug("Entries total count: %d" % total_results)
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:

            # retrieve and store feed posts
            entries.extend(document.entries)
            plogger.debug("Processing %d entries" % len(document.entries))

            for entry in document.entries:
                url = entry.get("link")
                guid = entry.get("link", "")

                if Post.objects.filter(url=url, guid=guid):
                    if not create:
                        # if it is in update-mode then stop retrieving when
                        # it finds repeated posts
                        stop_retrieving = True
                        break
                    else:
                        plogger.debug("Skipping post %s (%s) because already exists" % (guid, url))
                        continue
                else:

                    post = createPost(entry, planet_feed)
                    new_posts_count += 1

                    # create post tags...
                    createTags(post, entry)

                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type"),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--")
                        )

                    # create and store enclosures...
                    if entry.get('media_thumbnail', False):

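                        # feedparser exposes media:thumbnail as a list of
                        # dicts; only the first thumbnail is stored here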
                        thumbnail = entry['media_thumbnail'][0]

                        mime_type, enc = mimetypes.guess_type(urlparse(thumbnail['url']).path)
                        post_enclosure, created = Enclosure.objects.get_or_create(
                            post=post,
                            length=0,
                            mime_type=mime_type or '',
                            link=thumbnail['url']
                        )
                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href")
                        )
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")

                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href")
                        )
                        try:
                            PostAuthorData.objects.get(author=author, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href")
                        )
                        try:
                            PostAuthorData.objects.get(author=contributor, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor, post=post,
                                is_contributor=True)
                            pad.save()

                    # We send a post_created signal
                    plogger.debug('post_created.send(sender=%s)', str(post))
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "%s?start-index=%d&max-results=%d" %\
                    (feed_url, len(entries) + 1, items_per_page)

                plogger.debug("retrieving %s..." % opensearch_url)
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        if new_posts_count:
            # update last modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        plogger.debug("%d posts were created. Done." % new_posts_count)

    return new_posts_count
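
A minimal usage sketch for the variant above (hypothetical wiring: the
planet.models import path and the update_all_feeds helper are assumptions
for illustration, not part of the original snippet):

# Hypothetical driver: run process_feed() in update mode over every
# active feed and report how many new posts were stored in total.
from planet.models import Feed

def update_all_feeds():
    total_new = 0
    for feed in Feed.objects.filter(is_active=True):
        # create=False only stores entries newer than the last run
        total_new += process_feed(feed.url, create=False)
    return total_new
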
Example #7
0
def process_feed(feed_url, create=False):
    """
    Stores a feed, its related data, its entries and their related data.
    If create=True then it creates the feed, otherwise it only stores new
    entries and their related data.
    """

    def normalize_tag(tag):
        """
        converts things like "-noise-" to "noise" and "- noise -" to "noise"
        """
        if tag.startswith("-"):
            tag = tag[1:]
        if tag.endswith("-"):
            tag = tag[:-1]
        tag = tag.strip()
        return tag

    try:
        USER_AGENT = settings.USER_AGENT
    except AttributeError:
        print "Please set the variable USER_AGENT = <string> in your settings.py"
        exit(0)

    feed_url = str(feed_url).strip()
    
    try:
        planet_feed = Feed.objects.get(url=feed_url)
    except Feed.DoesNotExist:
        planet_feed = None

    if create and planet_feed:
        # can't create it because it already exists
        print "This feed already exists!"
        exit(0)

    if not create and not planet_feed:
        # can't update it because it does not exist
        print "This feed does not exist!"
        exit(0)

    # retrieve and parse feed using conditional GET method
    if not create:
        modified = datetime.timetuple(planet_feed.last_modified)
        etag = planet_feed.etag
        # update last checked datetime
        planet_feed.last_checked = datetime.now()
        planet_feed.save()
    else:
        modified = etag = None
    
    document = feedparser.parse(feed_url, agent=USER_AGENT,
        modified=modified, etag=etag)
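    # on a 304 Not Modified response the parsed document has no entries,
    # so the total_results == 0 branch below just reports and stops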

    current_site = Site.objects.get(pk=settings.SITE_ID)

    if create:
        # then create blog, feed, generator, feed links and feed tags
        
        title = document.feed.get("title", "--")
        subtitle = document.feed.get("subtitle")
        blog_url = document.feed.get("link")
        rights = document.feed.get("rights") or document.feed.get("license")
        info = document.feed.get("info")
        guid = document.feed.get("id")
        image_url = document.feed.get("image", {}).get("href")
        icon_url = document.feed.get("icon")
        language = document.feed.get("language")
        etag = document.get("etag", '')
        # updated_parsed is a time.struct_time; convert it, falling back
        # to now() when the feed carries no update timestamp
        updated_parsed = document.get("updated_parsed")
        if updated_parsed:
            last_modified = datetime.fromtimestamp(time.mktime(updated_parsed))
        else:
            last_modified = datetime.now()

        blog, created = Blog.objects.get_or_create(
            url=blog_url, defaults={"title": title})

        generator_dict = document.feed.get("generator_detail", {})
    
        if generator_dict:
            generator, created = Generator.objects.get_or_create(
                name=generator_dict.get("name", "--"),
                link=generator_dict.get("link"),
                version=generator_dict.get("version"))
        else:
            generator = None
    
        planet_feed = Feed(title=title, subtitle=subtitle, blog=blog,
            url=feed_url, rights=rights, info=info, guid=guid,
            image_url=image_url, icon_url=icon_url, language=language,
            etag=etag, last_modified=last_modified, generator=generator,
            is_active=True, last_checked=datetime.now(),
            site=current_site
        )
        planet_feed.save()

        for tag_dict in document.feed.get("tags", []):
            name = tag_dict.get("term")
            if name:
                print name

        for link_dict in document.feed.get("links", []):
            feed_link, created = FeedLink.objects.get_or_create(
                feed=planet_feed,
                rel=link_dict.get("rel", "--"),
                mime_type=link_dict.get("type"),
                link=link_dict.get("href", blog_url)
            )

    entries = []
    total_results = int(document.feed.get("opensearch_totalresults", len(document.entries)))
    items_per_page = int(document.feed.get("opensearch_itemsperpage", 25))
    
    if total_results == 0:
        print "*" * 20
        print "Feed: %s" % planet_feed.url
        print "No entries to store. Exiting..."
    
    else:
        print "Entries total count: %d" % total_results
        print
        new_posts_count = 0
        stop_retrieving = False
        while (total_results > len(entries)) and not stop_retrieving:

            # retrieve and store feed posts
            entries.extend(document.entries)
            print "Processing %d entries" % len(document.entries)
            
            for entry in document.entries:
                title = entry.get("title", "")
                url = entry.get("link")
                guid = entry.get("link")
                content = entry.get('description') or entry.get("content", [{"value": ""}])[0]["value"]
                comments_url = entry.get("comments")
                date_modified = entry.get("updated_parsed") or\
                    entry.get("published_parsed")
                try:
                    date_modified = datetime.fromtimestamp(
                        time.mktime(date_modified))
                except Exception:
                    date_modified = None

                try:
                    if Post.objects.filter(url=url, guid=guid).exists():
                        raise PostAlreadyExists
                    post = Post(title=title, url=url, guid=guid, content=content,
                        comments_url=comments_url, date_modified=date_modified,
                        feed=planet_feed)
                    # To have the feed entry in the pre_save signal
                    post.entry = entry
                    post.save()
                except PostAlreadyExists:
                    print "Skipping post %s (%s) because already exists"\
                        % (guid, url)
                    if not create:
                        # if it is in update-mode then stop retrieving when
                        # it finds repeated posts
                        stop_retrieving = True
                
                else:
                    new_posts_count += 1
                    # create post tags...
                    for tag_dict in entry.get("tags", []):
                        tag_name = tag_dict.get("term") or tag_dict.get("label")
                        tag_name = tag_name[:255]
                        tag_name = normalize_tag(tag_name)
                        Tag.objects.add_tag(post, '"%s"' % tag_name)

                    # create post links...
                    for link_dict in entry.get("links", []):
                        post_link, created = PostLink.objects.get_or_create(
                            post=post,
                            rel=link_dict.get("rel", "--"),
                            mime_type=link_dict.get("type"),
                            link=link_dict.get("href", "--"),
                            title=link_dict.get("title", "--")
                        )

                    # create and store enclosures...
                    if entry.get('media_thumbnail'):
                        # media_thumbnail is a list of dicts, not an object
                        # with an href attribute; use the first thumbnail
                        thumbnail = entry['media_thumbnail'][0]
                        mime_type, enc = mimetypes.guess_type(
                            urlparse(thumbnail['url']).path)
                        post_enclosure, created = Enclosure.objects.get_or_create(
                            post=post,
                            length=0,
                            mime_type=mime_type or '',
                            link=thumbnail['url']
                        )
                    for enclosure_dict in entry.get("enclosures", []):
                        post_enclosure = Enclosure(
                            post=post,
                            length=enclosure_dict.get("length", 0),
                            mime_type=enclosure_dict.get("type", ""),
                            link=enclosure_dict.get("href")
                        )
                        post_enclosure.save()

                    # create and store author...
                    author_dict = entry.get("author_detail")
                    if author_dict:
                        author, created = Author.objects.get_or_create(
                            name=author_dict.get("name", ""),
                            email=author_dict.get("email", ""),
                            profile_url=author_dict.get("href")
                        )
                        try:
                            PostAuthorData.objects.get(author=author, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=author, post=post)
                            pad.save()

                    # create and store contributors...
                    for contributor_dict in entry.get("contributors", []):
                        contributor, created = Author.objects.get_or_create(
                            name=contributor_dict.get("name", ""),
                            email=contributor_dict.get("email", ""),
                            profile_url=contributor_dict.get("href")
                        )
                        try:
                            PostAuthorData.objects.get(author=contributor, post=post)
                        except PostAuthorData.DoesNotExist:
                            pad = PostAuthorData(author=contributor, post=post,
                                is_contributor=True)
                            pad.save()
                    
                    # We send a post_created signal
                    print 'post_created.send(sender=post)', post
                    post_created.send(sender=post, instance=post)

            if not stop_retrieving:
                opensearch_url = "%s?start-index=%d&max-results=%d" %\
                    (feed_url, len(entries) + 1, items_per_page)

                print "retriving %s..." % opensearch_url
                document = feedparser.parse(opensearch_url, agent=USER_AGENT)

        print "*" * 20
        print "Feed: %s" % planet_feed.url
        if new_posts_count:
            # update last modified datetime
            planet_feed.last_modified = datetime.now()
            planet_feed.save()
        print "%d posts were created. Done." % new_posts_count