Example #1
def parse_feed_xml(source_feed, feed_content, output):

    ok = True
    changed = False

    # output.write(ret.content)
    try:

        _customize_sanitizer(parser)
        # need to start checking feed parser errors here
        f = parser.parse(feed_content)
        entries = f["entries"]
        if entries:
            # in case we start auto unsubscribing long dead feeds
            source_feed.last_success = timezone.now()
        else:
            source_feed.last_result = "Feed is empty"
            ok = False

    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        ok = False

    if ok:
        try:
            source_feed.name = update_source_name(source_feed.name,
                                                  f.feed.title)
        except Exception:
            pass

        try:
            source_feed.site_url = f.feed.link
        except Exception:
            pass

        try:
            source_feed.image_url = f.feed.image.href
        except Exception:
            pass

        # either of these is fine; prefer description over summary
        # feedparser will also give us itunes:summary etc. if it's there
        try:
            source_feed.description = f.feed.summary
        except Exception:
            pass

        try:
            source_feed.description = f.feed.description
        except Exception:
            pass

        # output.write(entries)
        # Entries are typically in reverse chronological order - put them in the right order
        entries.reverse()
        for e in entries:
            # we are going to take the longest of the available body fields
            body = ""

            if hasattr(e, "content"):
                for c in e.content:
                    if len(c.value) > len(body):
                        body = c.value

            if hasattr(e, "summary"):
                if len(e.summary) > len(body):
                    body = e.summary

            if hasattr(e, "summary_detail"):
                if len(e.summary_detail.value) > len(body):
                    body = e.summary_detail.value

            if hasattr(e, "description"):
                if len(e.description) > len(body):
                    body = e.description

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e.guid
            except Exception as ex:
                try:
                    guid = e.link
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed, guid=guid)[0]
                output.write("EXISTING " + guid + "\n")

            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e.title
            except Exception:
                title = ""

            try:
                p.link = e.link
            except Exception:
                p.link = ""
            p.title = title

            try:
                p.image_url = e.image.href
            except Exception:
                pass

            try:
                # If there is no published_parsed entry, try updated_parsed
                if "published_parsed" in e:
                    time_struct = e.published_parsed
                else:
                    time_struct = e.updated_parsed

                p.created = datetime.datetime.fromtimestamp(
                    time.mktime(time_struct)
                ).replace(tzinfo=timezone.utc)

            except Exception:
                output.write("CREATED ERROR\n")

            p.guid = guid
            try:
                p.author = e.author
            except Exception as ex:
                p.author = ""

            try:
                p.save()
                # output.write(p.body)
            except Exception as ex:
                # import pdb; pdb.set_trace()
                output.write(str(ex))

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    for pe in e["enclosures"]:

                        if pe["href"] == ee.href and ee.href not in seen_files:
                            found_enclosure = True

                            try:
                                ee.length = int(pe["length"])
                            except Exception:
                                ee.length = 0

                            try:
                                file_type = pe["type"]
                            except Exception:
                                # we are assuming podcasts here but that's probably not safe
                                file_type = "audio/mpeg"

                            ee.type = file_type
                            ee.save()
                            break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)

                for pe in e["enclosures"]:
                    try:
                        if pe["href"] not in seen_files:

                            try:
                                length = int(pe["length"])
                            except Exception:
                                length = 0

                            try:
                                file_type = pe["type"]
                            except Exception:
                                file_type = "audio/mpeg"

                            ee = Enclosure(post=p,
                                           href=pe["href"],
                                           length=length,
                                           type=file_type)
                            ee.save()
                    except Exception:
                        pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))

            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)

            try:
                if "tags" in e:
                    for t in e.tags:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                # don't reference `tag` here - it may never have been assigned
                output.write(str(ex))
                output.write(f"couldn't add tags to post {p}")

    return (ok, changed)
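
A minimal sketch of how a refresh task might drive parse_feed_xml, assuming a Django SourceFeed model and the requests library; fetch_and_parse, the feed_url field, and the timeout value are illustrative assumptions, not part of the original code.

import sys

import requests


def fetch_and_parse(source_feed):
    # fetch the raw feed body and hand it to the parser above;
    # parse_feed_xml mutates source_feed (last_success, name, etc.),
    # so persist it once parsing is done
    resp = requests.get(source_feed.feed_url, timeout=30)  # feed_url is assumed
    ok, changed = parse_feed_xml(source_feed, resp.content, sys.stdout)
    source_feed.save()
    return (ok, changed)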
Example #2
def parse_feed_json(source_feed, feed_content, output):

    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f["items"]
        if entries:
            # in case we start auto unsubscribing long dead feeds
            source_feed.last_success = timezone.now()
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False

    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:

        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now, push source_feed.interval to max
            source_feed.interval = 24 * 3 * 60
            source_feed.last_result = "This feed has expired"
            return (False, False)

        try:
            source_feed.site_url = f["home_page_url"]
            source_feed.name = update_source_name(source_feed.name, f["title"])
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(parser)
            source_feed.description = parser._sanitizeHTML(
                f["description"], "utf-8", "text/html")

        _customize_sanitizer(parser)
        source_feed.name = update_source_name(
            source_feed.name,
            parser._sanitizeHTML(source_feed.name, "utf-8", "text/html"),
        )

        if "icon" in f:
            source_feed.image_url = f["icon"]

        # output.write(entries)
        # Entries are typically in reverse chronological order - put them in the right order
        entries.reverse()
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed, guid=guid)[0]
                output.write("EXISTING " + guid + "\n")

            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(parser)
            body = parser._sanitizeHTML(
                body, "utf-8", "text/html")  # TODO: validate charset ??
            _customize_sanitizer(parser)
            title = parser._sanitizeHTML(
                title, "utf-8", "text/html")  # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]

            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ""

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                output.write("CREATED ERROR")
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:

                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True

                                try:
                                    ee.length = int(pe["size_in_bytes"])
                                except Exception:
                                    ee.length = 0

                                try:
                                    file_type = pe["mime_type"]
                                except Exception:
                                    # we are assuming podcasts here but that's probably not safe
                                    file_type = "audio/mpeg"

                                ee.type = file_type
                                ee.save()
                                break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:

                        try:
                            if pe["url"] not in seen_files:

                                try:
                                    length = int(pe["size_in_bytes"])
                                except Exception:
                                    length = 0

                                try:
                                    filetype = pe["mime_type"]
                                except Exception:
                                    filetype = "audio/mpeg"

                                ee = Enclosure(post=p,
                                               href=pe["url"],
                                               length=length,
                                               type=filetype)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))

            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)

            try:
                if "tags" in e:
                    for t in e["tags"]:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                # don't reference `tag` here - it may never have been assigned
                output.write(str(ex))
                output.write(f"couldn't add tags to post {p}")

    return (ok, changed)
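
For reference, a minimal JSON Feed document that exercises the fields parse_feed_json reads (title, home_page_url, icon, and items with content_html, date_published, and attachments); the values here are illustrative only, not taken from a real feed.

import sys

sample_feed = """{
  "version": "https://jsonfeed.org/version/1",
  "title": "Example Podcast",
  "home_page_url": "https://example.com/",
  "icon": "https://example.com/icon.png",
  "items": [{
    "id": "urn:example:episode-1",
    "url": "https://example.com/episodes/1",
    "title": "Episode 1",
    "content_html": "<p>Show notes here.</p>",
    "date_published": "2023-01-01T00:00:00Z",
    "attachments": [{
      "url": "https://example.com/episodes/1.mp3",
      "mime_type": "audio/mpeg",
      "size_in_bytes": 123456
    }]
  }]
}"""

# ok, changed = parse_feed_json(source_feed, sample_feed, sys.stdout)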
Example #3
def parse_feed_json(source_feed, feed_content, output):

    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f['items']
        if entries:
            # in case we start auto unsubscribing long dead feeds
            source_feed.last_success = timezone.now()
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False

    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:

        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now, push source_feed.interval to max
            source_feed.interval = 24 * 3 * 60
            source_feed.last_result = "This feed has expired"
            return (False, False)

        try:
            source_feed.site_url = f["home_page_url"]
            if not source_feed.name:
                source_feed.name = f["title"]
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(feedparser)
            source_feed.description = feedparser._sanitizeHTML(
                f["description"], "utf-8", 'text/html')

        _customize_sanitizer(feedparser)
        if source_feed.name:
            source_feed.name = feedparser._sanitizeHTML(
                source_feed.name, "utf-8", 'text/html')

        if "icon" in f:
            source_feed.image_url = f["icon"]

        # Entries are typically in reverse chronological order - put them in the right order
        entries.reverse()
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed, guid=guid)[0]
                logging.info("EXISTING: %s", guid)
            except Exception as ex:
                logging.info("Creating new post %s.", guid)
                p = Post(index=0, body=' ')
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(feedparser)
            body = feedparser._sanitizeHTML(
                body, "utf-8", 'text/html')  # TODO: validate charset ??
            _customize_sanitizer(feedparser)
            title = feedparser._sanitizeHTML(
                title, "utf-8", 'text/html')  # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]

            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ''

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                logging.exception('Unable to parse published date.')
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:
                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True
                                ee.length = int(pe.get("size_in_bytes") or 0)
                                ee.type = pe.get("mime_type") or "audio/mpeg"
                                ee.save()
                                break

                    # DANGEROUS! This deletes everything if a glitch in the feed removes enclosures.
                    # if not found_enclosure:
                    #     ee.delete()

                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:
                        try:
                            # Many feeds embed ever-changing trackers in their enclosure URLs,
                            # yet almost always include only a single enclosure, so we only
                            # create a new enclosure for an unseen URL when no enclosure
                            # records exist yet. This is a more robust way of preventing
                            # logical duplicates caused by tracker URL changes than trying
                            # to predict and strip out every known tracker prefix.
                            if pe["url"] not in seen_files and not p.enclosures.all(
                            ).exists():
                                length = int(
                                    pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee = Enclosure(post=p,
                                               href=pe["url"],
                                               length=length,
                                               type=typ)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                logging.exception("No enclosures")

            try:
                p.body = body
                p.save()
            except Exception as ex:
                logging.exception('Unable to save body A2.')

    return (ok, changed)
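
All three examples share the same GUID fallback chain: the feed-supplied id (guid/id), then the entry URL (link/url), then an MD5 of the body. A sketch of how that chain could be factored into a helper; make_guid is a hypothetical name, not part of the original code.

import hashlib


def make_guid(entry, body, id_key="id", url_key="url"):
    # hypothetical helper mirroring the fallback chain above: prefer the
    # feed-supplied id, then the entry URL, then a hash of the body so
    # repeated parses of the same entry produce a stable GUID.
    # For the XML parser, pass id_key="guid" and url_key="link".
    if entry.get(id_key):
        return entry[id_key]
    if entry.get(url_key):
        return entry[url_key]
    return hashlib.md5(body.encode("utf-8")).hexdigest()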