def parse_feed_xml(source_feed, feed_content, output):
    """Parse an RSS/Atom feed and create or update its posts.

    Returns (ok, changed) where `changed` is True if any new post was created.
    """

    ok = True
    changed = False

    # output.write(ret.content)
    try:
        _customize_sanitizer(parser)
        f = parser.parse(feed_content)  # need to start checking feed parser errors here
        entries = f["entries"]
        if len(entries):
            source_feed.last_success = timezone.now()  # in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            ok = False
    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        ok = False

    if ok:
        try:
            source_feed.name = update_source_name(source_feed.name, f.feed.title)
        except Exception:
            pass

        try:
            source_feed.site_url = f.feed.link
        except Exception:
            pass

        try:
            source_feed.image_url = f.feed.image.href
        except Exception:
            pass

        # either of these is fine, prefer description over summary
        # also feedparser will give us itunes:summary etc if there
        try:
            source_feed.description = f.feed.summary
        except Exception:
            pass

        try:
            source_feed.description = f.feed.description
        except Exception:
            pass

        # output.write(entries)
        entries.reverse()  # Entries are typically in reverse chronological order - put them in the right order

        for e in entries:
            # we are going to take the longest of content/summary/description
            body = ""
            if hasattr(e, "content"):
                for c in e.content:
                    if len(c.value) > len(body):
                        body = c.value

            if hasattr(e, "summary"):
                if len(e.summary) > len(body):
                    body = e.summary

            if hasattr(e, "summary_detail"):
                if len(e.summary_detail.value) > len(body):
                    body = e.summary_detail.value

            if hasattr(e, "description"):
                if len(e.description) > len(body):
                    body = e.description

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e.guid
            except Exception as ex:
                try:
                    guid = e.link
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
                output.write("EXISTING " + guid + "\n")
            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e.title
            except Exception:
                title = ""

            try:
                p.link = e.link
            except Exception:
                p.link = ""
            p.title = title

            try:
                p.image_url = e.image.href
            except Exception:
                pass

            try:
                # If there is no published_parsed entry, try updated_parsed
                if "published_parsed" in e:
                    time_struct = e.published_parsed
                else:
                    time_struct = e.updated_parsed
                p.created = datetime.datetime.fromtimestamp(
                    time.mktime(time_struct)).replace(tzinfo=timezone.utc)
            except Exception:
                output.write("CREATED ERROR")

            p.guid = guid
            try:
                p.author = e.author
            except Exception as ex:
                p.author = ""

            try:
                p.save()
                # output.write(p.body)
            except Exception as ex:
                # import pdb; pdb.set_trace()
                output.write(str(ex))

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    for pe in e["enclosures"]:
                        if pe["href"] == ee.href and ee.href not in seen_files:
                            found_enclosure = True

                            try:
                                ee.length = int(pe["length"])
                            except Exception:
                                ee.length = 0

                            try:
                                file_type = pe["type"]
                            except Exception:
                                file_type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe

                            ee.type = file_type
                            ee.save()
                            break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)

                for pe in e["enclosures"]:
                    try:
                        if pe["href"] not in seen_files:
                            try:
                                length = int(pe["length"])
                            except Exception:
                                length = 0

                            try:
                                file_type = pe["type"]
                            except Exception:
                                file_type = "audio/mpeg"

                            ee = Enclosure(post=p, href=pe["href"], length=length, type=file_type)
                            ee.save()
                    except Exception:
                        pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))

            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)

            try:
                if "tags" in e:
                    for t in e.tags:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                output.write(str(ex))
                output.write(f"couldn't add tag {tag} to post {p}")

    return (ok, changed)
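# --- Illustrative sketch (not part of the original code) -------------------
# A minimal example of how parse_feed_xml might be driven. It assumes the
# source model exposes a feed_url field and that `requests` is available in
# the project; both are assumptions, not taken from this module. Any object
# with a .write(str) method (sys.stdout here) works as `output`.
def _example_refresh_xml_source(source_feed):
    import sys

    import requests  # assumed dependency

    resp = requests.get(source_feed.feed_url, timeout=30)  # assumed field name
    ok, changed = parse_feed_xml(source_feed, resp.content, sys.stdout)
    if ok:
        source_feed.save()  # persist last_success / name / description updates
    return ok, changed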
def parse_feed_json(source_feed, feed_content, output):
    """Parse a JSON Feed and create or update its posts.

    Returns (ok, changed) where `changed` is True if any new post was created.
    """

    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f["items"]
        if len(entries):
            source_feed.last_success = timezone.now()  # in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False
    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:
        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now, set source_feed.interval to max
            source_feed.interval = 24 * 3 * 60
            source_feed.last_result = "This feed has expired"
            return (False, False, source_feed.interval)

        try:
            source_feed.site_url = f["home_page_url"]
            source_feed.name = update_source_name(source_feed.name, f["title"])
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(parser)
            source_feed.description = parser._sanitizeHTML(f["description"], "utf-8", "text/html")

        _customize_sanitizer(parser)
        source_feed.name = update_source_name(
            source_feed.name,
            parser._sanitizeHTML(source_feed.name, "utf-8", "text/html"),
        )

        if "icon" in f:
            source_feed.image_url = f["icon"]

        # output.write(entries)
        entries.reverse()  # Entries are typically in reverse chronological order - put them in the right order

        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
                output.write("EXISTING " + guid + "\n")
            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(parser)
            body = parser._sanitizeHTML(body, "utf-8", "text/html")  # TODO: validate charset ??
            _customize_sanitizer(parser)
            title = parser._sanitizeHTML(title, "utf-8", "text/html")  # TODO: validate charset ??

            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]

            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ""

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                output.write("CREATED ERROR")
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:
                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True

                                try:
                                    ee.length = int(pe["size_in_bytes"])
                                except Exception:
                                    ee.length = 0

                                try:
                                    file_type = pe["mime_type"]
                                except Exception:
                                    file_type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe

                                ee.type = file_type
                                ee.save()
                                break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:
                        try:
                            if pe["url"] not in seen_files:
                                try:
                                    length = int(pe["size_in_bytes"])
                                except Exception:
                                    length = 0

                                try:
                                    file_type = pe["mime_type"]
                                except Exception:
                                    file_type = "audio/mpeg"

                                ee = Enclosure(post=p, href=pe["url"], length=length, type=file_type)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))

            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)

            try:
                if "tags" in e:
                    for t in e["tags"]:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                output.write(str(ex))
                output.write(f"couldn't add tag {tag} to post {p}")

    return (ok, changed)
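# --- Illustrative sketch (not part of the original code) -------------------
# parse_feed_xml and parse_feed_json share the same signature, so a caller can
# pick one per payload. Sniffing for a leading "{" is an assumption used here
# for brevity; a real caller might prefer the HTTP Content-Type header.
def _example_parse_any(source_feed, feed_content, output):
    text = feed_content.decode("utf-8", errors="replace") if isinstance(
        feed_content, bytes) else feed_content
    if text.lstrip().startswith("{"):
        return parse_feed_json(source_feed, text, output)
    return parse_feed_xml(source_feed, feed_content, output)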
def parse_feed_json(source_feed, feed_content, output):
    """Parse a JSON Feed and create or update its posts.

    Returns (ok, changed) where `changed` is True if any new post was created.
    """

    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f['items']
        if entries:
            source_feed.last_success = timezone.now()  # in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False
    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:
        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now, set source_feed.interval to max
            source_feed.interval = (24 * 3 * 60)
            source_feed.last_result = "This feed has expired"
            return (False, False, source_feed.interval)

        try:
            source_feed.site_url = f["home_page_url"]
            if not source_feed.name:
                source_feed.name = f["title"]
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(feedparser)
            source_feed.description = feedparser._sanitizeHTML(f["description"], "utf-8", 'text/html')

        _customize_sanitizer(feedparser)
        if not source_feed.name:
            source_feed.name = feedparser._sanitizeHTML(source_feed.name, "utf-8", 'text/html')

        if "icon" in f:
            source_feed.image_url = f["icon"]

        entries.reverse()  # Entries are typically in reverse chronological order - put them in the right order

        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
                logging.info("EXISTING: %s", guid)
            except Exception as ex:
                logging.info("Creating new post %s.", guid)
                p = Post(index=0, body=' ')
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(feedparser)
            body = feedparser._sanitizeHTML(body, "utf-8", 'text/html')  # TODO: validate charset ??
            _customize_sanitizer(feedparser)
            title = feedparser._sanitizeHTML(title, "utf-8", 'text/html')  # TODO: validate charset ??

            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]

            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ''

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                logging.exception('Unable to parse published date.')
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:
                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True
                                ee.length = int(pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee.type = typ
                                ee.save()
                                break
                    # DANGEROUS! This deletes everything if a glitch in the feed removes enclosures.
                    # if not found_enclosure:
                    #     ee.delete()
                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:
                        try:
                            # Since many RSS feeds embed trackers into their URLs that constantly change, yet almost
                            # always include only a single enclosure, we'll only create a new enclosure for a new url
                            # if there are no enclosure records created yet. This is a more robust way of preventing
                            # logical duplicates due to tracker URL changes than trying to predict and strip out all
                            # known tracker prefixes.
                            if pe["url"] not in seen_files and not p.enclosures.all().exists():
                                length = int(pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee = Enclosure(post=p, href=pe["url"], length=length, type=typ)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                logging.exception("No enclosures")

            try:
                p.body = body
                p.save()
            except Exception as ex:
                logging.exception('Unable to save body A2.')

    return (ok, changed)
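# --- Illustrative sketch (not part of the original code) -------------------
# Both JSON parsers fall back through "id", then "url", then an md5 of the
# body when a feed omits an identifier. A close standalone equivalent of that
# fallback, for reference; the helper name is an assumption.
def _example_guid_for_json_entry(entry, body):
    import hashlib

    for key in ("id", "url"):
        if key in entry:
            return entry[key]
    return hashlib.md5(body.encode("utf-8")).hexdigest()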