Exemple #1
0
    def update(self, parser_entry):
        """Synchronize this entry's attributes from a feedparser entry.

        Attributes are reassigned only when the incoming value differs
        from the current one; ``_marker`` distinguishes "attribute not
        set" from "attribute is None".
        """
        # Plain fields: copy straight across when they differ.
        for name in self._entry_fields:
            incoming = parser_entry.get(name)
            if getattr(self, name, _marker) != incoming:
                setattr(self, name, incoming)

        # Date fields: feedparser exposes parsed dates as time 9-tuples
        # under "<field>_parsed"; keep the first six items (Y/M/D H:M:S).
        for name in self._date_fields:
            parsed = parser_entry.get(name + '_parsed')
            when = datetime.datetime(*parsed[:6]) if parsed is not None else None
            if getattr(self, name, _marker) != when:
                setattr(self, name, when)

        # Content: join every sanitized content block into one HTML string.
        entry_content = parser_entry.get('content')
        if not entry_content:
            content_html = None
        else:
            # *Always* sanitize the content HTML.  FeedParser sometimes
            # doesn't sanitize, such as when the input is base64 encoded.
            content_html = '\n'.join(
                feedparser._sanitizeHTML(part.value, 'utf-8')
                for part in entry_content)
        if getattr(self, 'content_html', _marker) != content_html:
            self.content_html = content_html
Exemple #2
0
    def update(self, parser_entry):
        """Refresh this entry's attributes from *parser_entry*.

        ``parser_entry`` is an entry mapping as produced by
        ``feedparser.parse``.  Attributes are only reassigned when the new
        value differs from the current one (``_marker`` distinguishes
        "attribute missing" from "attribute is None").
        """
        # Copy the simple fields verbatim.
        for field in self._entry_fields:
            value = parser_entry.get(field)
            if getattr(self, field, _marker) != value:
                setattr(self, field, value)

        # feedparser exposes parsed dates as time 9-tuples under
        # "<field>_parsed"; keep only the first six items (Y/M/D H:M:S).
        for field in self._date_fields:
            value = parser_entry.get(field + '_parsed')
            if value is not None:
                value = datetime.datetime(*value[:6])
            if getattr(self, field, _marker) != value:
                setattr(self, field, value)

        # Join every sanitized content block into a single HTML string.
        entry_content = parser_entry.get('content')
        if entry_content:
            content_list = []
            for content in entry_content:
                # *Always* sanitize the content HTML.
                # FeedParser sometimes doesn't sanitize, such as when
                # the input is base64 encoded.
                value = feedparser._sanitizeHTML(content.value, 'utf-8')
                content_list.append(value)
            content_html = '\n'.join(content_list)
        else:
            content_html = None
        if getattr(self, 'content_html', _marker) != content_html:
            self.content_html = content_html
Exemple #3
0
    def from_feed_entry(entry):
        """Read and construct Post object from ``entry``.

        ``entry`` should be a post object as returned by ``feedparser.parse``.

        If the post is invalid, raise a ``MalformedPostError``.

        This leaves the ``blog`` field empty; this must be filled in before
        the post is added to the database.
        """
        for field in 'title', 'summary', 'link':
            if field not in entry:
                raise MalformedPostError("Post has no %s: %r" % (field, entry))
        post = Post()
        post.timestamp = to_dbtime(Post._get_pub_date(entry))
        post.title = entry['title']
        post.summary = entry['summary']
        if hasattr(entry, 'id'):
            post.guid = entry.id

        # The summary detail attribute lets us find the mime type of the
        # summary. feedparser doesn't escape it if it's text/plain, so we need
        # to do it ourselves. Unfortunately, there's a bug (likely #412) in
        # feedparser, and sometimes this attribute is unavailable. If it's
        # there, great, use it. Otherwise, we'll just assume it's html, and
        # sanitize it ourselves.
        if hasattr(entry, 'summary_detail'):
            mimetype = entry.summary_detail.type
        else:
            mimetype = 'application/xhtml'
            # Sanitize the html; who knows what feedparser did or didn't do.
            # XXX: _sanitizeHTML is a private function to the feedparser
            # library! unfortunately, we don't have many better options. This
            # statement is the reason the version number for the feedparser
            # dependency is fixed at 5.1.3; any alternate version will need to
            # be vetted carefully, as by doing this we lose any api stability
            # guarantees.
            post.summary = unicode(
                feedparser._sanitizeHTML(
                    # _sanitizeHTML expects an encoding, so rather than do more
                    # guesswork than we already have...
                    post.summary.encode('utf-8'),
                    'utf-8',
                    # _sanitizeHTML is only ever called within the library with
                    # this value:
                    u'text/html',
                ),
                'utf-8')

        if mimetype == 'text/plain':
            # feedparser doesn't sanitize the summary if it's plain text, so we
            # need to do it manually. We're using jinja2's autoescape feature
            # for this, which feels like a bit of a hack to me (Ian), but it
            # works -- there's probably a cleaner way to do this.
            tmpl = jinja2.Template('{{ text }}', autoescape=True)
            post.summary = tmpl.render(text=post.summary)
        post.page_url = entry['link']

        return post
Exemple #4
0
    def from_feed_entry(entry):
        """Build a Post from a feedparser entry.

        ``entry`` is one post object out of ``feedparser.parse``.  Raises
        ``MalformedPostError`` when a required field is missing.  The
        ``blog`` field is left unset; callers must fill it in before the
        post is stored.
        """
        for name in ('title', 'summary', 'link'):
            if name not in entry:
                raise MalformedPostError("Post has no %s: %r" % (name, entry))
        post = Post()
        post.timestamp = to_dbtime(Post._get_pub_date(entry))
        post.title = entry['title']
        post.summary = entry['summary']
        if hasattr(entry, 'id'):
            post.guid = entry.id

        # summary_detail carries the summary's mime type.  feedparser does
        # not escape text/plain summaries, so we must.  Due to a feedparser
        # bug (likely #412) the attribute is sometimes missing; in that
        # case assume html and sanitize it ourselves.
        if not hasattr(entry, 'summary_detail'):
            mimetype = 'application/xhtml'
            # Sanitize the html; who knows what feedparser did or didn't do.
            # XXX: _sanitizeHTML is private to feedparser!  This is why the
            # feedparser dependency is pinned at 5.1.3; any alternate
            # version must be vetted carefully, since we get no api
            # stability guarantees for a private function.
            sanitized = feedparser._sanitizeHTML(
                # _sanitizeHTML expects an encoding, so encode rather than
                # guess any further than we already have...
                post.summary.encode('utf-8'),
                'utf-8',
                # the only value the library itself ever passes:
                u'text/html',
            )
            post.summary = unicode(sanitized, 'utf-8')
        else:
            mimetype = entry.summary_detail.type

        if mimetype == 'text/plain':
            # feedparser leaves plain-text summaries unsanitized, so escape
            # manually through jinja2's autoescape -- a bit of a hack, but
            # it works; there's probably a cleaner way to do this.
            tmpl = jinja2.Template('{{ text }}', autoescape=True)
            post.summary = tmpl.render(text=post.summary)
        post.page_url = entry['link']

        return post
def sanitise_html(html):
    """Return ``html`` with unsafe markup stripped.

    Thin wrapper over feedparser's sanitizer, used to defuse basic HTML
    insertion attacks.

        >>> sanitise_html("<p>hello</p>")
        '<p>hello</p>'
        >>> sanitise_html("<script>alert('what')</script>")
        ''
    """
    return feedparser._sanitizeHTML(html, "utf-8", "text/html")
Exemple #6
0
def sanitize_html(html, force_https=True):
    """Return sanitized html, guarding against basic html insertion attacks.

        >>> sanitize_html("<p>hello</p>")
        '<p>hello</p>'
        >>> sanitize_html("<script>alert('what')</script>")
        ''
    """
    cleaned = feedparser._sanitizeHTML(html, "utf-8", "text/html")
    if not force_https:
        return cleaned
    # Upgrade embedded resource URLs to https in the double-quoted
    # src attributes the sanitizer emits.
    return cleaned.replace('src="http://', 'src="https://')
Exemple #7
0
def htmlmail(sbj,recip,msg,template='',texttemplate='',textmsg='',images=(), recip_name='',sender=settings.DEFAULT_FROM_EMAIL,sender_name='',charset=charset):
   """
   Send a multipart (html + plain text) email, optionally with inline images.

   if you want to use Django template system:
      use `msg` and optionally `textmsg` as template context (dict)
      and define `template` and optionally `texttemplate` variables.
   otherwise msg and textmsg variables are used as html and text message sources.

   if you want to use images in html message, define physical paths and ids in tuples.
   (image paths are relative to  MEDIA_ROOT)
   example:
   images=(('email_images/logo.gif','img1'),('email_images/footer.gif','img2'))
   and use them in html like this:
   <img src="cid:img1">
   ...
   <img src="cid:img2">
   """
   # Render the html body; derive the text body from it when no explicit
   # text template/message was given (sanitize first, then strip tags).
   html=render(msg,template)
   if texttemplate or textmsg: text=render((textmsg or msg),texttemplate)
   else: text= html2text(_sanitizeHTML(html,charset))

   # "related" root lets the html part reference inline images by cid.
   msgRoot = MIMEMultipart('related')
   msgRoot['Subject'] = sbj
   msgRoot['From'] = named(sender,sender_name)
   msgRoot['To'] =  named(recip,recip_name)
   msgRoot.preamble = 'This is a multi-part message in MIME format.'

   # "alternative" holds the text and html renditions; clients pick one.
   msgAlternative = MIMEMultipart('alternative')
   msgRoot.attach(msgAlternative)
   
   msgAlternative.attach(MIMEText(text, _charset=charset))
   msgAlternative.attach(MIMEText(html, 'html', _charset=charset))

   # Attach each inline image under the Content-ID referenced in the html.
   for img in images:
      fp = open(settings.MEDIA_ROOT+img[0], 'rb')
      msgImage = MIMEImage(fp.read())
      fp.close()
      msgImage.add_header('Content-ID', '<'+img[1]+'>')
      msgRoot.attach(msgImage)

   # NOTE(review): smtp_server/smtp_user/smtp_pass come from module scope --
   # confirm they are defined alongside this function.
   smtp = SMTP()
   smtp.connect(smtp_server)
   if smtp_user: smtp.login(smtp_user, smtp_pass)
   smtp.sendmail(sender, recip, msgRoot.as_string())
   smtp.quit()
Exemple #8
0
def send_html_mail_nt(subject,
                      sender=settings.DEFAULT_FROM_EMAIL,
                      recip="",
                      context=None,
                      html_template="",
                      text_template="",
                      sender_name="",
                      html_content="",
                      text_content="",
                      recip_list=None,
                      sender_formatted=""):
    """Prepare an html + text email and open an SMTP connection.

    Bodies come either from Django templates rendered with ``context`` or
    from the literal ``html_content``/``text_content`` strings.  A missing
    text body is derived from the html (sanitized, then tag-stripped).

    NOTE(review): as visible here the function connects and logs in but
    never builds or sends the message -- it may be truncated or
    unfinished; confirm against the full file.  Uses Python 2 ``except``
    syntax.
    """
    from stripogram import html2text
    from feedparser import _sanitizeHTML

    if not context: context = {}
    if html_template:
        html = render(context, html_template)
    else:
        html = html_content
    if text_template:
        text = render(context, text_template)
    else:
        text = text_content
    if not text:
        # Fall back to a text rendition of the (sanitized) html body.
        text = html2text(_sanitizeHTML(html, charset))

    if not recip_list: recip_list = []
    if recip: recip_list.append(recip)

    try:
        if getattr(settings, "EMAIL_USE_SSL", False):
            server = SMTP_SSL(settings.EMAIL_HOST, settings.EMAIL_PORT)
        else:
            server = SMTP(settings.EMAIL_HOST, settings.EMAIL_PORT)
        if settings.EMAIL_USE_TLS:
            # STARTTLS requires an EHLO both before and after the upgrade.
            server.ehlo()
            server.starttls()
            server.ehlo()
        if settings.EMAIL_HOST_USER and settings.EMAIL_HOST_PASSWORD:
            server.login(settings.EMAIL_HOST_USER,
                         settings.EMAIL_HOST_PASSWORD)
    except Exception, e:
        print e
        return
def sanitise_html(html, baseurl, inline, config, type):
	"""Attempt to turn arbitrary feed-provided HTML into something
	suitable for safe inclusion into the rawdog output. The inline
	parameter says whether to expect a fragment of inline text, or a
	sequence of block-level elements.

	baseurl is used to resolve relative URIs, config supplies the
	"blocklevelhtml" and "tidyhtml" options, and type is the mime type
	passed through to feedparser's helpers. Returns the cleaned HTML
	(possibly modified by "clean_html" plugin hooks), or None when html
	is None."""
	if html is None:
		return None

	# Normalise stray characters into proper entity references first.
	html = encode_references(html)

	# sgmllib handles "<br/>/" as a SHORTTAG; this workaround from
	# feedparser.
	html = re.sub(r'(\S)/>', r'\1 />', html)

	# Resolve relative links, then strip unsafe markup, using
	# feedparser's private helpers.
	html = feedparser._resolveRelativeURIs(html, baseurl, "UTF-8", type)
	html = feedparser._sanitizeHTML(html, "UTF-8", type)

	if not inline and config["blocklevelhtml"]:
		# If we're after some block-level HTML and the HTML doesn't
		# start with a block-level element, then insert a <p> tag
		# before it. This still fails when the HTML contains text, then
		# a block-level element, then more text, but it's better than
		# nothing.
		if block_level_re.match(html) is None:
			html = "<p>" + html

	if config["tidyhtml"]:
		import mx.Tidy
		args = { "wrap": 0, "numeric_entities": 1 }
		plugins.call_hook("mxtidy_args", config, args, baseurl, inline)
		# Keep only what tidy placed inside <body>...</body>.
		output = mx.Tidy.tidy(html, None, None,
		                      **args)[2]
		html = output[output.find("<body>") + 6
		              : output.rfind("</body>")].strip()

	# Give plugins a final chance to post-process the cleaned fragment.
	html = html.decode("UTF-8")
	box = plugins.Box(html)
	plugins.call_hook("clean_html", config, box, baseurl, inline)
	return box.value
Exemple #10
0
def send_html_mail_nt(
    subject, sender=settings.DEFAULT_FROM_EMAIL, recip="", context=None, 
    html_template="", text_template="", sender_name="",
    html_content="", text_content="", recip_list=None, sender_formatted=""
):
    """Prepare an html + text email and open an SMTP connection.

    Bodies come from templates rendered with ``context`` or from the
    literal content strings; a missing text body is derived from the
    sanitized html.

    NOTE(review): as visible here nothing is sent after login -- the
    function may be truncated; confirm against the full file.  Uses
    Python 2 ``except`` syntax.
    """
    from stripogram import html2text
    from feedparser import _sanitizeHTML

    if not context: context = {}
    if html_template:
        html = render(context, html_template)
    else: html = html_content
    if text_template:
        text = render(context, text_template)
    else: text = text_content
    if not text:
        # Fall back to a text rendition of the (sanitized) html body.
        text = html2text(_sanitizeHTML(html,charset))        

    if not recip_list: recip_list = []
    if recip: recip_list.append(recip)

    try:
        if getattr(settings, "EMAIL_USE_SSL", False):
            server = SMTP_SSL(settings.EMAIL_HOST, settings.EMAIL_PORT)
        else:
            server = SMTP(settings.EMAIL_HOST, settings.EMAIL_PORT)
        if settings.EMAIL_USE_TLS:
            # STARTTLS requires an EHLO both before and after the upgrade.
            server.ehlo()
            server.starttls()
            server.ehlo()
        if settings.EMAIL_HOST_USER and settings.EMAIL_HOST_PASSWORD:
            server.login(
                settings.EMAIL_HOST_USER, settings.EMAIL_HOST_PASSWORD
            )
    except Exception, e: 
        print e
        return
Exemple #11
0
def home(request):
    """Refresh posts for every watched URL via the Google Feeds API, then
    render the home page with all posts in reverse chronological order.

    NOTE(review): uses Python 2 urllib (``quote``/``urlopen``) and the
    two-argument form of ``feedparser._sanitizeHTML`` -- both tied to old
    library versions; confirm before upgrading either dependency.
    """
    #Get JSON data of RSS feed:
    watched_urls = WatchedUrl.objects.all() #need to get the full object
    
    for each_wu in watched_urls:
        #Encode the URI so that the key=value pairs don't get parsed in the Google url:
        url = urllib.quote(each_wu.url)
        
        #TODO: Differentiate between first import and subsequent imports. We do not
        #      need num=-1 and scoring=h for subsequent imports.
        gfeed_url = 'http://ajax.googleapis.com/ajax/services/feed/load?q=' + \
                    url + \
                    '&v=1.0&num=-1&scoring=h' + \
                    '&key=' + settings.GOOGLE_FEEDS_API_KEY
                    #The above key is for localhost only.
        json_result = urllib.urlopen(gfeed_url)

        #Parse the JSON string into python data structures:
        result = simplejson.loads(json_result.read()) #Use read() to return string
        entries = result['responseData']['feed']['entries']
        
        skip_count = 0
        for entry in entries:
            #Parse the date. We remove the last part which is the time offset since
            #python's support for parsing it is highly variable:
            temp = entry['publishedDate'].split()
            date_no_offset = ' '.join(temp[0:-1]) #remove the last field 
            dt = datetime.datetime.strptime(date_no_offset, 
                                            '%a, %d %b %Y %H:%M:%S')

            #Sanitize the HTML
            #Not necessarily needed. We can trust that both CL and Google Feeds do
            #some level of sanitization.
            entry['title'] = feedparser._sanitizeHTML(entry['title'], 'utf-8')
            entry['content'] = feedparser._sanitizeHTML(entry['content'], 'utf-8')

            #Put in database. To prevent duplicates, try to get the url first from
            #database:
            try:
                post = Post.objects.get(link = entry['link'])
                #If the post already exists, then skip it.
                skip_count += 1
                if skip_count >= settings.DUPLICATE_POSTS_THRESHOLD:
                    break #we don't need to keep updating anymore
                continue
            except Post.MultipleObjectsReturned:    
                #Technically, this shouldn't happen if other code works
                continue #skip creating new entry
            except Post.DoesNotExist:
                #Create a new entry
                post = Post()
            post.watched_url = each_wu
            post.date = dt
            post.title = entry['title']
            #post.content = pickle.dumps(entry) 
            post.content = entry['content']
            post.link = entry['link']
            post.save()

    #Now select the entries in chronological order
    entries = Post.objects.order_by('-date') #DESC order

    #Create add url form:
    add_url_form = AddURLForm()
        
    return render_to_response('home.html', {'entries': entries, 
                                            'add_url_form': add_url_form, })
def sanitize(value):
  """Return *value* with unsafe HTML removed via feedparser's sanitizer."""
  return feedparser._sanitizeHTML(value, 'UTF-8', 'text/html')
Exemple #13
0
def parse_feed_json(source_feed, feed_content, interval, response):
    """Parse a JSON Feed (jsonfeed.org style) document and store its posts.

    Args:
        source_feed: the feed model instance being refreshed.
        feed_content: raw JSON text of the feed.
        interval: current polling interval; increased on errors.
        response: writable stream used for progress/debug output.

    Returns:
        (ok, changed, interval): whether the parse succeeded, whether any
        new post was created, and the (possibly backed-off) poll interval.
    """
    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f['items']
        if len(entries):
            source_feed.last_success = datetime.datetime.utcnow().replace(tzinfo=utc) #in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            interval += 120
            ok = False

    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        interval += 120
        ok = False

    if ok:

        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now interval to max
            interval = (24*3*60)
            source_feed.last_result = "This feed has expired"
            return (False,False,interval)

        try:
            source_feed.site_url = f["home_page_url"]
            source_feed.name = f["title"]
        except Exception as ex:
            pass

        #response.write(entries)
        entries.reverse() # Entries are typically in reverse chronological order - put them in right order
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"] # prefer html over text

            body = fix_relative(body,source_feed.site_url)

            # Stable identity: id, else url, else a hash of the body.
            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed).filter(guid=guid)[0]
                response.write("EXISTING " + guid + "\n")

            except Exception as ex:
                response.write("NEW " + guid + "\n")
                p = Post(index=0)
                p.found = datetime.datetime.utcnow().replace(tzinfo=utc)
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            body  = feedparser._sanitizeHTML(body, "utf-8") # TODO: validate charset ??
            title = feedparser._sanitizeHTML(title, "utf-8") # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ''

            p.title = title

            try:
                p.created  = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                response.write("CREATED ERROR")
                p.created  = datetime.datetime.utcnow().replace(tzinfo=utc)

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            try:
                p.body = body
                p.save()
                # response.write(p.body)
            except Exception as ex:
                response.write("\nSave error for post:" + str(sys.exc_info()[0]))
                # FIX: sys.exc_traceback does not exist in Python 3 (this
                # function otherwise uses Python 3 "except ... as" syntax);
                # fetch the traceback via sys.exc_info() instead.
                traceback.print_tb(sys.exc_info()[2], file=response)

    return (ok,changed,interval)
Exemple #14
0
def parse_feed_json(source_feed, feed_content, output):
    """Parse a JSON Feed document and synchronise its posts and enclosures.

    Args:
        source_feed: the feed model instance being refreshed; its
            ``last_success``/``last_result``/``interval`` fields are
            updated in place.
        feed_content: raw JSON text of the feed.
        output: writable stream for progress/debug messages.

    Returns:
        (ok, changed): whether the parse succeeded and whether any new
        post was created.
    """
    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f["items"]
        if len(entries):
            source_feed.last_success = (
                timezone.now()
            )  # in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False

    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:

        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now source_feed.interval to max
            source_feed.interval = 24 * 3 * 60
            source_feed.last_result = "This feed has expired"
            return (False, False, source_feed.interval)

        try:
            source_feed.site_url = f["home_page_url"]
            source_feed.name = update_source_name(source_feed.name, f["title"])
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(parser)
            source_feed.description = parser._sanitizeHTML(
                f["description"], "utf-8", "text/html")

        # Sanitize the (possibly feed-supplied) source name as well.
        _customize_sanitizer(parser)
        source_feed.name = update_source_name(
            source_feed.name,
            parser._sanitizeHTML(source_feed.name, "utf-8", "text/html"),
        )

        if "icon" in f:
            source_feed.image_url = f["icon"]

        # output.write(entries)
        entries.reverse(
        )  # Entries are typically in reverse chronological order - put them in right order
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            # Stable identity: id, else url, else a hash of the body.
            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed).filter(
                    guid=guid)[0]
                output.write("EXISTING " + guid + "\n")

            except Exception as ex:
                output.write("NEW " + guid + "\n")
                p = Post(index=0, body=" ")
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(parser)
            body = parser._sanitizeHTML(
                body, "utf-8", "text/html")  # TODO: validate charset ??
            _customize_sanitizer(parser)
            title = parser._sanitizeHTML(
                title, "utf-8", "text/html")  # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]

            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ""

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                output.write("CREATED ERROR")
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            # Save before touching enclosures so p has a primary key.
            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:

                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True

                                try:
                                    ee.length = int(pe["size_in_bytes"])
                                except:
                                    ee.length = 0

                                try:
                                    file_type = pe["mime_type"]
                                except:
                                    file_type = "audio/mpeg"  # we are assuming podcasts here but that's probably not safe

                                ee.type = file_type
                                ee.save()
                                break
                    if not found_enclosure:
                        ee.delete()
                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:

                        try:
                            if pe["url"] not in seen_files:

                                try:
                                    length = int(pe["size_in_bytes"])
                                except:
                                    length = 0

                                try:
                                    filetype = pe["mime_type"]
                                except:
                                    filetype = "audio/mpeg"

                                ee = Enclosure(post=p,
                                               href=pe["url"],
                                               length=length,
                                               type=filetype)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                if output:
                    output.write("No enclosures - " + str(ex))

            try:
                p.body = body
                p.save()
                # output.write(p.body)
            except Exception as ex:
                output.write(str(ex))
                output.write(p.body)

            # NOTE(review): if get_or_create raises on the first tag, `tag`
            # is unbound and the f-string below raises NameError -- verify.
            try:
                if "tags" in e:
                    for t in e["tags"]:
                        tag, created = Tag.objects.get_or_create(**t)
                        p.tags.add(tag)
                        print(f"Tag {tag} added to post {p}")
            except Exception as ex:
                output.write(str(ex))
                output.write(f"couldn't add tag {tag} to post {p}")

    return (ok, changed)
Exemple #15
0
def sanitize_html(html):
    # Sanitization is deliberately disabled: the widget mechanism needs an
    # iframe, which the sanitizer below would strip.
    return html  # widget mechanism requires iframe
    # !!! so, its not yet possible to make it right:
    # (unreachable until the early return above is removed)
    return feedparser._sanitizeHTML(html, 'utf-8')
Exemple #16
0
def sanitize_html(html):
    # Sanitization is deliberately disabled: the widget mechanism needs an
    # iframe, which the sanitizer below would strip.
    return html  # widget mechanism requires iframe 
    # !!! so, its not yet possible to make it right:
    # (unreachable until the early return above is removed)
    return feedparser._sanitizeHTML(html, 'utf-8')
Exemple #17
0
def parse_feed_json(source_feed, feed_content, output):
    """Parse a JSON Feed document and synchronise its posts and enclosures.

    Args:
        source_feed: the feed model instance being refreshed; its
            ``last_success``/``last_result``/``interval`` fields are
            updated in place.
        feed_content: raw JSON text of the feed.
        output: unused here except for symmetry with other parsers;
            progress goes to ``logging``.

    Returns:
        (ok, changed): whether the parse succeeded and whether any new
        post was created.
    """
    ok = True
    changed = False

    try:
        f = json.loads(feed_content)
        entries = f['items']
        if entries:
            source_feed.last_success = timezone.now(
            )  #in case we start auto unsubscribing long dead feeds
        else:
            source_feed.last_result = "Feed is empty"
            source_feed.interval += 120
            ok = False

    except Exception as ex:
        source_feed.last_result = "Feed Parse Error"
        entries = []
        source_feed.interval += 120
        ok = False

    if ok:

        if "expired" in f and f["expired"]:
            # This feed says it is done
            # TODO: permanently disable
            # for now source_feed.interval to max
            source_feed.interval = (24 * 3 * 60)
            source_feed.last_result = "This feed has expired"
            return (False, False, source_feed.interval)

        try:
            source_feed.site_url = f["home_page_url"]
            if not source_feed.name:
                source_feed.name = f["title"]
        except Exception as ex:
            pass

        if "description" in f:
            _customize_sanitizer(feedparser)
            source_feed.description = feedparser._sanitizeHTML(
                f["description"], "utf-8", 'text/html')

        # NOTE(review): this sanitizes the name only when it is empty/None,
        # i.e. it sanitizes nothing -- the condition looks inverted compared
        # to the sibling implementation; confirm intent.
        _customize_sanitizer(feedparser)
        if not source_feed.name:
            source_feed.name = feedparser._sanitizeHTML(
                source_feed.name, "utf-8", 'text/html')

        if "icon" in f:
            source_feed.image_url = f["icon"]

        entries.reverse(
        )  # Entries are typically in reverse chronological order - put them in right order
        for e in entries:
            body = " "
            if "content_text" in e:
                body = e["content_text"]
            if "content_html" in e:
                body = e["content_html"]  # prefer html over text

            body = fix_relative(body, source_feed.site_url)

            # Stable identity: id, else url, else a hash of the body.
            try:
                guid = e["id"]
            except Exception as ex:
                try:
                    guid = e["url"]
                except Exception as ex:
                    m = hashlib.md5()
                    m.update(body.encode("utf-8"))
                    guid = m.hexdigest()

            try:
                p = Post.objects.filter(source=source_feed).filter(
                    guid=guid)[0]
                logging.info("EXISTING: %s", guid)
            except Exception as ex:
                logging.info("Creating new post %s.", guid)
                p = Post(index=0, body=' ')
                p.found = timezone.now()
                changed = True
                p.source = source_feed

            try:
                title = e["title"]
            except Exception as ex:
                title = ""

            # borrow the RSS parser's sanitizer
            _customize_sanitizer(feedparser)
            body = feedparser._sanitizeHTML(
                body, "utf-8", 'text/html')  # TODO: validate charset ??
            _customize_sanitizer(feedparser)
            title = feedparser._sanitizeHTML(
                title, "utf-8", 'text/html')  # TODO: validate charset ??
            # no other fields are ever marked as |safe in the templates

            if "banner_image" in e:
                p.image_url = e["banner_image"]

            if "image" in e:
                p.image_url = e["image"]

            try:
                p.link = e["url"]
            except Exception as ex:
                p.link = ''

            p.title = title

            try:
                p.created = pyrfc3339.parse(e["date_published"])
            except Exception as ex:
                logging.exception('Unable to parse published date.')
                p.created = timezone.now()

            p.guid = guid
            try:
                p.author = e["author"]
            except Exception as ex:
                p.author = ""

            # Save before touching enclosures so p has a primary key.
            p.save()

            try:
                seen_files = []
                for ee in list(p.enclosures.all()):
                    # check existing enclosure is still there
                    found_enclosure = False
                    if "attachments" in e:
                        for pe in e["attachments"]:
                            if pe["url"] == ee.href and ee.href not in seen_files:
                                found_enclosure = True
                                ee.length = int(
                                    pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee.type = typ
                                ee.save()
                                break

                    # DANGEROUS! This deletes everything if a glitch in the feed removes enclosures.
                    # if not found_enclosure:
                    # ee.delete()

                    seen_files.append(ee.href)

                if "attachments" in e:
                    for pe in e["attachments"]:
                        try:
                            # Since many RSS feeds embed trackers into their URL that constantly change, yet almost always only include a single enclosure,
                            # we'll only create a new enclosure when we see a new url if there are no enclosure records created yet.
                            # This is a most robust way of preventing logical duplicates due to tracker URL changes then by trying to predict and strip out
                            # all known tracker prefixes.
                            if pe["url"] not in seen_files and not p.enclosures.all(
                            ).exists():
                                length = int(
                                    pe.get("size_in_bytes", None) or 0)
                                typ = pe.get("mime_type", None) or "audio/mpeg"
                                ee = Enclosure(post=p,
                                               href=pe["url"],
                                               length=length,
                                               type=typ)
                                ee.save()
                        except Exception as ex:
                            pass
            except Exception as ex:
                logging.exception("No enclosures")

            try:
                p.body = body
                p.save()
            except Exception as ex:
                logging.exception('Unable to save body A2.')

    return (ok, changed)
def sanitize(value):
    """Strip unsafe HTML from *value* using feedparser's private sanitizer."""
    return feedparser._sanitizeHTML(value, 'UTF-8', 'text/html')
Exemple #19
0
 def save(self, *args, **kwargs):
     """Persist the letter, deriving the plain-text template when absent.

     When ``texttemplate`` is empty it is generated from ``htmltemplate``
     by sanitizing the HTML and stripping it down to text.
     """
     if not self.texttemplate:
         self.texttemplate = html2text(_sanitizeHTML(self.htmltemplate, CHARSET))
     super(Letter, self).save(*args, **kwargs)