Example #1
0
    def process_text(self, text):
        """Convert *text* by way of a parsed and post-processed element tree.

        The text is run through ``clean_html`` and parsed with
        ``fromstring``; the tree is handed to ``self._parse_element``,
        serialized as XML and passed to ``html2md``, whose result is
        returned.  If parsing raises ``etree.XMLSyntaxError`` the result of
        ``xhtmlim(text)`` is returned instead.
        """
        try:
            cleaned = clean_html(text)
            root = fromstring(cleaned)
        except etree.XMLSyntaxError:
            # Unparseable markup: fall back to the xhtmlim conversion of the
            # raw input.
            return xhtmlim(text)

        self._parse_element(root)
        serialized = etree.tostring(root, method="xml", encoding=unicode)
        return html2md(serialized)
Example #2
0
    def process_text(self, text):
        """Convert *text*, discarding every element of class ``tags``.

        The text is run through ``clean_html`` and parsed with
        ``fromstring``.  Elements carrying the ``tags`` class are dropped
        from the tree, the tree is handed to ``self._parse_element``,
        serialized as XML and passed to ``html2md``, whose result is
        returned.  If parsing raises ``etree.XMLSyntaxError`` the result of
        ``xhtmlim(text)`` is returned instead.
        """
        try:
            cleaned = clean_html(text)
            root = fromstring(cleaned)
        except etree.XMLSyntaxError:
            # Unparseable markup: fall back to the xhtmlim conversion of the
            # raw input.
            return xhtmlim(text)

        # Strip the tag containers before the tree is processed further.
        for tags_block in root.find_class('tags'):
            tags_block.drop_tree()

        self._parse_element(root)
        serialized = etree.tostring(root, method="xml", encoding=unicode)
        return html2md(serialized)
Example #3
0
 def process_node_body(self, html):
     """Clean a node body and bind each ``<img>`` in it to an ``Image``.

     The markup is passed through ``html_cleaner.clean_html`` and
     ``autolink``, every ``%`` is doubled, and the parsed document goes
     through ``filter_style``.  Each image element is then handled as
     follows:

     * no ``src``: the element is removed from the document;
     * ``src`` under ``djsettings.MEDIA_URL`` (as written, or after a
       leading ``djsettings.APP_URL`` is swapped for ``/``): an existing
       ``Image`` is looked up first by the id embedded in the element id,
       then by stored path, and appended to ``self.existing_images``;
     * otherwise, or when both lookups fail: the file is downloaded,
       saved as a new ``Image`` and appended to ``self.new_images``.

     Returns the document serialized by ``etree.tounicode`` with
     ``method="html"``.

     Raises ``ValidationError`` on any failure; every image in
     ``self.new_images`` is deleted before the error propagates.
     """
     try:
         html = autolink(html_cleaner.clean_html(html))

         #To stay on the safe side escape all % characters
         html = html.replace(u"%", u"%%")
         doc = filter_style(fromstring(html))

         # Snapshot the matches: elements may be removed inside the loop.
         for el in list(doc.iter(u'img')):
             # A missing src attribute yields None; treat it like an empty one.
             src = (el.get(u"src") or u"").replace(u"%%", u"%")
             if not src:
                 el.getparent().remove(el)
                 # Nothing to resolve or download for a removed element.
                 continue

             # Candidate spellings of the URL to test against MEDIA_URL: the
             # URL as written and, for absolute same-site URLs, its
             # site-relative form.  ``src`` itself stays untouched so that the
             # download below still receives a fetchable URL.
             candidates = [src]
             if src.startswith(djsettings.APP_URL):
                 candidates.append(src.replace(djsettings.APP_URL, u"/", 1))

             media_src = None
             for candidate in candidates:
                 if candidate.startswith(djsettings.MEDIA_URL):
                     media_src = candidate
                     break

             if media_src is not None:
                 elem_id = el.get(u"id")
                 try:
                     if elem_id and elem_id.startswith(self.image_id_prefix):
                         image_id = int(elem_id.replace(self.image_id_prefix, ""))
                         image = Image.objects.get(id=image_id)
                         self.existing_images.append(image)
                         self.set_image_attributes(el, image)
                         continue
                 except Exception:
                     # Best effort: fall through to the lookup by path.
                     logger.error(u'Malformed id (%s)found on our own img url %s', elem_id, src)

                 try:
                     image = Image.objects.get(image=media_src.replace(djsettings.MEDIA_URL, u""))
                     self.set_image_attributes(el, image)
                     self.existing_images.append(image)
                     continue
                 except Exception:
                     # Best effort: fall through and download the file instead.
                     logger.error(u'Unable to locate img stored under url %s in Image table', src)
                     if src.startswith(u"/"):
                         # Make a site-relative URL absolute before downloading.
                         src = src.replace(u"/", djsettings.APP_URL, 1)

             image_file = download_image_file(src)
             image = Image(image=image_file, upload_url=src)
             image.save()
             self.new_images.append(image)
             self.set_image_attributes(el, image)

         return etree.tounicode(doc, method="html")
     except ValidationError:
         for image in self.new_images:
             image.delete()
         raise
     except Exception:
         logger.exception(u'Unhandled exception while parsing "%s" body', html)
         for image in self.new_images:
             image.delete()
         raise ValidationError(_(u"Unexpected error happened :("))
Example #4
0
    def process_entry(self, entry):
        """Extend the parent's processed entry with a ``tags`` list.

        ``entry['summary']`` is parsed with ``fromstring``; for every element
        of class ``tags``, the stripped leading text of each nested element
        of class ``tag`` is appended to ``out['tags']`` in document order.

        Returns the dict produced by the parent ``process_entry`` with the
        ``tags`` key set.
        """
        out = super(LorFeedProcessor, self).process_entry(entry)

        out['tags'] = []

        tree = fromstring(entry['summary'])

        for el in tree.find_class('tags'):
            for t in el.find_class('tag'):
                # ``text`` is None when the element is empty or starts with a
                # child element; treat that as an empty string rather than
                # failing on ``None.strip()``.
                out['tags'].append((t.text or '').strip())

        return out
Example #5
0
    def process_text(self, text):
        """Convert *text*, dropping everything from the cut anchor onwards.

        The text is run through ``clean_html`` and parsed with
        ``fromstring``.  The direct children of the root are walked in
        order; the first ``<a>`` whose ``name`` attribute starts with
        ``cutid`` (case-insensitive) and every child after it are dropped
        with ``drop_tree``.  The tree is then handed to
        ``self._parse_element``, serialized as XML and passed to
        ``html2md``, whose result is returned.  If parsing raises
        ``etree.XMLSyntaxError`` the result of ``xhtmlim(text)`` is returned
        instead.
        """
        try:
            tree = fromstring(clean_html(text))
        except etree.XMLSyntaxError:
            return xhtmlim(text)

        rm = False
        # Snapshot the children: elements are dropped while we walk them.
        for el in list(tree.iterchildren()):
            # Comments and processing instructions expose a callable as
            # ``tag``, so they can never be the cut anchor.
            if not rm and not callable(el.tag) and el.tag.lower() == 'a':
                # Ordinary links have no ``name`` attribute; ``get`` then
                # returns None, which must not be lower-cased.
                rm = (el.get('name') or '').lower().startswith('cutid')

            if rm:
                el.drop_tree()

        self._parse_element(tree)
        return html2md(etree.tostring(tree, method="xml", encoding=unicode))