Ejemplo n.º 1
0
    def test_clean_html(self):
        """Check the markup produced by ``sd_etree.clean_html`` for a mixed fixture.

        The fixture contains a ``header``, an unknown tag, a ``script`` and a
        ``class`` attribute; the expected output keeps the text of the first
        two, has no script content and no attribute, and keeps ``strong``.
        """
        source_markup = dedent("""\
        <div>
           <header>this header must be removed</header>
           <p class="class_to_remove">
               <unknown_tag>bla
                   <strong>keep it strong</strong>
               </unknown_tag>
               <script>no script here !</script>
           </p>
        </div>
        """)
        cleaned = sd_etree.clean_html(html.fromstring(source_markup))
        serialized = etree.tostring(cleaned, encoding="unicode")

        wanted = dedent("""\
        <div>
           this header must be removed
           <p>
               bla
                   <strong>keep it strong</strong>


           </p>
        </div>
        """)
        # both sides go through dedent() so indentation is compared on equal terms
        self.assertEqual(dedent(serialized), wanted)
Ejemplo n.º 2
0
    def parse_inline_content(self, tree, item, ns=NS['xhtml']):
        """Extract the inline body of ``tree`` as an HTML string.

        For NITF content every child of ``body.content/block`` is passed
        through ``sd_etree.clean_html`` and serialised as HTML. For any other
        content type the text of each direct child of the ``body`` element is
        wrapped in a tag carrying the child's local name; when no child has
        text, the body's own text is returned wrapped in ``<pre>``.

        :param tree: element carrying a ``contenttype`` attribute
        :param item: not used by this implementation
        :param ns: namespace used to qualify the ``html`` and ``body`` lookups
        :return: dict with ``contenttype`` and, when something was found,
            ``content`` (plus ``format`` for preformatted text)
        """
        if tree.get('contenttype') == NITF:
            try:
                body_content = tree.xpath('.//nitf:body.content/nitf:block/*', namespaces=NS)
            except AttributeError:
                # e.g. ``tree`` has no xpath(); report empty NITF content
                return {'contenttype': NITF, 'content': ''}
            elements = [etree.tostring(sd_etree.clean_html(e), encoding='unicode', method='html')
                        for e in body_content]
            return {'contenttype': NITF, 'content': '\n'.join(elements)}

        html_elt = tree.find(self.qname('html', ns))
        body = html_elt.find(self.qname('body', ns))

        tags = []  # local names, parallel to ``elements``
        elements = []
        for elem in body:
            # lxml also yields comments and processing instructions as children;
            # their ``tag`` is not a string, so they cannot be wrapped
            if not isinstance(elem.tag, str) or not elem.text:
                continue
            # local name; also works for tags without a ``{namespace}`` prefix
            tag = elem.tag.rsplit('}', 1)[-1]
            tags.append(tag)
            elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

        # If the only collected element is a p tag then replace the line feeds
        # with breaks. The collected element is checked, not body[0], because
        # the first child of body may have been skipped above.
        if len(elements) == 1 and tags[0] == 'p':
            elements[0] = elements[0].replace('\n    ', '</p><p>').replace('\n', '<br/>')

        content = {'contenttype': tree.attrib['contenttype']}
        if elements:
            content['content'] = "\n".join(elements)
        elif body.text:
            content['content'] = '<pre>' + body.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content
Ejemplo n.º 3
0
    def parse_inline_content(self, tree, item):
        """Build the content dict for an inline XHTML body.

        The ``body`` found under the ``html`` child of ``tree`` is passed
        through ``sd_etree.clean_html``; afterwards ``pre`` elements are
        renamed to ``p`` and every ``a`` element gets ``target="_blank"``.

        :param tree: element holding the ``html`` child and a ``contenttype`` attribute
        :param item: not used by this implementation
        :return: dict with ``contenttype`` plus ``content`` when the body has
            children or text (``format`` is only set for text-only bodies)
        """
        raw_body = tree.find(self.qname('html')).find(self.qname('body'))
        body_elt = sd_etree.clean_html(raw_body)

        # turn <pre> into <p>
        for node in body_elt.findall('.//pre'):
            node.tag = 'p'
        # force target="_blank" on every link
        for node in body_elt.findall('.//a'):
            node.attrib['target'] = '_blank'

        content = {'contenttype': tree.attrib['contenttype']}

        if len(body_elt) > 0:
            content['content'] = '\n'.join(
                [sd_etree.to_string(child, encoding='unicode', method="html") for child in body_elt]
            )
        elif body_elt.text:
            content['content'] = '<p>' + body_elt.text + '</p>'
            content['format'] = 'xhtml/xml'

        markup = content.get('content')
        if markup:
            # collapse the escaped <endash> wrapper into a plain hyphen
            content['content'] = markup.replace('&lt;endash&gt;-&lt;/endash&gt;', '-')

        return content
Ejemplo n.º 4
0
    def test_clean_html(self):
        """Compare ``sd_etree.clean_html`` output against a fixed expectation.

        In the expected markup the ``header`` and unknown tag are gone but
        their text remains, the ``script`` content and the ``class`` attribute
        are absent, and ``strong`` is kept.
        """
        fixture = dedent("""\
        <div>
           <header>this header must be removed</header>
           <p class="class_to_remove">
               <unknown_tag>bla
                   <strong>keep it strong</strong>
               </unknown_tag>
               <script>no script here !</script>
           </p>
        </div>
        """)
        wanted = dedent("""\
        <div>
           this header must be removed
           <p>
               bla
                   <strong>keep it strong</strong>


           </p>
        </div>
        """)

        result = sd_etree.clean_html(html.fromstring(fixture))
        result_markup = dedent(etree.tostring(result, encoding="unicode"))

        self.assertEqual(result_markup, wanted)
Ejemplo n.º 5
0
    def parse_inline_content(self, tree, item, ns=NS['xhtml']):
        """Extract the inline body of ``tree`` as an HTML string.

        For NITF content every child of ``body.content/block`` is passed
        through ``sd_etree.clean_html`` and serialised as HTML. For any other
        content type the text of each direct child of the ``body`` element is
        wrapped in a tag carrying the child's local name; when no child has
        text, the body's own text is returned wrapped in ``<pre>``.

        :param tree: element carrying a ``contenttype`` attribute
        :param item: not used by this implementation
        :param ns: namespace used to qualify the ``html`` and ``body`` lookups
        :return: dict with ``contenttype`` and, when something was found,
            ``content`` (plus ``format`` for preformatted text)
        """
        if tree.get('contenttype') == NITF:
            body_content = tree.xpath('.//nitf:body.content/nitf:block/*', namespaces=NS)
            elements = [etree.tostring(sd_etree.clean_html(e), encoding='unicode', method='html')
                        for e in body_content]
            content = {'contenttype': NITF,
                       'content': '\n'.join(elements)}
        else:
            html_elt = tree.find(self.qname('html', ns))
            body = html_elt.find(self.qname('body', ns))

            tags = []  # local names, parallel to ``elements``
            elements = []
            for elem in body:
                # lxml also yields comments and processing instructions as
                # children; their ``tag`` is not a string, so skip them
                if not isinstance(elem.tag, str) or not elem.text:
                    continue
                # local name; also works for tags without a ``{namespace}`` prefix
                tag = elem.tag.rsplit('}', 1)[-1]
                tags.append(tag)
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

            # If the only collected element is a p tag then replace the line
            # feeds with breaks. The collected element is checked, not body[0],
            # because the first child of body may have been skipped above.
            if len(elements) == 1 and tags[0] == 'p':
                elements[0] = elements[0].replace('\n    ', '</p><p>').replace('\n', '<br/>')

            content = {'contenttype': tree.attrib['contenttype']}
            if elements:
                content['content'] = "\n".join(elements)
            elif body.text:
                content['content'] = '<pre>' + body.text + '</pre>'
                content['format'] = CONTENT_TYPE.PREFORMATTED
        return content
    def parse_inline_content(self, tree, item):
        """Return the cleaned inline XHTML body of ``tree`` as a content dict.

        :param tree: element holding the ``html`` child and a ``contenttype`` attribute
        :param item: not used by this implementation
        :return: dict with ``contenttype``; ``content`` is added when the
            cleaned body has children or text, ``format`` only for text
        """
        raw_body = tree.find(self.qname('html')).find(self.qname('body'))
        cleaned = sd_etree.clean_html(raw_body)

        content = {'contenttype': tree.attrib['contenttype']}
        if len(cleaned) > 0:
            # one serialised fragment per direct child of the body
            fragments = [sd_etree.to_string(child, encoding='unicode', method="html")
                         for child in cleaned]
            content['content'] = '\n'.join(fragments)
        elif cleaned.text:
            # text-only body: hand it over as preformatted text
            content['content'] = '<pre>' + cleaned.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content
Ejemplo n.º 7
0
    def parse_inline_content(self, tree, item):
        """Turn the inline XHTML body of ``tree`` into a content dict.

        The ``body`` under the ``html`` child is passed through
        ``sd_etree.clean_html`` before being serialised.

        :param tree: element holding the ``html`` child and a ``contenttype`` attribute
        :param item: not used by this implementation
        :return: dict with ``contenttype`` and, when the body is not empty,
            ``content`` (plus ``format`` for a text-only body)
        """
        html_node = tree.find(self.qname('html'))
        body_node = sd_etree.clean_html(html_node.find(self.qname('body')))

        content = {'contenttype': tree.attrib['contenttype']}

        if len(body_node) > 0:
            content['content'] = '\n'.join(
                [sd_etree.to_string(node, encoding='unicode', method="html") for node in body_node]
            )
            return content

        if body_node.text:
            content['content'] = '<pre>' + body_node.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content
Ejemplo n.º 8
0
 def parse_inline_content(self, tree, item):
     """Return the cleaned main section (or whole body) of ``tree`` as a content dict.

     The first ``xhtml:section`` whose class contains ``main`` is used; when
     there is none, the first ``xhtml:body`` is used instead.

     :param tree: element searched via XPath; must carry a ``contenttype`` attribute
     :param item: not used by this implementation
     :return: dict with ``contenttype`` and, when the element is not empty,
         ``content`` (plus ``format`` for a text-only element)
     :raises IndexError: when neither a main section nor a body is found
     """
     # an empty XPath result is falsy, so ``or`` falls through to the body lookup
     candidates = (
         tree.xpath('//xhtml:body//xhtml:section[contains(@class,"main")]', namespaces=NS)
         or tree.xpath('//xhtml:body', namespaces=NS)
     )
     cleaned = sd_etree.clean_html(candidates[0])

     content = {'contenttype': tree.attrib['contenttype']}
     if len(cleaned) > 0:
         content['content'] = sd_etree.to_string(cleaned, method="html")
     elif cleaned.text:
         content['content'] = '<pre>' + cleaned.text + '</pre>'
         content['format'] = CONTENT_TYPE.PREFORMATTED
     return content
Ejemplo n.º 9
0
    def parse_inline_content(self, tree, item, ns=NS["xhtml"]):
        """Extract the inline body of ``tree`` as an HTML string.

        For NITF content every child of ``body.content/block`` is passed
        through ``sd_etree.clean_html`` and serialised as HTML. For any other
        content type the text of each direct child of the ``body`` element is
        wrapped in the tag returned by ``get_content_tag``; when no child has
        text, the body's own text is returned wrapped in ``<pre>``.

        :param tree: element carrying a ``contenttype`` attribute
        :param item: not used by this implementation
        :param ns: namespace used to qualify the ``html`` and ``body`` lookups
        :return: dict with ``contenttype`` and, when something was found,
            ``content`` (plus ``format`` for preformatted text)
        """
        if tree.get("contenttype") == NITF:
            try:
                body_content = tree.xpath(".//nitf:body.content/nitf:block/*",
                                          namespaces=NS)
            except AttributeError:
                # e.g. ``tree`` has no xpath(); report empty NITF content
                return {"contenttype": NITF, "content": ""}
            elements = [
                etree.tostring(sd_etree.clean_html(e),
                               encoding="unicode",
                               method="html") for e in body_content
            ]
            return {"contenttype": NITF, "content": "\n".join(elements)}

        html_elt = tree.find(self.qname("html", ns))
        if html_elt is None:
            # fallback for missing xmlns: retry with whatever the tree maps
            # to the ``None`` key of its nsmap
            try:
                ns = tree.nsmap.get(None)
            except AttributeError:
                ns = None
            html_elt = tree.find(self.qname("html", ns))
        body = html_elt.find(self.qname("body", ns))

        tags = []  # tag names, parallel to ``elements``
        elements = []
        for elem in body:
            if elem.text:
                tag = get_content_tag(elem)
                tags.append(tag)
                elements.append("<%s>%s</%s>" % (tag, elem.text, tag))

        # If the only collected element is a p tag then replace the line feeds
        # with breaks. The collected element is checked, not body[0], because
        # the first child of body may have had no text and been skipped.
        if len(elements) == 1 and tags[0] == "p":
            elements[0] = elements[0].replace("\n    ", "</p><p>").replace(
                "\n", "<br/>")

        content = {"contenttype": tree.attrib["contenttype"]}
        if elements:
            content["content"] = "\n".join(elements)
        elif body.text:
            content["content"] = "<pre>" + body.text + "</pre>"
            content["format"] = CONTENT_TYPE.PREFORMATTED
        return content
Ejemplo n.º 10
0
    def parse_inline_content(self, tree, item, ns=NS['xhtml']):
        """Extract the inline body of ``tree`` as an HTML string.

        For NITF content every child of ``body.content/block`` is passed
        through ``sd_etree.clean_html`` and serialised as HTML. For any other
        content type the text of each direct child of the ``body`` element is
        wrapped in the tag returned by ``get_content_tag``; when no child has
        text, the body's own text is returned wrapped in ``<pre>``.

        :param tree: element carrying a ``contenttype`` attribute
        :param item: not used by this implementation
        :param ns: namespace used to qualify the ``html`` and ``body`` lookups
        :return: dict with ``contenttype`` and, when something was found,
            ``content`` (plus ``format`` for preformatted text)
        """
        if tree.get('contenttype') == NITF:
            try:
                body_content = tree.xpath('.//nitf:body.content/nitf:block/*',
                                          namespaces=NS)
            except AttributeError:
                # e.g. ``tree`` has no xpath(); report empty NITF content
                return {'contenttype': NITF, 'content': ''}
            elements = [
                etree.tostring(sd_etree.clean_html(e),
                               encoding='unicode',
                               method='html') for e in body_content
            ]
            return {'contenttype': NITF, 'content': '\n'.join(elements)}

        html_elt = tree.find(self.qname('html', ns))
        if html_elt is None:
            # fallback for missing xmlns: retry with whatever the tree maps
            # to the ``None`` key of its nsmap
            try:
                ns = tree.nsmap.get(None)
            except AttributeError:
                ns = None
            html_elt = tree.find(self.qname('html', ns))
        body = html_elt.find(self.qname('body', ns))

        tags = []  # tag names, parallel to ``elements``
        elements = []
        for elem in body:
            if elem.text:
                tag = get_content_tag(elem)
                tags.append(tag)
                elements.append('<%s>%s</%s>' % (tag, elem.text, tag))

        # If the only collected element is a p tag then replace the line feeds
        # with breaks. The collected element is checked, not body[0], because
        # the first child of body may have had no text and been skipped.
        if len(elements) == 1 and tags[0] == 'p':
            elements[0] = elements[0].replace('\n    ', '</p><p>').replace(
                '\n', '<br/>')

        content = {'contenttype': tree.attrib['contenttype']}
        if elements:
            content['content'] = "\n".join(elements)
        elif body.text:
            content['content'] = '<pre>' + body.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content
Ejemplo n.º 11
0
    def parse_inline_content(self, tree, item):
        """Return the cleaned inline XHTML body of ``tree`` as a content dict.

        After serialisation, the escaped ``<endash>-</endash>`` sequence is
        replaced by a plain hyphen.

        :param tree: element holding the ``html`` child and a ``contenttype`` attribute
        :param item: not used by this implementation
        :return: dict with ``contenttype`` and, when the body is not empty,
            ``content`` (plus ``format`` for a text-only body)
        """
        raw_body = tree.find(self.qname("html")).find(self.qname("body"))
        cleaned = sd_etree.clean_html(raw_body)

        content = {"contenttype": tree.attrib["contenttype"]}
        if len(cleaned) > 0:
            fragments = [
                sd_etree.to_string(child, encoding="unicode", method="html")
                for child in cleaned
            ]
            content["content"] = "\n".join(fragments)
        elif cleaned.text:
            content["content"] = "<pre>" + cleaned.text + "</pre>"
            content["format"] = CONTENT_TYPE.PREFORMATTED

        markup = content.get("content")
        if markup:
            # collapse the escaped <endash> wrapper into a plain hyphen
            content["content"] = markup.replace(
                "&lt;endash&gt;-&lt;/endash&gt;", "-")

        return content
Ejemplo n.º 12
0
    def parse_inline_content(self, tree, item):
        """Return the cleaned main section (or body) of ``tree``, with notepad content appended.

        The first ``xhtml:section`` whose class contains ``main`` is used,
        falling back to the first ``xhtml:body``. If ``self.item_tree`` has a
        notepad ``edNote`` section, its children are appended to that element
        before it is passed through ``sd_etree.clean_html``.

        :param tree: element searched via XPath; must carry a ``contenttype`` attribute
        :param item: not used by this implementation
        :return: dict with ``contenttype`` and, when the element is not empty,
            ``content`` (plus ``format`` for a text-only element)
        :raises IndexError: when neither a main section nor a body is found
        """
        # an empty XPath result is falsy, so ``or`` falls through to the body lookup
        candidates = (
            tree.xpath('//xhtml:body//xhtml:section[contains(@class,"main")]', namespaces=NS)
            or tree.xpath('//xhtml:body', namespaces=NS)
        )
        body_elt = candidates[0]

        notepad_sections = self.item_tree.xpath(
            './/iptc:edNote[@role="dpaednoterole:notepad"]//xhtml:section',
            namespaces=NS)
        if notepad_sections:
            # only the first notepad section is used; nothing to add when absent
            for extra in notepad_sections[0]:
                body_elt.append(extra)

        cleaned = sd_etree.clean_html(body_elt)

        content = {'contenttype': tree.attrib['contenttype']}
        if len(cleaned) > 0:
            content['content'] = sd_etree.to_string(cleaned, method="html")
        elif cleaned.text:
            content['content'] = '<pre>' + cleaned.text + '</pre>'
            content['format'] = CONTENT_TYPE.PREFORMATTED
        return content