Example #1
def runSanitizerTest(name, expected, input):
    expected = ''.join([
        token.toxml() for token in html5parser.HTMLParser().parseFragment(
            expected).childNodes
    ])
    expected = json.loads(json.dumps(expected))
    assert expected == sanitize_html(input)
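
# A hedged usage sketch for the helper above: the test name and markup are
# illustrative, and sanitize_html is the function under test, assumed to be
# in scope. With a whitelisted tag such as <p>, sanitizing should round-trip
# the markup unchanged.
runSanitizerTest("keep-paragraph", "<p>hello</p>", "<p>hello</p>")
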
def test_all_tokens(self):
     expected = [{
         'data': {},
         'type': 'StartTag',
         'namespace': 'http://www.w3.org/1999/xhtml',
         'name': 'html'
     }, {
         'data': {},
         'type': 'StartTag',
         'namespace': 'http://www.w3.org/1999/xhtml',
         'name': 'head'
     }, {
         'data': {},
         'type': 'EndTag',
         'namespace': 'http://www.w3.org/1999/xhtml',
         'name': 'head'
     }, {
         'data': {},
         'type': 'StartTag',
         'namespace': 'http://www.w3.org/1999/xhtml',
         'name': 'body'
     }, {
         'data': 'a',
         'type': 'Characters'
     }, {
         'data': {},
         'type': 'StartTag',
         'namespace': 'http://www.w3.org/1999/xhtml',
         'name': 'div'
     }, {
         'data': 'b',
         'type': 'Characters'
     }, {
         'data': {},
         'type': 'EndTag',
         'namespace': 'http://www.w3.org/1999/xhtml',
         'name': 'div'
     }, {
         'data': 'c',
         'type': 'Characters'
     }, {
         'data': {},
         'type': 'EndTag',
         'namespace': 'http://www.w3.org/1999/xhtml',
         'name': 'body'
     }, {
         'data': {},
         'type': 'EndTag',
         'namespace': 'http://www.w3.org/1999/xhtml',
         'name': 'html'
     }]
     for treeName, treeCls in treeTypes.items():
         p = html5parser.HTMLParser(tree=treeCls["builder"])
         document = p.parse(
             "<html><head></head><body>a<div>b</div>c</body></html>")
         document = treeCls.get("adapter", lambda x: x)(document)
         output = treeCls["walker"](document)
         for expectedToken, outputToken in zip(expected, output):
             self.assertEqual(expectedToken, outputToken)
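
# The treeTypes mapping iterated above comes from html5lib's test support
# code; a minimal sketch of its shape, assuming only the stock "dom" and
# "etree" backends (an optional "adapter" entry may convert the document
# before walking):
from html5lib import treebuilders, treewalkers

treeTypes = {
    "DOM": {
        "builder": treebuilders.getTreeBuilder("dom"),
        "walker": treewalkers.getTreeWalker("dom"),
    },
    "ElementTree": {
        "builder": treebuilders.getTreeBuilder("etree"),
        "walker": treewalkers.getTreeWalker("etree"),
    },
}
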
Example #3
def content(xentry, name, detail, bozo):
    """ insert a content-like element into the entry """
    if not detail or not detail.value: return

    data = None
    xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
    xdoc = xentry.ownerDocument
    xcontent = xdoc.createElement(name)

    if isinstance(detail.value, unicode):
        detail.value = detail.value.encode('utf-8')

    if not detail.has_key('type') or detail.type.lower().find('html') < 0:
        detail['value'] = escape(detail.value)
        detail['type'] = 'text/html'

    if detail.type.find('xhtml') >= 0 and not bozo:
        try:
            data = minidom.parseString(xdiv % detail.value).documentElement
            xcontent.setAttribute('type', 'xhtml')
        except:
            bozo = 1

    if detail.type.find('xhtml') < 0 or bozo:
        parser = html5parser.HTMLParser(
            tree=treebuilders.getTreeBuilder('dom'))
        html = parser.parse(xdiv % detail.value, encoding="utf-8")
        for body in html.documentElement.childNodes:
            if body.nodeType != Node.ELEMENT_NODE: continue
            if body.nodeName != 'body': continue
            for div in body.childNodes:
                if div.nodeType != Node.ELEMENT_NODE: continue
                if div.nodeName != 'div': continue
                try:
                    div.normalize()
                    if len(div.childNodes) == 1 and \
                        div.firstChild.nodeType == Node.TEXT_NODE:
                        data = div.firstChild
                        if illegal_xml_chars.search(data.data):
                            data = xdoc.createTextNode(
                                illegal_xml_chars.sub(invalidate, data.data))
                    else:
                        data = div
                        xcontent.setAttribute('type', 'xhtml')
                    break
                except:
                    # in extremely nested cases, the Python runtime decides
                    # that normalize() must be in an infinite loop; mark
                    # the content as escaped html and proceed on...
                    xcontent.setAttribute('type', 'html')
                    data = xdoc.createTextNode(detail.value.decode('utf-8'))

    if data: xcontent.appendChild(data)

    if detail.get("language"):
        xcontent.setAttribute('xml:lang', detail.language)

    xentry.appendChild(xcontent)
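
# A rough driving sketch for content() above (Python 2 era, to match the
# code). FeedParserDict stands in for the detail objects feedparser
# produces (dict with attribute access); the entry element and markup are
# illustrative.
from xml.dom import minidom
from feedparser import FeedParserDict

xdoc = minidom.parseString('<entry xmlns="http://www.w3.org/2005/Atom"/>')
xentry = xdoc.documentElement
detail = FeedParserDict({'type': 'text/html', 'value': u'<p>hi</p>'})
content(xentry, 'content', detail, bozo=0)
print(xentry.toxml())
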
Example #4
    def runtest(self):
        if self.treeAPIs is None:
            pytest.skip("Treebuilder not loaded")

        p = html5parser.HTMLParser(tree=self.treeAPIs["builder"])

        input = self.test['data']
        fragmentContainer = self.test['document-fragment']
        expected = convertExpected(self.test['document'])

        scripting = False
        if 'script-on' in self.test:
            scripting = True

        with warnings.catch_warnings():
            warnings.simplefilter("error")
            try:
                if fragmentContainer:
                    document = p.parseFragment(input,
                                               fragmentContainer,
                                               scripting=scripting)
                else:
                    document = p.parse(input, scripting=scripting)
            except constants.DataLossWarning:
                pytest.skip("data loss warning")

        poutput = convertTreeDump(p.tree.testSerializer(document))
        namespace_expected = namespaceExpected(r"\1<html \2>", expected)
        if poutput != namespace_expected:
            pytest.skip("parser output incorrect")

        document = self.treeAPIs.get("adapter", lambda x: x)(document)

        try:
            output = treewalkers.pprint(Lint(
                self.treeAPIs["walker"](document)))
            output = sortattrs(output)
            expected = sortattrs(expected)
            diff = "".join(
                unified_diff([line + "\n" for line in expected.splitlines()],
                             [line + "\n" for line in output.splitlines()],
                             "Expected", "Received"))
            assert expected == output, "\n".join([
                "",
                "Input:",
                input,
                "",
                "Expected:",
                expected,
                "",
                "Received:",
                output,
                "",
                "Diff:",
                diff,
            ])
        except NotImplementedError:
            pytest.skip("tree walker NotImplementedError")
Example #5
def runSanitizerTest(name, expected, input, toxml=None):
    if toxml is None:
        toxml = toxmlFactory()
    expected = ''.join([
        toxml(token)
        for token in html5parser.HTMLParser().parseFragment(expected)
    ])
    expected = json.loads(json.dumps(expected))
    assert expected == sanitize_html(input)
Example #6
    def runtest(self):
        if self.treeClass is None:
            pytest.skip("Treebuilder not loaded")

        p = html5parser.HTMLParser(
            tree=self.treeClass,
            namespaceHTMLElements=self.namespaceHTMLElements)

        input = self.test['data']
        fragmentContainer = self.test['document-fragment']
        expected = convertExpected(self.test['document'])
        expectedErrors = self.test['errors'].split(
            "\n") if self.test['errors'] else []

        scripting = False
        if 'script-on' in self.test:
            scripting = True

        with warnings.catch_warnings():
            warnings.simplefilter("error")
            try:
                if fragmentContainer:
                    document = p.parseFragment(input,
                                               fragmentContainer,
                                               scripting=scripting)
                else:
                    document = p.parse(input, scripting=scripting)
            except constants.DataLossWarning:
                pytest.skip("data loss warning")

        output = convertTreeDump(p.tree.testSerializer(document))

        if self.namespaceHTMLElements:
            expected = namespaceExpected(r"\1<html \2>", expected)

        errorMsg = "\n".join([
            "\n\nInput:", input, "\nExpected:", expected, "\nReceived:", output
        ])
        assert expected == output, errorMsg

        errStr = []
        for (line, col), errorcode, datavars in p.errors:
            assert isinstance(datavars,
                              dict), "%s, %s" % (errorcode, repr(datavars))
            errStr.append("Line: %i Col: %i %s" %
                          (line, col, constants.E[errorcode] % datavars))

        errorMsg2 = "\n".join([
            "\n\nInput:", input, "\nExpected errors (" +
            str(len(expectedErrors)) + "):\n" + "\n".join(expectedErrors),
            "\nActual errors (" + str(len(p.errors)) + "):\n" +
            "\n".join(errStr)
        ])
        if False:  # we're currently not testing parse errors
            assert len(p.errors) == len(expectedErrors), errorMsg2
Example #7
def runParserTest(innerHTML, input, expected, errors, treeClass,
                  namespaceHTMLElements):
    #XXX - move this out into the setup function
    #concatenate all consecutive character tokens into a single token
    try:
        p = html5parser.HTMLParser(tree=treeClass,
                                   namespaceHTMLElements=namespaceHTMLElements)
    except constants.DataLossWarning:
        return

    try:
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            try:
                document = p.parse(input)
            except constants.DataLossWarning:
                return 
    except:
        errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                               u"\nTraceback:", traceback.format_exc()])
        assert False, errorMsg.encode("utf8")

    output = convertTreeDump(p.tree.testSerializer(document))

    expected = convertExpected(expected)
    if namespaceHTMLElements:
        expected = namespaceExpected(r"\1<html \2>", expected)

    errorMsg = u"\n".join([u"\n\nInput:", input, u"\nExpected:", expected,
                           u"\nReceived:", output])
    assert expected == output, errorMsg.encode("utf8")
    # errStr = [u"Line: %i Col: %i %s"%(line, col,
    #                                   constants.E[errorcode] % datavars if isinstance(datavars, dict) else (datavars,)) for
    #           ((line,col), errorcode, datavars) in p.errors]

    def datavars_sub(datavars, errorcode):
        if isinstance(datavars, dict):
            return datavars
        else:
            errstr = constants.E[errorcode]
            tgt = re.compile("(\%\(\w*\)s)")
            r = tgt.search(errstr)
            d = {}
            for i,g in enumerate(r.groups()):
                d[g[2:-2]] = datavars[i]
            return d
    errStr = [u"Line: %i Col: %i %s"%(line, col,
                                      constants.E[errorcode] % datavars_sub(datavars, errorcode)) for
              ((line,col), errorcode, datavars) in p.errors]

    errorMsg2 = u"\n".join([u"\n\nInput:", input,
                            u"\nExpected errors (" + str(len(errors)) + u"):\n" + u"\n".join(errors),
                            u"\nActual errors (" + str(len(p.errors)) + u"):\n" + u"\n".join(errStr)])
    if checkParseErrors:
            assert len(p.errors) == len(errors), errorMsg2.encode("utf-8")
Example #8
def favicon(page):
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(urlopen(page))
    favicon = urljoin(page, '/favicon.ico')
    for link in doc.getElementsByTagName('link'):
        if link.hasAttribute('rel') and link.hasAttribute('href'):
            if 'icon' in link.attributes['rel'].value.lower().split(' '):
                favicon = urljoin(page, link.attributes['href'].value)
    if urlopen(favicon).info()['content-length'] != '0':
        return favicon
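
# Hypothetical call for the helper above; it falls back to /favicon.ico at
# the site root and returns None when the icon response is empty. The URL
# is illustrative and the call needs network access.
icon = favicon('http://example.com/')
print(icon or 'no favicon found')
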
def parse(text):
    
    # First run through the Markdown parser
    text = markdown.markdown(text, extensions=["extra"], safe_mode=False)
    
    # Sanitize using html5lib
    bits = []
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    for token in parser.parseFragment(text).childNodes:
        bits.append(token.toxml())
    return "".join(bits)
Example #10
def runParserTest(innerHTML, input, expected, errors, treeClass,
                  namespaceHTMLElements):
    with warnings.catch_warnings(record=True) as caughtWarnings:
        warnings.simplefilter('always')
        p = html5parser.HTMLParser(tree=treeClass,
                                   namespaceHTMLElements=namespaceHTMLElements)

        try:
            if innerHTML:
                document = p.parseFragment(input, innerHTML)
            else:
                document = p.parse(input)
        except:
            errorMsg = '\n'.join([
                '\n\nInput:', input, '\nExpected:', expected, '\nTraceback:',
                traceback.format_exc()
            ])
            assert False, errorMsg

    otherWarnings = [
        x for x in caughtWarnings
        if not issubclass(x.category, constants.DataLossWarning)
    ]
    assert len(otherWarnings) == 0, [(x.category, x.message)
                                     for x in otherWarnings]
    if len(caughtWarnings):
        return

    output = convertTreeDump(p.tree.testSerializer(document))

    expected = convertExpected(expected)
    if namespaceHTMLElements:
        expected = namespaceExpected(r'\1<html \2>', expected)

    errorMsg = '\n'.join(
        ['\n\nInput:', input, '\nExpected:', expected, '\nReceived:', output])
    assert expected == output, errorMsg

    errStr = []
    for (line, col), errorcode, datavars in p.errors:
        assert isinstance(datavars,
                          dict), '%s, %s' % (errorcode, repr(datavars))
        errStr.append('Line: %i Col: %i %s' %
                      (line, col, constants.E[errorcode] % datavars))

    errorMsg2 = '\n'.join([
        '\n\nInput:', input,
        '\nExpected errors (' + str(len(errors)) + '):\n' + '\n'.join(errors),
        '\nActual errors (' + str(len(p.errors)) + '):\n' + '\n'.join(errStr)
    ])
    if checkParseErrors:
        assert len(p.errors) == len(errors), errorMsg2
Example #11
def test_fragment_single_char(tree, char):
    expected = [{'data': char, 'type': 'Characters'}]

    treeName, treeClass = tree
    if treeClass is None:
        pytest.skip("Treebuilder not loaded")

    parser = html5parser.HTMLParser(tree=treeClass["builder"])
    document = parser.parseFragment(char)
    document = treeClass.get("adapter", lambda x: x)(document)
    output = Lint(treeClass["walker"](document))

    assert list(output) == expected
def runTreewalkerEditTest(intext, expected, attrs_to_add, tree):
    """tests what happens when we add attributes to the intext"""
    treeName, treeClass = tree
    parser = html5parser.HTMLParser(tree=treeClass["builder"])
    document = parser.parseFragment(intext)
    for nom, val in attrs_to_add:
        set_attribute_on_first_child(document, nom, val, treeName)

    document = treeClass.get("adapter", lambda x: x)(document)
    output = treewalkers.pprint(treeClass["walker"](document))
    output = attrlist.sub(sortattrs, output)
    if output not in expected:
        raise AssertionError("TreewalkerEditTest: %s\nExpected:\n%s\nReceived:\n%s" % (treeName, expected, output))
def runParserTest(innerHTML, input, expected, errors, treeClass,
                  namespaceHTMLElements):
    warnings.resetwarnings()
    warnings.simplefilter(u"error")
    #XXX - move this out into the setup function
    #concatenate all consecutive character tokens into a single token
    try:
        p = html5parser.HTMLParser(tree=treeClass,
                                   namespaceHTMLElements=namespaceHTMLElements)
    except constants.DataLossWarning:
        return

    try:
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            try:
                document = p.parse(input)
            except constants.DataLossWarning:
                return
    except:
        errorMsg = u"\n".join([
            u"\n\nInput:", input, u"\nExpected:", expected, u"\nTraceback:",
            traceback.format_exc()
        ])
        assert False, errorMsg

    output = convertTreeDump(p.tree.testSerializer(document))

    expected = convertExpected(expected)
    if namespaceHTMLElements:
        expected = namespaceExpected(ur"\1<html \2>", expected)

    errorMsg = u"\n".join([
        u"\n\nInput:", input, u"\nExpected:", expected, u"\nReceived:", output
    ])
    assert expected == output, errorMsg
    errStr = [
        u"Line: %i Col: %i %s" % (line, col, constants.E[errorcode] %
                                  datavars if isinstance(datavars, dict) else
                                  (datavars, ))
        for ((line, col), errorcode, datavars) in p.errors
    ]

    errorMsg2 = u"\n".join([
        u"\n\nInput:", input, u"\nExpected errors (" + unicode(len(errors)) +
        u"):\n" + u"\n".join(errors), u"\nActual errors (" +
        unicode(len(p.errors)) + u"):\n" + u"\n".join(errStr)
    ])
    if checkParseErrors:
        assert len(p.errors) == len(errors), errorMsg2
Example #14
def parse(text):

    sanitizer.HTMLSanitizer.allowed_elements.extend(['iframe'])
    sanitizer.HTMLSanitizer.allowed_attributes.extend(
        ['scrolling', 'allowfullscreen', 'frameborder'])

    # First run through the Markdown parser
    text = markdown.markdown(text, extensions=["extra"], safe_mode=False)

    # Sanitize using html5lib
    bits = []
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
                                    tree=getTreeBuilder("dom"))
    for token in parser.parseFragment(text).childNodes:
        bits.append(token.toxml())
    return "".join(bits)
Example #15
    def runParserTest(self, innerHTML, input, expected, errors, treeClass):
        #XXX - move this out into the setup function
        #concatenate all consecutive character tokens into a single token
        p = html5parser.HTMLParser(tree=treeClass)

        if innerHTML:
            innerHTML = str(innerHTML, "utf8")

        if errors:
            errors = str(errors, "utf8")
            errors = errors.split("\n")

        expected = str(expected, "utf8")

        try:
            if innerHTML:
                document = p.parseFragment(io.BytesIO(input), innerHTML)
            else:
                try:
                    document = p.parse(io.BytesIO(input))
                except constants.DataLossWarning:
                    sys.stderr.write("Test input causes known dataloss, skipping")
                    return 
        except:
            errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"), 
                                  "\nExpected:", expected,
                                  "\nTraceback:", traceback.format_exc()])
            self.assertTrue(False, errorMsg)
        
        output = convertTreeDump(p.tree.testSerializer(document))
        output = attrlist.sub(sortattrs, output)
        
        expected = convertExpected(expected)
        expected = attrlist.sub(sortattrs, expected)
        errorMsg = "\n".join(["\n\nInput:", str(input, "utf8"), 
                              "\nExpected:", expected,
                              "\nReceived:", output])
        self.assertEquals(expected, output, errorMsg)
        errStr = ["Line: %i Col: %i %s %s"%(line, col, 
                                         constants.E[errorcode], datavars) for
                  ((line,col), errorcode, datavars) in p.errors]
        errorMsg2 = "\n".join(["\n\nInput:", str(input, "utf8"),
                               "\nExpected errors (" + str(len(errors)) + "):\n" + "\n".join(errors),
                               "\nActual errors (" + str(len(p.errors)) + "):\n" + "\n".join(errStr)])
        if checkParseErrors:
            self.assertEquals(len(p.errors), len(errors), errorMsg2)
Example #16
def parse():
    optParser = getOptParser()
    opts, args = optParser.parse_args()
    encoding = None

    try:
        f = args[-1]
        # Try opening from the internet
        if f.startswith('http://'):
            try:
                import urllib, cgi
                f = urllib.urlopen(f)
                contentType = f.headers.get('content-type')
                if contentType:
                    (mediaType, params) = cgi.parse_header(contentType)
                    encoding = params.get('charset')
            except:
                pass
        elif f == '-':
            f = sys.stdin
        else:
            try:
                # Try opening from file system
                f = open(f)
            except IOError:
                pass
    except IndexError:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    treebuilder = treebuilders.getTreeBuilder("simpleTree")

    #    if opts.xml:
    #        p = liberalxmlparser.XHTMLParser(tree=treebuilder)
    p = html5parser.HTMLParser(tree=treebuilder,
                               tokenizer=validator.HTMLConformanceChecker)

    document = p.parse(f, encoding=encoding)
    printOutput(p, document, opts)
Example #17
def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
    try:
        p = html5parser.HTMLParser(tree=treeClass["builder"])
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            document = p.parse(input)
    except constants.DataLossWarning:
        #Ignore testcases we know we don't pass
        return

    document = treeClass.get("adapter", lambda x: x)(document)
    try:
        output = convertTokens(treeClass["walker"](document))
        output = attrlist.sub(sortattrs, output)
        expected = attrlist.sub(sortattrs, convertExpected(expected))
        assert expected == output, "\n".join([
            "", "Input:", input, "", "Expected:", expected, "", "Received:",
            output
        ])
    except NotImplementedError:
        pass  # Amnesty for those that confess...
Example #18
def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
    warnings.resetwarnings()
    warnings.simplefilter('error')
    try:
        p = html5parser.HTMLParser(tree=treeClass['builder'])
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            document = p.parse(input)
    except constants.DataLossWarning:
        # Ignore testcases we know we don't pass
        return

    document = treeClass.get('adapter', lambda x: x)(document)
    try:
        output = treewalkers.pprint(treeClass['walker'](document))
        output = attrlist.sub(sortattrs, output)
        expected = attrlist.sub(sortattrs, convertExpected(expected))
        diff = ''.join(
            unified_diff([line + '\n' for line in expected.splitlines()],
                         [line + '\n' for line in output.splitlines()],
                         'Expected', 'Received'))
        assert expected == output, '\n'.join([
            '',
            'Input:',
            input,
            '',
            'Expected:',
            expected,
            '',
            'Received:',
            output,
            '',
            'Diff:',
            diff,
        ])
    except NotImplementedError:
        pass  # Amnesty for those that confess...
def runTreewalkerTest(innerHTML, input, expected, errors, treeClass):
    warnings.resetwarnings()
    warnings.simplefilter("error")
    try:
        p = html5parser.HTMLParser(tree=treeClass["builder"])
        if innerHTML:
            document = p.parseFragment(input, innerHTML)
        else:
            document = p.parse(input)
    except constants.DataLossWarning:
        # Ignore testcases we know we don't pass
        return

    document = treeClass.get("adapter", lambda x: x)(document)
    try:
        output = convertTokens(treeClass["walker"](document))
        output = attrlist.sub(sortattrs, output)
        expected = attrlist.sub(sortattrs, convertExpected(expected))
        diff = "".join(
            unified_diff([line + "\n" for line in expected.splitlines()],
                         [line + "\n" for line in output.splitlines()],
                         "Expected", "Received"))
        assert expected == output, "\n".join([
            "",
            "Input:",
            input,
            "",
            "Expected:",
            expected,
            "",
            "Received:",
            output,
            "",
            "Diff:",
            diff,
        ])
    except NotImplementedError:
        pass  # Amnesty for those that confess...
Example #20
def sanitize_html(self, stream):
    return ''.join([
        token.toxml() for token in html5parser.HTMLParser(
            tokenizer=sanitizer.HTMLSanitizer).parseFragment(
                stream).childNodes
    ])
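
# Because the method above never touches self, the same logic works as a
# module-level helper. A sketch assuming an html5lib 0.x release, where
# HTMLParser still accepts a tokenizer argument and the default tree
# exposes childNodes/toxml(); the function name is hypothetical.
from html5lib import html5parser, sanitizer

def sanitize_html_standalone(stream):
    return ''.join(
        token.toxml()
        for token in html5parser.HTMLParser(
            tokenizer=sanitizer.HTMLSanitizer).parseFragment(stream).childNodes)

print(sanitize_html_standalone('<b>ok</b>'))
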
Example #21
def scrub(feed_uri, data):

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        if tag.find('lang') >= 0: tag = 'language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    if config.title_type(feed_uri):
        title_type = config.title_type(feed_uri)
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    if config.summary_type(feed_uri):
        summary_type = config.summary_type(feed_uri)
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types
    if config.content_type(feed_uri):
        content_type = config.content_type(feed_uri)
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        from shell.tmpl import stripHtml
        if data.feed.has_key('author_detail') and \
            data.feed.author_detail.has_key('name'):
            data.feed.author_detail['name'] = \
                str(stripHtml(data.feed.author_detail.name))
        for entry in data.entries:
            if entry.has_key('author_detail') and \
                entry.author_detail.has_key('name'):
                entry.author_detail['name'] = \
                    str(stripHtml(entry.author_detail.name))
            if entry.has_key('source'):
                source = entry.source
                if source.has_key('author_detail') and \
                    source.author_detail.has_key('name'):
                    source.author_detail['name'] = \
                        str(stripHtml(source.author_detail.name))

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        now = time.gmtime()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        for entry in data.entries:
            if entry.has_key('published_parsed') and entry['published_parsed']:
                if entry['published_parsed'] > now:
                    del entry['published_parsed']
                    del entry['published']
            if entry.has_key('updated_parsed') and entry['updated_parsed']:
                if entry['updated_parsed'] > now:
                    del entry['updated_parsed']
                    del entry['updated']
    elif future_dates == 'ignore_entry':
        now = time.time()
        if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
            if data.feed['updated_parsed'] > now:
                del data.feed['updated_parsed']
        data.entries = [
            entry for entry in data.entries
            if (not entry.has_key('published_parsed')
                or not entry['published_parsed']
                or entry['published_parsed'] <= now) and
            (not entry.has_key('updated_parsed') or not entry['updated_parsed']
             or entry['updated_parsed'] <= now)
        ]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            if not node.has_key('type'): continue
            if not 'html' in node['type']: continue
            if not node.has_key('value'): continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        if entry.has_key('source') and \
                            entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            if node['value']:
                # Run this through HTML5's sanitizer
                doc = None
                if 'xhtml' in node['type']:
                    try:
                        from xml.dom import minidom
                        doc = minidom.parseString(node['value'])
                    except:
                        node['type'] = 'text/html'

                if not doc:
                    from html5lib import html5parser, treebuilders, sanitizer
                    p = html5parser.HTMLParser(
                        tree=treebuilders.getTreeBuilder('dom'),
                        tokenizer=sanitizer.HTMLSanitizer)
                    doc = p.parseFragment(node['value'], encoding='utf-8')

                from html5lib import treewalkers, serializer
                walker = treewalkers.getTreeWalker('dom')(doc)
                xhtml = serializer.HTMLSerializer(inject_meta_charset=False)
                tree = xhtml.serialize(walker, encoding='utf-8')
                node['value'] = ''.join([str(token) for token in tree])
Example #22
def test_unicode_file(self):
    parser = html5parser.HTMLParser()
    parser.parse(io.StringIO("a"))
Example #23
def test_namespace_html_elements_1_etree(self):
    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    doc = parser.parse("<html></html>")
    self.assertTrue(doc.tag == "html")
Example #24
def test_namespace_html_elements_0_etree(self):
    parser = html5parser.HTMLParser(namespaceHTMLElements=True)
    doc = parser.parse("<html></html>")
    self.assertTrue(doc.tag == "{%s}html" % (namespaces["html"],))
Example #25
def test_namespace_html_elements_1_dom(self):
    parser = html5parser.HTMLParser(tree=self.dom_tree, namespaceHTMLElements=False)
    doc = parser.parse("<html></html>")
    self.assertTrue(doc.childNodes[0].namespaceURI is None)
Example #26
def test_line_counter(self):
    # http://groups.google.com/group/html5lib-discuss/browse_frm/thread/f4f00e4a2f26d5c0
    parser = html5parser.HTMLParser(tree=self.dom_tree)
    parser.parse("<pre>\nx\n&gt;\n</pre>")
Example #27
def test_assertDoctypeCloneable(self):
    parser = html5parser.HTMLParser(tree=self.dom_tree)
    doc = parser.parse('<!DOCTYPE HTML>')
    self.assertTrue(doc.cloneNode(True))
def test_namespace_html_elements_0(self):
    parser = html5parser.HTMLParser(namespaceHTMLElements=True)
    doc = parser.parse(u"<html></html>")
    self.assert_(doc.childNodes[0].namespace == namespaces[u"html"])

def test_namespace_html_elements_1(self):
    parser = html5parser.HTMLParser(namespaceHTMLElements=False)
    doc = parser.parse(u"<html></html>")
    self.assert_(doc.childNodes[0].namespace == None)
Example #30
def parse():
    optParser = getOptParser()
    opts, args = optParser.parse_args()
    encoding = "utf8"

    try:
        f = args[-1]
        # Try opening from the internet
        if f.startswith('http://'):
            try:
                import urllib.request
                import urllib.parse
                import urllib.error
                import cgi
                f = urllib.request.urlopen(f)
                contentType = f.headers.get('content-type')
                if contentType:
                    (mediaType, params) = cgi.parse_header(contentType)
                    encoding = params.get('charset')
            except:
                pass
        elif f == '-':
            f = sys.stdin
            if sys.version_info[0] >= 3:
                encoding = None
        else:
            try:
                # Try opening from file system
                f = open(f, "rb")
            except IOError as e:
                sys.stderr.write("Unable to open file: %s\n" % e)
                sys.exit(1)
    except IndexError:
        sys.stderr.write("No filename provided. Use -h for help\n")
        sys.exit(1)

    treebuilder = treebuilders.getTreeBuilder(opts.treebuilder)

    p = html5parser.HTMLParser(tree=treebuilder, debug=opts.log)

    if opts.fragment:
        parseMethod = p.parseFragment
    else:
        parseMethod = p.parse

    if opts.profile:
        import cProfile
        import pstats
        cProfile.runctx("run(parseMethod, f, encoding, scripting)", None,
                        {"run": run,
                         "parseMethod": parseMethod,
                         "f": f,
                         "encoding": encoding,
                         "scripting": opts.scripting},
                        "stats.prof")
        # XXX - We should use a temp file here
        stats = pstats.Stats('stats.prof')
        stats.strip_dirs()
        stats.sort_stats('time')
        stats.print_stats()
    elif opts.time:
        import time
        t0 = time.time()
        document = run(parseMethod, f, encoding, opts.scripting)
        t1 = time.time()
        if document:
            printOutput(p, document, opts)
            t2 = time.time()
            sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)" % (t1 - t0, t2 - t1))
        else:
            sys.stderr.write("\n\nRun took: %fs" % (t1 - t0))
    else:
        document = run(parseMethod, f, encoding, opts.scripting)
        if document:
            printOutput(p, document, opts)
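
# The run() helper referenced above ships alongside this script in
# html5lib's parse.py; a plausible sketch of its shape (the
# override_encoding kwarg is an assumption for a recent html5lib):
import traceback

def run(parseMethod, f, encoding, scripting):
    # Parse and swallow failures so the timing/profiling callers above can
    # report them without crashing.
    try:
        document = parseMethod(f, override_encoding=encoding, scripting=scripting)
    except Exception:
        document = None
        traceback.print_exc()
    return document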