Beispiel #1
0
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    """Parse HTML markup and return the root ``html`` element of the tree.

    :param raw: Markup to parse. Bytes are decoded first, text is used as is.
    :param decoder: Optional callable used to decode ``raw`` when it is
        bytes; when ``None``, ``xml_to_unicode`` is used instead.
    :param log: Accepted for interface compatibility; not referenced here.
    :param discard_namespaces: When true, build the tree with
        ``NoNamespaceTreeBuilder`` and expect a plain ``html`` root tag.
    :param line_numbers: Forwarded to the stream (``track_position``) and
        the parser (``track_positions``).
    :param linenumber_attribute: Forwarded to the tree builder.
    :param replace_entities: When true, run ``xml_replace_entities`` on the
        markup before parsing.
    :param fix_newlines: When true, normalise CR LF and bare CR line endings
        to LF before parsing.
    :raises ValueError: If the resulting root element is not the expected
        ``html`` element (or carries a prefix in the namespaced case).
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = replace_chars.sub('', raw)

    stream_class = partial(FastStream, track_position=line_numbers)
    stream = stream_class(raw)
    builder = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder, linenumber_attribute=linenumber_attribute)
    while True:
        try:
            parser = HTMLParser(tree=builder, track_positions=line_numbers, namespaceHTMLElements=not discard_namespaces)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=DataLossWarning)
                try:
                    parser.parse(stream, parseMeta=False, useChardet=False)
                finally:
                    # Always drop the proxy cache, even when parsing fails.
                    parser.tree.proxy_cache = None
        except NamespacedHTMLPresent as err:
            # Strip the offending "prefix:" from opening and closing tags and
            # parse again from scratch. The prefix is escaped so that any
            # regex metacharacter in it (e.g. ".") is matched literally.
            prefixed_tag = r'<\s*/{0,1}(%s:)' % re.escape(err.prefix)
            raw = re.sub(prefixed_tag, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
            stream = stream_class(raw)
            continue
        break
    root = parser.tree.getDocument()
    if (discard_namespaces and root.tag != 'html') or (
        not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
Beispiel #2
0
        def validate_url(self, url, use_w3c=True, quite=True):
            """Fetch ``url`` with the test client and check the returned HTML.

            The page is first parsed with a strict ``HTMLParser``. If that
            records no errors and ``use_w3c`` is true, the content is also
            passed to ``w3c_client`` (needs an Internet connection) and any
            reported failure is printed.

            :param url: URL to request; redirects are followed.
            :param use_w3c: Also run the W3C check when the local parse is
                clean.
            :param quite: When false, print each individual W3C message in
                addition to the summary line.
            """
            client = Client()
            response = client.get(url, follow=True)
            if response.status_code == 200:
                src = response.content
                treebuilder = treebuilders.getTreeBuilder("etree")
                parser = HTMLParser(tree=treebuilder, strict=True)
                try:
                    parser.parse(src)
                except Exception:
                    # Deliberately best effort: a failed parse is not fatal
                    # here, the outcome is read from parser.errors below.
                    pass

                if not parser.errors and use_w3c:
                    #uploading to w3c
                    # NOTE(review): w3c_client appears to return a sequence of
                    # (is_valid, summary, details) — confirm against its definition.
                    w3c = w3c_client(src)
                    if w3c and not w3c[0]:
                        print('%s: %s' % (
                            url,
                            w3c[1],
                        ))
                        if not quite:
                            for i in w3c[2]['messages']:
                                print(i['messageid'])
                                print('\t%s' % (i['message'], ))
                        #self.assertTrue(w3c[0])
            else:
                # Interpolate the status code; passing it as a second print
                # argument would leave the "%s" placeholder unformatted.
                print('skipping html check %s' % (response.status_code, ))
Beispiel #3
0
 def runValidatorTest(self, test):
     """Parse ``test['input']`` and check the error codes the parser recorded.

     ``test`` is a mapping with an ``'input'`` entry and optional
     ``'fail-if'`` / ``'fail-unless'`` entries naming an error code that must
     be absent from, or present in, the recorded errors respectively.
     """
     p = HTMLParser(tokenizer=HTMLConformanceChecker)
     p.parse(test['input'])
     errorCodes = [errorcode for position, errorcode, datavars in p.errors]
     # Use ``in`` and assertFalse/assertTrue: dict.has_key and the
     # failIf/failUnless aliases are unavailable on current Python versions.
     if 'fail-if' in test:
         self.assertFalse(test['fail-if'] in errorCodes)
     if 'fail-unless' in test:
         self.assertTrue(test['fail-unless'] in errorCodes)
Beispiel #4
0
def cutHtml(text, max_len):
    """Return re-serialized markup for ``text`` if the length limit was hit.

    ``text`` is parsed into an lxml tree, which is handed to ``processItem``
    together with a ``Sentinel`` built from ``max_len``. When the sentinel
    reports ``stop``, the second child of the root is serialized and the
    first and last serializer tokens are dropped before joining. Otherwise
    ``None`` is returned.
    """
    lxml_builder = treebuilders.getTreeBuilder("lxml")
    document = HTMLParser(tree=lxml_builder).parse(text)
    limit = Sentinel(max_len)
    processItem(document.getroot(), limit)

    if not limit.stop:
        return None

    walker_class = treewalkers.getTreeWalker("lxml")
    # The second child of the root is presumably <body> (the first being
    # <head>) — the slice below relies on that.
    token_stream = walker_class(document.getroot().getchildren()[1])
    html_serializer = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    pieces = list(html_serializer.serialize(token_stream))
    # remove <body></body>
    return ''.join(pieces[1:-1])
Beispiel #5
0
    def validate_html(self, response):
        """Parse ``response.body`` and fail the test if it has parse errors.

        A single ``'unknown-doctype'`` error is tolerated. For anything else
        every error is printed with its source line and a caret under the
        reported column, and the test is then failed.
        """
        # only import this stuff if we need it!
        from html5lib.html5parser import HTMLParser

        p = HTMLParser()
        p.parse(response.body)

        errors = p.errors
        only_unknown_doctype = (len(errors) == 1
                                and errors[0][1] == 'unknown-doctype')
        if not errors or only_unknown_doctype:
            return

        lines = response.body.splitlines()
        # Single-argument print(...) behaves the same on Python 2 and 3.
        for (line, col), error, _datavars in errors:
            print("----------------------------------")
            print("Error: %s on line %s, column %s" % (error, line, col))
            print("%5d: %s" % (line, lines[line - 1]))
            print("      %s^" % (" " * col, ))

        self.fail("HTML validation failed with %d parse error(s)" % len(errors))
Beispiel #6
0
 def __init__(self,value,element):
   """Check ``value`` as an HTML fragment and log the outcome.

   Logs ``SecurityRisk`` when the text contains ``<?import ``, then parses
   the fragment with a strict ``HTMLParser``. On success ``ValidHtml`` is
   logged (while ``self.valid`` is still true) and every parsed element is
   passed to ``self.handle_tag``. On ``ParseError`` a ``NotHtml`` event is
   logged on ``element`` with a line/column offset.
   """
   self.element=element
   self.valid = True
   self.parser = HTMLParser(strict=True)
   if '<?import ' in value.lower():
     self.log(SecurityRisk({"parent":self.element.parent.name, "element":self.element.name, "tag":"?import"}))
   try:
     etree = self.parser.parseFragment(value)
     if self.valid:
       self.log(ValidHtml({"parent":self.element.parent.name, "element":self.element.name}))
     for tag in etree.iter():
       if tag.tag != "DOCUMENT_FRAGMENT":
         # Strip any "{namespace}" part so handle_tag sees the bare tag name.
         self.handle_tag(tag.tag.split('}')[-1], tag.attrib, tag.text)
   except ParseError as msg:
     element = self.element
     # Offset relative to the locator's current position.
     offset = [element.line - element.dispatcher.locator.getLineNumber(),
               - element.dispatcher.locator.getColumnNumber()]
     # Raw string so "\d" is a regex class, not an invalid string escape.
     match = re.search(r', at line (\d+), column (\d+)',str(msg))
     if match: offset[0] += int(match.group(1))-1
     element.log(NotHtml({"parent":element.parent.name, "element":element.name, "message":"Invalid HTML", "value": str(msg)}),offset)
Beispiel #7
0
def thumbnails(html):
    """
    Rewrite the ``src`` of every ``img`` tag in the given HTML string to
    a thumbnail path and return the serialized children of ``body``.

    Only images that carry ``src``, ``width`` and ``height`` attributes
    are rewritten; the new path is ``settings.MEDIA_URL`` followed by the
    result of the ``thumbnail`` template tag.
    """
    from django.conf import settings
    from html5lib.treebuilders import getTreeBuilder
    from html5lib.html5parser import HTMLParser
    #from mezzanine.core.templatetags.mezzanine_tags import thumbnail
    from asylum_custom.templatetags.asylum_tags import thumbnail

    dom_parser = HTMLParser(tree=getTreeBuilder("dom"))
    document = dom_parser.parse(html)
    for image in document.getElementsByTagName("img"):
        source = image.getAttribute("src")
        width = image.getAttribute("width")
        height = image.getAttribute("height")
        if not (source and width and height):
            continue
        thumb_url = settings.MEDIA_URL + thumbnail(source, width, height)
        image.setAttribute("src", thumb_url)
    body = document.getElementsByTagName("body")[0]
    return "".join(child.toxml() for child in body.childNodes)
Beispiel #8
0
def openLogin():
    """Log in through the download page and hand the cookies on.

    Fetches the download page, parses it into a DOM and lets ``bakeRequest``
    build the login form data, posts that to the authenticate endpoint,
    requests the download listing, and finally passes the session cookies
    to ``generateDownload``. Any HTTP error status raises via
    ``raise_for_status``; a post-login URL that still contains
    ``authenticate`` raises ``Exception('Login failed')``.
    """
    download_page_url = 'https://developer.apple.com/download/'
    authenticate_url = 'https://idmsa.apple.com/IDMSWebAuth/authenticate'
    list_downloads_url = (
        'https://developer.apple.com/services-account/QH65B2/downloadws/listDownloads.action')

    logging.info('Loading login page...')
    session = requests.Session()
    # trigger login page
    login_page = session.get(download_page_url, headers={'User-Agent': UA})
    login_page.raise_for_status()

    dom_parser = HTMLParser(getTreeBuilder('dom'))
    login_dom = dom_parser.parse(login_page.text)
    form_input = bakeRequest(login_dom)

    logging.info('Logging in ...')
    auth_response = session.post(authenticate_url,
                                 headers={'User-Agent': UA},
                                 data=form_input)
    auth_response.raise_for_status()
    # Still on an "authenticate" URL after the POST means the login failed.
    if auth_response.url.find('authenticate') > 0:
        raise Exception('Login failed')

    logging.info('Fetching download token...')
    token_response = session.post(list_downloads_url,
                                  headers={'User-Agent': UA},
                                  data='')
    token_response.raise_for_status()
    generateDownload(session.cookies.items())
Beispiel #9
0
def fixHtml(html):
    """Parse ``html`` as a fragment and return its child nodes joined as XML.

    Each child node of the parsed fragment is serialized with ``toxml()``
    and the results are concatenated in order.
    """
    fragment = HTMLParser().parseFragment(html)
    return ''.join(node.toxml() for node in fragment.childNodes)