Example #1
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = replace_chars.sub('', raw)

    stream_class = partial(FastStream, track_position=line_numbers)
    stream = stream_class(raw)
    builder = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder, linenumber_attribute=linenumber_attribute)
    while True:
        try:
            parser = HTMLParser(tree=builder, track_positions=line_numbers, namespaceHTMLElements=not discard_namespaces)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=DataLossWarning)
                try:
                    parser.parse(stream, parseMeta=False, useChardet=False)
                finally:
                    parser.tree.proxy_cache = None
        except NamespacedHTMLPresent as err:
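            # strip the offending namespace prefix from the markup and re-parse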
            raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix, lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
            stream = stream_class(raw)
            continue
        break
    root = parser.tree.getDocument()
    if (discard_namespaces and root.tag != 'html') or (
        not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
Example #2
        def validate_url(self, url, use_w3c=True, quiet=True):
            'validate urls with the w3c validator; needs an Internet connection'

            client = Client()
            response = client.get(url, follow=True)
            if response.status_code == 200:
                src = response.content
                treebuilder = treebuilders.getTreeBuilder("etree")
                parser = HTMLParser(tree=treebuilder, strict=True)
                try:
                    # strict mode raises at the first parse error; the error is
                    # still recorded on parser.errors, which is checked below
                    parser.parse(src)
                except Exception:
                    pass

                if not parser.errors and use_w3c:
                    #uploading to w3c
                    w3c = w3c_client(src)
                    if w3c and not w3c[0]:
                        print('%s: %s' % (
                            url,
                            w3c[1],
                        ))
                        if not quiet:
                            for i in w3c[2]['messages']:
                                print(i['messageid'])
                                print('\t%s' % (i['message'], ))
                        #self.assertTrue(w3c[0])
            else:
                print('skipping html check: %s' % (response.status_code,))
Example #3
    def runValidatorTest(self, test):
        p = HTMLParser(tokenizer=HTMLConformanceChecker)
        p.parse(test['input'])
        errorCodes = [errorcode for position, errorcode, datavars in p.errors]
        if 'fail-if' in test:
            self.assertNotIn(test['fail-if'], errorCodes)
        if 'fail-unless' in test:
            self.assertIn(test['fail-unless'], errorCodes)
Example #4
    def validate_html(self, response):
        # only import this stuff if we need it!
        from html5lib.html5parser import HTMLParser
        from html5lib.filters.validator import HTMLConformanceChecker
        import pprint

        p = HTMLParser()
        p.parse(response.body)

        if (p.errors and
            not (len(p.errors) == 1 and p.errors[0][1] == 'unknown-doctype')):

            lines = response.body.splitlines()
            for (line, col), error, datavars in p.errors:
                print("----------------------------------")
                print("Error: %s on line %s, column %s" % (error, line, col))
                print("%5d: %s" % (line, lines[line - 1]))
                print("      %s^" % (" " * col,))

            self.fail()
Example #5
def cutHtml(text, max_len):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    etree_document = parser.parse(text)
    sentinel = Sentinel(max_len)
    processItem(etree_document.getroot(), sentinel)

    if sentinel.stop:
        walker = treewalkers.getTreeWalker("lxml")
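        # getroot() is <html>; getchildren()[1] is its <body> element, whose
        # serialized contents (minus the wrapping tags) are returned below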
        stream = walker(etree_document.getroot().getchildren()[1])
        s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
        output_generator = s.serialize(stream)

        output = list(output_generator)
        output = output[1:-1]  # remove <body></body>
        return ''.join(output)
    return None
Example #6
def openLogin():
    logging.info('Loading login page...')
    s = requests.Session()
    # trigger login page
    resp = s.get('https://developer.apple.com/download/', headers={
        'User-Agent': UA
    })
    resp.raise_for_status()
    parser = HTMLParser(getTreeBuilder('dom'))
    dom = parser.parse(resp.text)
    form_input = bakeRequest(dom)
    logging.info('Logging in ...')
    resp = s.post('https://idmsa.apple.com/IDMSWebAuth/authenticate',
                  headers={'User-Agent': UA}, data=form_input)
    resp.raise_for_status()
    if 'authenticate' in resp.url:
        raise Exception('Login failed')
    logging.info('Fetching download token...')
    resp = s.post('https://developer.apple.com/services-account/QH65B2/downloadws/listDownloads.action',
                  headers={'User-Agent': UA}, data='')
    resp.raise_for_status()
    generateDownload(s.cookies.items())