def test_declaration_incomplete(self):
    # Verify that the lenient declaration handling copes with incomplete
    # tags: the document is fed to the parser as two chunks, once for each
    # split position, and the same tokens are expected every time (the
    # expected list holds no token for the "bad comment").
    #
    # Note unrelated problem: without the leading space in `doc`, there is
    # a problem parsing the incomplete <html>. Investigate?!
    doc = " <html>A<!-- bad comment -->B</html>"

    for split_at in range(1, len(doc) - 1):
        head, tail = doc[:split_at], doc[split_at:]
        fp = ChunkedStringIO([head, tail])
        tokens = hpp.generate_tokens(fp)
        expected = [
            (TAG, u'html', []),
            (DATA, u'A'),
            (DATA, u'B'),
            (ENDTAG, u'html'),
        ]
        self._test_generator1(tokens, expected)
def process(fp, out, meta):
    """Feed the HTML read from fp to out and collect metadata into meta.

    Tokens are pulled from html_pull_parser.generate_tokens(fp) and handled
    in one flat loop, because every structural element of an HTML document
    (<html>, <head>, <body>) is optional.

    Text is written with out.out(), tags with out.outTag() and alternative
    text (form inputs, images) with out.outAlt().  out.outHeader(meta) is
    called on the body start tag and out.close(meta) once the token stream
    is exhausted.  The contents of <script>, <style> and <select> are
    skipped.  meta receives the 'title', 'description' and 'keywords'
    entries when the document provides them.

    Return the tuple (has_html, has_frameset, has_common_tag):
      has_html       -- a start tag mapped to sHTML was seen
      has_frameset   -- a start tag mapped to sFRAMESET was seen
      has_common_tag -- a start tag with a positive id in starttag_dict
                        was seen
    """
    has_html = False
    has_frameset = False
    has_common_tag = False
    first_td = False        # state for iterating td inside tr

    iterator = iter(html_pull_parser.generate_tokens(fp))

    # A token read ahead by the <title> handler that still has to go through
    # the normal dispatch below.
    pending = None

    while True:
        if pending is not None:
            token, pending = pending, None
        else:
            token = next(iterator, None)
            if token is None:
                break

        kind = token[0]

        if kind == DATA:
            out.out(token[1])

        elif kind == TAG:
            tag = token[1]
            tag_id = starttag_dict.get(tag, -1)
            if tag_id > 0:
                has_common_tag = True

            if tag_id == sOUTP:
                out.outTag('p')

            elif tag_id == sOUTTAG:
                out.outTag(tag)

            elif tag_id == sTR:
                first_td = True

            elif tag_id == sTDTH:
                # separate the cells of a row with a space, except before
                # the first one
                if first_td:
                    first_td = False
                else:
                    out.out(' ')

            elif tag_id == sINPUT:
                attrs = token[2]
                itype = _getvalue(attrs, 'type')
                if itype == 'checkbox':
                    if _hasattr(attrs, 'checked'):
                        out.out('[*] ')
                    else:
                        out.out('[ ] ')
                elif itype == 'radio':
                    if _hasattr(attrs, 'checked'):
                        out.out('(*) ')
                    else:
                        out.out('( ) ')
                elif itype == 'image':
                    alt = _getvalue(attrs, 'alt') or _getvalue(attrs, 'value')
                    out.outAlt(saxutils.unescape(alt))
                elif itype == 'password':
                    out.outAlt('***')
                elif itype == 'hidden':
                    pass
                else:
                    value = _getvalue(attrs, 'value')
                    out.outAlt(saxutils.unescape(value))

            elif tag_id == sIMG:
                attrs = token[2]
                alt = _getvalue(attrs, 'alt')
                if alt:
                    out.outAlt(saxutils.unescape(alt))

            elif tag_id == sHTML:
                has_html = True
                out.notifyHtml()

            elif tag_id == sBODY:
                out.outHeader(meta)

            elif tag_id == sFRAMESET:
                has_frameset = True

            elif tag_id == sTITLE:
                title_parts = []
                for nxt in iterator:
                    if nxt[0] == DATA:
                        title_parts.append(nxt[1])
                    elif nxt in ((ENDTAG, 'title'),     # only </title> is valid
                                 (ENDTAG, 'head')):     # in case no </title>
                        break
                    elif nxt[0] == TAG and nxt[1] == 'body':
                        # In case no </title>.  Start tag tokens carry their
                        # attribute list as a third item, so they must be
                        # matched on kind and name rather than compared to
                        # a 2-tuple.  Hand the token back to the main loop
                        # so that <body> is still processed.
                        pending = nxt
                        break
                meta['title'] = _collapse(''.join(title_parts))

            elif tag_id == sMETA:
                attrs = token[2]
                name = _getvalue(attrs, 'name').lower()
                content = _getvalue(attrs, 'content')
                if name == 'description':
                    meta['description'] = saxutils.unescape(_collapse(content))
                elif name == 'keywords':
                    meta['keywords'] = saxutils.unescape(_collapse(content))

            elif tag_id == sSCRIPT:
                for nxt in iterator:
                    if nxt == (ENDTAG, 'script'):
                        break

            elif tag_id == sSTYLE:
                for nxt in iterator:
                    if nxt == (ENDTAG, 'style'):
                        break

            elif tag_id == sSELECT:
                for nxt in iterator:
                    if nxt == (ENDTAG, 'select'):
                        break

        elif kind == ENDTAG:
            tag = token[1]
            tag_id = endtag_dict.get(tag, -1)
            if tag_id == eCLOSE_TAG:
                out.outTag('/' + tag)
            elif tag_id == eBREAK_LINE:
                out.outTag('br')

    out.close(meta)

    return has_html, has_frameset, has_common_tag
def scan_html(fp, baseuri, append):
    """Scan the HTML in fp for style content and URI-carrying attributes.

    The content of <style> blocks and of style attributes is handed to
    _scan_html_style().  For every start tag that has a value for the
    attribute LINKABLE_TAGS names for it, append(baseuri, uri, ctype, tag)
    is called; <link> tags are reported only when they refer to CSS.
    """
    tokens = hpp.generate_tokens(fp, comment=True)

    for token in tokens:
        if token[0] != hpp.TAG:
            continue
        tag = token[1]

        # ------------------------------------------------------------------
        # <style> block
        if tag == 'style':
            # HACK: <style> should only be valid inside <head>
            css_parts = []
            for token in tokens:
                kind = token[0]
                if kind in (hpp.DATA, hpp.COMMENT):
                    # COMMENT as well: CSS enclosed by an HTML comment!
                    css_parts.append(token[1])
                elif kind == hpp.ENDTAG:
                    break
                elif kind == hpp.TAG and token[1] != 'style':
                    # No </style>??? Any other open tag closes <style>.
                    # (A <style> following a <style> is simply treated as
                    # part of the same block.)
                    break
            else:
                # The stream is exhausted: make sure the next step knows
                # there is no unprocessed token.
                token = None

            _scan_html_style(tag, ''.join(css_parts), baseuri, append)

            # look at the last unprocessed token
            if not token:
                # TODO: test
                break
            if token[0] == hpp.ENDTAG:
                # hopefully this ends with a valid </style>
                # TODO: test
                continue
            # We got a (non-style) begin TAG??? OK, process this tag.
            # TODO: test
            tag = token[1]

        # ------------------------------------------------------------------
        # collect the relevant attributes of the tag
        is_link = (tag == 'link')
        uri_attr = LINKABLE_TAGS.get(tag, '')

        uri = ''
        style = ''
        rel = ''
        ctype = ''
        for attr_name, attr_value in token[2]:
            # TODO: need to XML decode?
            if attr_name == uri_attr:
                uri = attr_value
            elif attr_name == 'style':
                style = attr_value
            elif is_link:
                if attr_name == 'rel':
                    rel = attr_value
                elif attr_name == 'type':
                    ctype = attr_value

        # ------------------------------------------------------------------
        # style attribute
        if style:
            _scan_html_style(tag, style, baseuri, append)

        # ------------------------------------------------------------------
        # uri attributes (href, src, etc)
        if not uri:
            continue

        # Note: the ctype is advisory, the content-type from http may not be
        # consistent.  Everything that is not HTML or CSS is APPLICATION.
        if tag in ('frame', 'iframe'):
            ctype = TEXT_HTML
        elif not is_link:
            ctype = APPLICATION
        elif rel.lower() == 'stylesheet':
            # TODO: rel="alternate stylesheet"?
            ctype = TEXT_CSS
        elif ctype != TEXT_CSS:
            # only want CSS from <link>
            continue

        append(baseuri, uri, ctype, tag)
def _test_generator(self, doc, expect, **args):
    """Tokenize the string doc and check the tokens against expect.

    Extra keyword arguments are forwarded to hpp.generate_tokens(); the
    comparison itself is done by self._test_generator1().
    """
    self._test_generator1(
        hpp.generate_tokens(StringIO.StringIO(doc), **args),
        expect,
    )
def test_0(self):
    # A document consisting of a single non-ASCII character is expected to
    # produce exactly one DATA token holding the unicode character U+00E5.
    fp = StringIO.StringIO("å")
    produced = list(hpp.generate_tokens(fp))
    self.assertEqual(produced, [(DATA, u'\u00e5')])