def findMetaHttpEquiv(first_block):
    """Find the charset declared by a <meta http-equiv="content-type"> tag.

    first_block is the leading chunk of an HTML document.  It is tokenized
    and the first <meta> start tag whose http-equiv attribute equals
    'content-type' (compared case-insensitively) is used: its content
    attribute is handed to findCharSet() and that result is returned.

    Returns None when no such tag is present, or when the tokenizer gives
    up with sgmllib.SGMLParseError (which is logged, not propagated).
    """
    fp = StringIO.StringIO(first_block)
    try:
        for token in generator_parser.generate_tokens(fp):
            # Start-tag tokens carry their attributes as a third field,
            # so only the first two fields identify the tag.
            if token[:2] != (generator_parser.TAG, 'meta'):
                continue
            attrs = token[2]
            if _getvalue(attrs, 'http-equiv').lower() == 'content-type':
                return findCharSet(_getvalue(attrs, 'content'))
    except sgmllib.SGMLParseError as e:
        # ParseError not uncommon, just log
        log.warn('Error looking for <meta> encoding "%s"', str(e))
    return None
def test_declaration_incomplete(self):
    """Check that the lenient declaration handling copes with incomplete tags.

    The document is cut in two at each offset in range(1, len(doc)-1) and
    fed to the tokenizer as two chunks.  Every cut must yield the same
    token sequence, in which the comment does not appear.
    """
    # Note unrelated problem: without the initial space in the document
    # below, there is a problem in parsing the incomplete <html>.
    # Investigate?!
    doc = " <html>A<!-- bad comment -->B</html>"

    for cut in range(1, len(doc) - 1):
        head, tail = doc[:cut], doc[cut:]
        expected = [
            (gp.TAG,    u'html', []),
            (gp.DATA,   u'A'),
            (gp.DATA,   u'B'),
            (gp.ENDTAG, u'html'),
        ]
        chunked_fp = ChunkedStringIO([head, tail])
        self._test_generator1(gp.generate_tokens(chunked_fp), expected)
def _skip_past_endtag(iterator, tag):
    """Consume tokens from iterator up to and including the </tag> end tag."""
    closing = (ENDTAG, tag)
    for token in iterator:
        if token == closing:
            break


def _read_title(iterator):
    """Collect the character data of a <title> element.

    Tokens are consumed from iterator until one of the stop tokens is seen.
    Returns (text, stop_token); stop_token is None when the token stream
    ran out before any stop token appeared.
    """
    # Only the first two fields are compared: start-tag tokens carry an
    # attribute list as a third field and would never equal a 2-tuple.
    stops = (
        (ENDTAG, 'title'),  # only </title> is valid
        (ENDTAG, 'head'),   # in case no </title>
        (TAG, 'body'),      # in case no </title>
    )
    parts = []
    for token in iterator:
        if token[0] == DATA:
            parts.append(token[1])
        elif token[:2] in stops:
            return ''.join(parts), token
    return ''.join(parts), None


def _out_input(out, attrs):
    """Write a plain-text stand-in for an <input> element to out."""
    itype = _getvalue(attrs, 'type')
    if itype == 'checkbox':
        if _hasattr(attrs, 'checked'):
            out.out('[*] ')
        else:
            out.out('[ ] ')
    elif itype == 'radio':
        if _hasattr(attrs, 'checked'):
            out.out('(*) ')
        else:
            out.out('( ) ')
    elif itype == 'image':
        alt = _getvalue(attrs, 'alt') or _getvalue(attrs, 'value')
        out.outAlt(saxutils.unescape(alt))
    elif itype == 'password':
        # never echo the value of a password field
        out.outAlt('***')
    elif itype != 'hidden':
        # every other input type shows its value; hidden inputs show nothing
        value = _getvalue(attrs, 'value')
        out.outAlt(saxutils.unescape(value))


def process(fp, out, meta):
    """Tokenize the HTML read from fp and write its text rendering to out.

    fp is handed to generator_parser.generate_tokens().  out receives the
    output through out.out(), out.outTag(), out.outAlt(), out.notifyHtml()
    and out.outHeader(meta), and is finished with out.close(meta).  meta is
    a dict updated in place with 'title', 'description' and 'keywords' when
    the document supplies them.

    Return (has_html, has_frameset, has_common_tag):
      has_html       - an <html> start tag was seen
      has_frameset   - a <frameset> start tag was seen
      has_common_tag - some start tag had a positive id in starttag_dict
    """
    has_html = False
    has_frameset = False
    has_common_tag = False
    first_td = False        # state for iterating td inside tr

    iterator = generator_parser.generate_tokens(fp)

    # General HTML format
    #   <html>
    #     <head>
    #     <body>
    #
    # However all elements are optional.
    # It is better to use a flat, stateless loop to process elements
    for token in iterator:
        kind = token[0]

        if kind == DATA:
            out.out(token[1])

        elif kind == TAG:
            tag = token[1]
            tag_id = starttag_dict.get(tag, -1)
            if tag_id > 0:
                has_common_tag = True

            if tag_id == sOUTP:
                out.outTag('p')
            elif tag_id == sOUTTAG:
                out.outTag(tag)
            elif tag_id == sTR:
                first_td = True
            elif tag_id == sTDTH:
                # separate cells of a row with a blank, but not before the first
                if first_td:
                    first_td = False
                else:
                    out.out(' ')
            elif tag_id == sINPUT:
                _out_input(out, token[2])
            elif tag_id == sIMG:
                alt = _getvalue(token[2], 'alt')
                if alt:
                    out.outAlt(saxutils.unescape(alt))
            elif tag_id == sHTML:
                has_html = True
                out.notifyHtml()
            elif tag_id == sBODY:
                out.outHeader(meta)
            elif tag_id == sFRAMESET:
                has_frameset = True
            elif tag_id == sTITLE:
                title, stop_token = _read_title(iterator)
                meta['title'] = _collapse(title)
                if stop_token is not None and stop_token[:2] == (TAG, 'body'):
                    # The title scan ran into <body> and consumed it, so
                    # give that tag its normal start-tag handling here.
                    body_id = starttag_dict.get(stop_token[1], -1)
                    if body_id > 0:
                        has_common_tag = True
                    if body_id == sBODY:
                        out.outHeader(meta)
            elif tag_id == sMETA:
                attrs = token[2]
                name = _getvalue(attrs, 'name').lower()
                content = _getvalue(attrs, 'content')
                if name == 'description':
                    meta['description'] = saxutils.unescape(_collapse(content))
                elif name == 'keywords':
                    meta['keywords'] = saxutils.unescape(_collapse(content))
            elif tag_id == sSCRIPT:
                _skip_past_endtag(iterator, 'script')
            elif tag_id == sSTYLE:
                _skip_past_endtag(iterator, 'style')
            elif tag_id == sSELECT:
                _skip_past_endtag(iterator, 'select')

        elif kind == ENDTAG:
            tag = token[1]
            end_id = endtag_dict.get(tag, -1)
            if end_id == eCLOSE_TAG:
                out.outTag('/' + tag)
            elif end_id == eBREAK_LINE:
                out.outTag('br')

    out.close(meta)
    return has_html, has_frameset, has_common_tag
def _test_generator(self, doc, expect):
    """Tokenize doc from an in-memory file and check the tokens against expect."""
    source = StringIO.StringIO(doc)
    self._test_generator1(gp.generate_tokens(source), expect)