class Title(utils.web.HtmlToText):
    """Parser that keeps only the text found inside an HTML ``<title>``.

    Two flags track the parser position: ``inTitle`` is set between a
    ``<title>`` start tag and its end tag, ``inSvg`` between ``<svg>`` and
    ``</svg>``.  Text is forwarded to the parent's ``append`` only while
    the parser is inside a title that is not inside an SVG element, so
    the ``<title>`` of an inline SVG never ends up in the result.
    """
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '

    def __init__(self):
        # Flags are set before the parent constructor runs, as in the
        # original ordering.
        self.inTitle = False
        self.inSvg = False
        utils.web.HtmlToText.__init__(self)

    @property
    def inHtmlTitle(self):
        """True while inside a ``<title>`` that is not nested in ``<svg>``."""
        if not self.inTitle:
            return self.inTitle
        return not self.inSvg

    def _markTag(self, tag, entering):
        """Record that *tag* was opened (entering=True) or closed."""
        if tag == 'svg':
            self.inSvg = entering
        elif tag == 'title':
            self.inTitle = entering

    def handle_starttag(self, tag, attrs):
        # Start tags only update the state flags; they add no text.
        self._markTag(tag, True)

    def handle_endtag(self, tag):
        # End tags only update the state flags; they add no text.
        self._markTag(tag, False)

    def append(self, data):
        """Store *data* only while inside the HTML title."""
        if not self.inHtmlTitle:
            return
        super(Title, self).append(data)
class HtmlToText(HTMLParser, object):
    """Taken from some eff-bot code on c.l.p.

    Accumulates the textual content of an HTML document in ``self.data``
    (a list of string fragments).  Every start and end tag is replaced by
    ``tagReplace``; ``getText()`` joins the fragments, strips the result
    and passes it through ``normalizeWhitespace``.
    """
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
    entitydefs['apos'] = '\''

    def __init__(self, tagReplace=' '):
        """Create a parser; *tagReplace* is the text emitted for each tag."""
        self.data = []
        self.tagReplace = tagReplace
        super(HtmlToText, self).__init__()

    def append(self, data):
        """Add one text fragment to the accumulated output."""
        self.data.append(data)

    def handle_starttag(self, tag, attr):
        self.append(self.tagReplace)

    def handle_endtag(self, tag):
        self.append(self.tagReplace)

    def handle_data(self, data):
        self.append(data)

    def handle_entityref(self, data):
        """Append the character for the named entity *data*.

        Names found in ``name2codepoint`` are converted to the matching
        character.  Any other name is appended as-is (decoded first when
        it is a byte string).
        """
        if minisix.PY3:
            if data in name2codepoint:
                self.append(chr(name2codepoint[data]))
            elif isinstance(data, bytes):
                self.append(data.decode())
            else:
                self.append(data)
        else:
            if data in name2codepoint:
                # On Python 2, chr() only accepts range(256) and returns a
                # byte string; most entities need unichr().
                self.append(unichr(name2codepoint[data]))
            elif isinstance(data, str):
                self.append(data.decode('utf8', errors='replace'))
            else:
                self.append(data)

    def getText(self):
        """Return the collected text, stripped and whitespace-normalized."""
        text = ''.join(self.data).strip()
        return normalizeWhitespace(text)

    def handle_charref(self, name):
        """Append the character for the numeric reference ``&#name;``."""
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape()
        # is its replacement.  Fall back to the method where the html
        # module does not provide it (Python 2, Python < 3.4).
        try:
            from html import unescape
        except ImportError:
            unescape = self.unescape
        self.append(unescape('&#%s;' % name))
class Title(HTMLParser):
    """Parser that extracts the text of an HTML ``<title>`` element.

    The text is accumulated in ``self.title``.  ``inTitle`` is set between
    a ``<title>`` start tag and its end tag, ``inSvg`` between ``<svg>``
    and ``</svg>``; text and character references are recorded only while
    inside a title that is not inside an SVG element.
    """
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
    entitydefs['apos'] = '\''

    def __init__(self):
        self.inTitle = False
        self.inSvg = False
        self.title = ''
        HTMLParser.__init__(self)

    @property
    def inHtmlTitle(self):
        """True while inside a ``<title>`` that is not nested in ``<svg>``."""
        return self.inTitle and not self.inSvg

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.inTitle = True
        elif tag == 'svg':
            self.inSvg = True

    def handle_endtag(self, tag):
        if tag == 'title':
            self.inTitle = False
        elif tag == 'svg':
            self.inSvg = False

    def handle_data(self, data):
        if self.inHtmlTitle:
            self.title += data

    def handle_entityref(self, name):
        """Add the replacement for ``&name;``; unknown names are dropped."""
        if self.inHtmlTitle:
            if name in self.entitydefs:
                self.title += self.entitydefs[name]

    def handle_charref(self, name):
        """Add the character for the numeric reference ``&#name;``.

        *name* is either decimal digits (``39``) or ``x``/``X`` followed
        by hexadecimal digits (``x27``).  References that do not map to a
        valid code point are skipped instead of aborting the parse.
        """
        if not self.inHtmlTitle:
            return
        try:
            to_char = unichr  # Python 2: chr() is limited to range(256)
        except NameError:
            to_char = chr
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            char = to_char(codepoint)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: nothing sensible to add.
            return
        self.title += char
if re_names: d[k] = re_names.sub(r"'\1' (qv)", v) if re_titles: d[k] = re_titles.sub(r'_\1_ (qv)', v) if re_characters: d[k] = re_characters.sub(r'#\1# (qv)', v) elif isinstance(v, (list, dict)): _putRefs(d[k], re_titles, re_names, re_characters, lastKey=lastKey) # Handle HTML/XML/SGML entities. entitydefs = entitydefs.copy() entitydefsget = entitydefs.get entitydefs['nbsp'] = ' ' sgmlentity = { 'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\'', 'ndash': '-' } sgmlentityget = sgmlentity.get _sgmlentkeys = list(sgmlentity.keys()) entcharrefs = {}
d[k] = re_names.sub(r"'\1' (qv)", v) if re_titles: d[k] = re_titles.sub(r'_\1_ (qv)', v) if re_characters: d[k] = re_characters.sub(r'#\1# (qv)', v) elif isinstance(v, (list, dict)): _putRefs(d[k], re_titles, re_names, re_characters, lastKey=lastKey) # Handle HTML/XML/SGML entities. try: from html.entities import entitydefs except ImportError: from htmlentitydefs import entitydefs entitydefs = entitydefs.copy() entitydefsget = entitydefs.get entitydefs['nbsp'] = ' ' sgmlentity = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\'', 'ndash': '-'} sgmlentityget = sgmlentity.get _sgmlentkeys = sgmlentity.keys() entcharrefs = {} entcharrefsget = entcharrefs.get for _k, _v in entitydefs.items(): if _k in _sgmlentkeys: continue if _v[0:2] == '&#': dec_code = _v[1:-1] _v = unichr(int(_v[2:-1])) entcharrefs[dec_code] = _v