Example #1
0
class Title(utils.web.HtmlToText):
    """HTML-to-text parser that keeps only the document's <title> text.

    Text is collected while the parser is inside a <title> element and not
    inside an <svg> element, so a <title> nested in an <svg> is ignored.
    """
    # Class-private copy of the entity table with 'nbsp' mapped to a space.
    entitydefs = dict(entitydefs, nbsp=' ')

    def __init__(self):
        # Both flags are set before the base initializer runs.
        self.inTitle = False
        self.inSvg = False
        super(Title, self).__init__()

    @property
    def inHtmlTitle(self):
        """True while inside a <title> that is not within an <svg>."""
        if self.inSvg:
            return False
        return self.inTitle

    def _markTag(self, tag, opened):
        """Record entering (opened=True) or leaving a tracked tag."""
        if tag == 'title':
            self.inTitle = opened
        elif tag == 'svg':
            self.inSvg = opened

    def handle_starttag(self, tag, attrs):
        """Track <title> and <svg>; the base handler is not called."""
        self._markTag(tag, True)

    def handle_endtag(self, tag):
        """Track </title> and </svg>; the base handler is not called."""
        self._markTag(tag, False)

    def append(self, data):
        """Store ``data`` only while inside the HTML title."""
        if not self.inHtmlTitle:
            return
        super(Title, self).append(data)
Example #2
0
class HtmlToText(HTMLParser, object):
    """Strip the markup from an HTML document and keep its text.

    Taken from some eff-bot code on c.l.p.

    Feed the document with ``feed()``; every start and end tag is replaced
    by ``tagReplace`` and all text is accumulated in ``self.data``. Call
    ``getText()`` to obtain the joined, whitespace-normalized result.
    """
    # Class-private copy of the entity table with two overrides.
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
    entitydefs['apos'] = '\''

    def __init__(self, tagReplace=' '):
        """Create a parser.

        tagReplace -- string appended in place of each start/end tag.
        """
        self.data = []
        self.tagReplace = tagReplace
        super(HtmlToText, self).__init__()

    def append(self, data):
        """Add a text fragment to the output buffer."""
        self.data.append(data)

    def handle_starttag(self, tag, attr):
        """Replace an opening tag with ``tagReplace``."""
        self.append(self.tagReplace)

    def handle_endtag(self, tag):
        """Replace a closing tag with ``tagReplace``."""
        self.append(self.tagReplace)

    def handle_data(self, data):
        """Keep text content as-is."""
        self.append(data)

    def handle_entityref(self, data):
        """Append the character for a named reference such as ``&amp;``.

        Names missing from ``name2codepoint`` are appended as the bare
        name, decoded to text first if they arrive as a byte string.
        """
        if minisix.PY3:
            if data in name2codepoint:
                self.append(chr(name2codepoint[data]))
            elif isinstance(data, bytes):
                self.append(data.decode())
            else:
                self.append(data)
        else:
            if data in name2codepoint:
                # NOTE(review): on Python 2, chr() only accepts values
                # below 256, so entities with larger code points would
                # raise ValueError here -- confirm whether unichr() was
                # intended.
                self.append(chr(name2codepoint[data]))
            elif isinstance(data, str):
                self.append(data.decode('utf8', errors='replace'))
            else:
                self.append(data)

    def getText(self):
        """Return the collected text, stripped and whitespace-normalized."""
        text = ''.join(self.data).strip()
        return normalizeWhitespace(text)

    def handle_charref(self, name):
        """Append the character for a numeric reference such as ``&#65;``.

        ``name`` is the reference without ``&#`` and ``;`` (for example
        ``'65'`` or ``'x41'``).
        """
        reference = '&#%s;' % name
        try:
            # HTMLParser.unescape() was removed in Python 3.9; the
            # module-level html.unescape() is its replacement.
            from html import unescape
        except ImportError:
            # Python 2 has no html module; use the parser's own method.
            unescape = self.unescape
        self.append(unescape(reference))
Example #3
0
class Title(HTMLParser):
    """Extract the text of an HTML document's <title> element.

    Feed the document with ``feed()``; the text found inside <title> is
    accumulated in ``self.title``. Text is ignored while the parser is
    inside an <svg> element, so a <title> nested in an <svg> does not
    contribute.
    """
    # Class-private copy of the entity table with two overrides.
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
    entitydefs['apos'] = '\''

    def __init__(self):
        self.inTitle = False
        self.inSvg = False
        self.title = ''
        HTMLParser.__init__(self)

    @property
    def inHtmlTitle(self):
        """True while inside a <title> that is not within an <svg>."""
        return self.inTitle and not self.inSvg

    def handle_starttag(self, tag, attrs):
        """Note when a <title> or <svg> element is entered."""
        if tag == 'title':
            self.inTitle = True
        elif tag == 'svg':
            self.inSvg = True

    def handle_endtag(self, tag):
        """Note when a <title> or <svg> element is left."""
        if tag == 'title':
            self.inTitle = False
        elif tag == 'svg':
            self.inSvg = False

    def handle_data(self, data):
        """Append text content that belongs to the HTML title."""
        if self.inHtmlTitle:
            self.title += data

    def handle_entityref(self, name):
        """Append the value of a named reference such as ``&amp;``.

        Names that are not in ``entitydefs`` are dropped.
        """
        if self.inHtmlTitle:
            if name in self.entitydefs:
                self.title += self.entitydefs[name]

    def handle_charref(self, name):
        """Append the character for a numeric reference.

        ``name`` is the reference without ``&#`` and ``;``. Both decimal
        (``'65'``) and hexadecimal (``'x41'`` / ``'X41'``) forms are
        accepted. References that do not name a valid code point are
        skipped rather than aborting the parse.
        """
        if not self.inHtmlTitle:
            return
        try:
            toChar = unichr  # Python 2: chr() is limited to values < 256
        except NameError:
            toChar = chr
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            self.title += toChar(codepoint)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: leave the title as is.
            pass
Example #4
0
                    if re_names:
                        d[k] = re_names.sub(r"'\1' (qv)", v)
                    if re_titles:
                        d[k] = re_titles.sub(r'_\1_ (qv)', v)
                    if re_characters:
                        d[k] = re_characters.sub(r'#\1# (qv)', v)
            elif isinstance(v, (list, dict)):
                _putRefs(d[k],
                         re_titles,
                         re_names,
                         re_characters,
                         lastKey=lastKey)


# Lookup tables for HTML/XML/SGML entities.
# Work on a private copy so the 'nbsp' override does not touch the
# mapping it was copied from.
entitydefs = dict(entitydefs)
entitydefs['nbsp'] = ' '
entitydefsget = entitydefs.get

# Small table of entity names mapped to plain ASCII characters.
sgmlentity = {
    'lt': '<',
    'gt': '>',
    'amp': '&',
    'quot': '"',
    'apos': "'",
    'ndash': '-',
}
sgmlentityget = sgmlentity.get
_sgmlentkeys = list(sgmlentity)

entcharrefs = dict()
Example #5
0
                        d[k] = re_names.sub(r"'\1' (qv)", v)
                    if re_titles:
                        d[k] = re_titles.sub(r'_\1_ (qv)', v)
                    if re_characters:
                        d[k] = re_characters.sub(r'#\1# (qv)', v)
            elif isinstance(v, (list, dict)):
                _putRefs(d[k], re_titles, re_names, re_characters,
                        lastKey=lastKey)


# Handle HTML/XML/SGML entities.
try:
    from html.entities import entitydefs
except ImportError:
    from htmlentitydefs import entitydefs
# Work on a private copy so the 'nbsp' override does not touch the
# imported mapping.
entitydefs = dict(entitydefs)
entitydefs['nbsp'] = ' '
entitydefsget = entitydefs.get

# Small table of entity names mapped to plain ASCII characters; these
# names are skipped when entcharrefs is populated.
sgmlentity = {
    'lt': '<',
    'gt': '>',
    'amp': '&',
    'quot': '"',
    'apos': "'",
    'ndash': '-',
}
_sgmlentkeys = sgmlentity.keys()
sgmlentityget = sgmlentity.get

# Character-reference table, empty at this point.
entcharrefs = {}
entcharrefsget = entcharrefs.get
# Fill entcharrefs from the entity table.
for _k, _v in entitydefs.items():
    # Names that also appear in sgmlentity are not processed here.
    if _k in _sgmlentkeys: continue
    # Value is itself a numeric reference, presumably of the form '&#NNN;'.
    if _v[0:2] == '&#':
        # Drop the leading '&' and the final character: '#' plus the digits.
        dec_code = _v[1:-1]
        # NOTE(review): unichr is a Python 2 builtin; under the Python 3
        # import path above it must be provided elsewhere -- confirm.
        _v = unichr(int(_v[2:-1]))
        entcharrefs[dec_code] = _v