def __call__(self):
    """Render ``self.orig`` as HTML-escaped, linkified markup.

    Steps, in order:

    1. Treat a ``None`` original as the empty string and decode it with
       ``safe_decode`` (undecodable bytes are replaced).
    2. Escape ``&`` first, then replace every character that has a named
       HTML entity in ``name2codepoint`` with ``&name;``.
    3. Run the URL, e-mail and indentation regexps, delegating the
       replacement text to ``replaceURL``, ``replaceEmail`` and
       ``indentWhitespace``.
    4. Normalise Windows line endings and turn newlines into ``<br />``.

    Returns:
        The resulting markup encoded as UTF-8.
    """
    text = self.orig
    if text is None:
        text = ''
    text = safe_decode(text, errors='replace')

    # The ampersand has to be escaped before any other entity is
    # inserted; otherwise the '&' that introduces e.g. '&lt;' would be
    # escaped a second time and '<' would end up as '&amp;lt;'.
    text = text.replace('&', '&amp;')

    # Make funny characters into html entity defs.  'amp' is skipped
    # because it was handled above.
    for entity, codepoint in name2codepoint.items():
        if entity != 'amp':
            text = text.replace(unichr(codepoint), '&' + entity + ';')

    text = self.urlRegexp.sub(self.replaceURL, text)
    text = self.emailRegexp.sub(self.replaceEmail, text)
    text = self.indentRegexp.sub(self.indentWhitespace, text)

    # convert windows line endings
    text = text.replace('\r\n', '\n')
    # Finally, make \n's into br's
    text = text.replace('\n', '<br />')

    return text.encode('utf-8')
def get_xml_tree(html_string):
    """Parse ``html_string`` with ElementTree and return the root element.

    The first bare ``<!DOCTYPE html>`` declaration is replaced by one whose
    internal subset declares every name in ``name2codepoint`` as a numeric
    character reference, so named HTML entities are accepted by the XML
    parser.  If anything fails, the markup is passed to ``dump_html`` and
    the exception is re-raised unchanged.
    """
    try:
        declarations = '\n'.join(
            ' <!ENTITY {} "&#{};">'.format(name, codepoint)
            for name, codepoint in name2codepoint.items()
        )
        doctype = "<!DOCTYPE html [\n{}\n]>".format(declarations)
        html_string = html_string.replace("<!DOCTYPE html>", doctype, 1)
        root = ET.fromstring(html_string)
    except:  # noqa FIXME: figure out what we expect this to throw.
        dump_html(html_string)
        raise
    return root
'uuml': u'\xfc', 'weierp': u'\u2118', 'Xi': u'\u039e', 'xi': u'\u03be', 'Yacute': u'\xdd', 'yacute': u'\xfd', 'yen': u'\xa5', 'yuml': u'\xff', 'Yuml': u'\u0178', 'Zeta': u'\u0396', 'zeta': u'\u03b6', 'zwj': u'\u200d', 'zwnj': u'\u200c', } known_entities = dict([(k, uniChr(v)) for k, v in name2codepoint.items()]) for k in greeks: if k not in known_entities: known_entities[k] = greeks[k] f = isPy3 and asBytes or asUnicode K = list(known_entities.keys()) for k in K: known_entities[f(k)] = known_entities[k] del k, f, K #------------------------------------------------------------------------ class ParaFrag(ABag): """class ParaFrag contains the intermediate representation of string segments as they are being parsed by the ParaParser. fontname, fontSize, rise, textColor, cbDefn
return "{0}(style={1})".format(self.__class__.__name__, self.style) @property def width(self): raise NotImplementedError def height(self, document): raise NotImplementedError def render(self): raise NotImplementedError NAME2CHAR = { name: chr(codepoint) for name, codepoint in name2codepoint.items() } class StyledText(InlineStyled, AcceptNoneAttributeType): """Base class for text that has a :class:`TextStyle` associated with it.""" def __add__(self, other): """Return the concatenation of this styled text and `other`. If `other` is `None`, this styled text itself is returned.""" return MixedStyledText([self, other]) if other is not None else self def __radd__(self, other): """Return the concatenation of `other` and this styled text. If `other` is `None`, this styled text itself is returned.""" return MixedStyledText([other, self]) if other is not None else self
"""This module defines a formatter for [HTML](../../formats/html.md).""" from html.entities import name2codepoint import numpy as np from .base import BaseFormatter HTML_ENTITY_MAP: dict = { chr(value): key for (key, value) in name2codepoint.items() } """A dictionary mapping unicode characters to their equivalent HTML entities.""" HTML_ENTITY_MAP[" "] = "nbsp;" class HtmlFormatter(BaseFormatter): """ A formatter for [HTML](../../formats/html.md). Inherits [`BaseFormatter`][picharsso.format.base.BaseFormatter]. """ @staticmethod def color(text, color): return f'<span style="color : rgb{tuple(color)};">{text}</span>' @staticmethod def translate(text_matrix): unique_chars = np.unique(text_matrix)
# # tweetokenize: Regular expression based tokenizer for Twitter # Copyright: (c) 2013, Jared Suttles. All rights reserved. # License: BSD, see LICENSE for details. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - import re from os import path #from itertools import imap try: from itertools import imap except ImportError: # Python 3... imap=map from html.entities import name2codepoint html_entities = {k: chr(v) for k, v in name2codepoint.items()} html_entities_re = re.compile(r"&#?\w+;") emoji_ranges = ((u'\U0001f300', u'\U0001f5ff'), (u'\U0001f600', u'\U0001f64f'), (u'\U0001f680', u'\U0001f6c5'), (u'\u2600', u'\u26ff'), (u'\U0001f170', u'\U0001f19a')) emoji_flags = {u'\U0001f1ef\U0001f1f5', u'\U0001f1f0\U0001f1f7', u'\U0001f1e9\U0001f1ea', u'\U0001f1e8\U0001f1f3', u'\U0001f1fa\U0001f1f8', u'\U0001f1eb\U0001f1f7', u'\U0001f1ea\U0001f1f8', u'\U0001f1ee\U0001f1f9', u'\U0001f1f7\U0001f1fa', u'\U0001f1ec\U0001f1e7'} def _converthtmlentities(msg): def replace_entities(s): s = s.group(0)[1:-1] # remove & and ; if s[0] == '#': try: return chr(int(s[2:],16) if s[1] in 'xX' else int(s[1:]))
'uuml': u'\xfc', 'weierp': u'\u2118', 'Xi': u'\u039e', 'xi': u'\u03be', 'Yacute': u'\xdd', 'yacute': u'\xfd', 'yen': u'\xa5', 'yuml': u'\xff', 'Yuml': u'\u0178', 'Zeta': u'\u0396', 'zeta': u'\u03b6', 'zwj': u'\u200d', 'zwnj': u'\u200c', } known_entities = dict([(k,uniChr(v)) for k,v in name2codepoint.items()]) for k in greeks: if k not in known_entities: known_entities[k] = greeks[k] f = isPy3 and asBytes or asUnicode K = list(known_entities.keys()) for k in K: known_entities[f(k)] = known_entities[k] del k, f, K #------------------------------------------------------------------------ class ParaFrag(ABag): """class ParaFrag contains the intermediate representation of string segments as they are being parsed by the ParaParser. fontname, fontSize, rise, textColor, cbDefn """
def convertHtmlToWebIntelligentPlainText(orig):
    """Converts text/html to text/x-web-intelligent.

    Block-level tags are turned into newlines, tabs and indentation, all
    remaining tags are stripped, and entities are resolved: ``&nbsp;``
    becomes a space, other named entities become numeric character
    references, and ``&lt;``/``&gt;``/``&amp;`` are decoded to literal
    characters once tag stripping is finished.  The contents of ``<pre>``
    sections are set aside first and restored untouched at the end.

    Args:
        orig: The HTML source, or ``None`` (treated as an empty string).

    Returns:
        The plain-text rendering of ``orig``.
    """
    preRegex = re.compile(r'<\s*pre[^>]*>(.*?)<\s*/pre\s*>', re.I | re.S)

    tagWhitespaceRegex = re.compile(r'\s+((<[^>]+>)\s+)+')
    whitespaceRegex = re.compile(r'\s+')

    tdRegex = re.compile(r'<\s*(td)([^>])*>', re.I)
    breakRegex = re.compile(r'<\s*(br)\s*/?>', re.I)
    startBlockRegex = re.compile(r'<\s*(dt)[^>]*>', re.I)
    endBlockRegex = re.compile(r'<\s*/\s*(p|div|tr|ul|ol|dl)[^>]*>', re.I)
    indentBlockRegex = re.compile(r'<\s*(blockquote|dd)[^>]*>', re.I)
    listBlockRegex = re.compile(r'<\s*(li)[^>]*>', re.I)

    tagRegex = re.compile(r'<[^>]+>', re.I | re.M)

    # Save all <pre> sections and restore after other transforms
    preSections = {}

    def savePres(match):
        marker = '__pre_marker__%d__' % len(preSections)
        preSections[marker] = match.group(1)
        return marker

    if orig is None:
        orig = ''
    text = preRegex.sub(savePres, orig)

    def fixTagWhitespace(match):
        """Make whitespace-tag-whitespace into whitespace-tag.
        Repeat this in case there are directly nested tags.
        """
        # Remove any superfluous whitespace, but preserve one leading space
        return ' ' + whitespaceRegex.sub('', match.group(0))

    text = tagWhitespaceRegex.sub(fixTagWhitespace, text)

    # Make all whitespace into a single space
    text = whitespaceRegex.sub(' ', text)

    # Fix entities: a non-breaking space is rendered as a plain space
    text = text.replace('&nbsp;', ' ')
    for entity, codepoint in name2codepoint.items():
        # Do < and > later, else we may be creating what looks like
        # tags
        if entity != 'lt' and entity != 'gt' and entity != 'amp':
            text = text.replace(
                '&' + entity + ';', '&#' + str(codepoint) + ';'
            )

    # XXX: Remove <head>, <script>, <style> ?

    # Make tabs out of td's
    text = tdRegex.sub('\t', text)

    # Make br's and li's into newlines
    text = breakRegex.sub('\n', text)

    # Make the start of list blocks into paragraphs
    text = startBlockRegex.sub('\n\n', text)

    # Make the close of p's, div's and tr's into paragraphs
    text = endBlockRegex.sub('\n\n', text)

    # Make blockquotes and dd blocks indented
    text = indentBlockRegex.sub('\n\n ', text)

    # Make list items indented and prefixed with -
    text = listBlockRegex.sub('\n\n - ', text)

    # Remove other tags
    text = tagRegex.sub('', text)

    # Fix < and > entities.  This happens only after tag stripping so the
    # decoded characters cannot be mistaken for markup, and '&amp;' goes
    # last so that '&amp;lt;' yields the literal text '&lt;', not '<'.
    text = text.replace('&lt;', '<')
    text = text.replace('&gt;', '>')
    text = text.replace('&amp;', '&')

    # Restore pres
    for marker, section in preSections.items():
        text = text.replace(marker, '\n\n' + section + '\n\n')

    return text
#!/usr/bin/env python # # tweetokenize: Regular expression based tokenizer for Twitter # Copyright: (c) 2013, Jared Suttles. All rights reserved. # License: BSD, see LICENSE for details. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - import re from os import path from html.entities import name2codepoint html_entities = {k: chr(v) for k, v in name2codepoint.items()} html_entities_re = re.compile(r"&#?\w+;") emoji_ranges = (('\U0001f300', '\U0001f5ff'), ('\U0001f600', '\U0001f64f'), ('\U0001f680', '\U0001f6c5'), ('\u2600', '\u26ff'), ('\U0001f170', '\U0001f19a')) emoji_flags = {'\U0001f1ef\U0001f1f5', '\U0001f1f0\U0001f1f7', '\U0001f1e9\U0001f1ea', '\U0001f1e8\U0001f1f3', '\U0001f1fa\U0001f1f8', '\U0001f1eb\U0001f1f7', '\U0001f1ea\U0001f1f8', '\U0001f1ee\U0001f1f9', '\U0001f1f7\U0001f1fa', '\U0001f1ec\U0001f1e7'} def _converthtmlentities(msg): def replace_entities(s): s = s.group(0)[1:-1] # remove & and ; if s[0] == '#': try: return chr(int(s[2:],16) if s[1] in 'xX' else int(s[1:])) except ValueError: return '&#' + s + ';' else: try: return html_entities[s]
def __repr__(self): return "{0}(style={1})".format(self.__class__.__name__, self.style) @property def width(self): raise NotImplementedError def height(self, document): raise NotImplementedError def render(self): raise NotImplementedError NAME2CHAR = {name: chr(codepoint) for name, codepoint in name2codepoint.items()} class StyledText(Styled, AcceptNoneAttributeType): """Base class for text that has a :class:`TextStyle` associated with it.""" style_class = TextStyle def __add__(self, other): """Return the concatenation of this styled text and `other`. If `other` is `None`, this styled text itself is returned.""" return MixedStyledText([self, other]) if other is not None else self def __radd__(self, other): """Return the concatenation of `other` and this styled text. If `other` is `None`, this styled text itself is returned."""