def get_xml_tree(html_string):
    """Parse ``html_string`` with ElementTree and return the result.

    Before parsing, every name in ``name2codepoint`` is registered in the
    parser's entity table, mapped to its character, so named HTML entities
    in the markup have a replacement available.

    If anything in setup or parsing raises, the input is handed to
    ``dump_html`` and the original exception is re-raised unchanged.
    """
    try:
        xml_parser = ET.XMLParser()
        # Tell expat to behave as if a foreign DTD were present; presumably
        # this is what lets undeclared entities fall through to the
        # ``entity`` table below instead of being rejected -- TODO confirm.
        xml_parser.parser.UseForeignDTD(True)
        for entity_name, codepoint in name2codepoint.items():
            xml_parser.entity[entity_name] = unichr(codepoint)
        root = ET.fromstring(html_string, parser=xml_parser)
    except:  # noqa FIXME: figure out what we expect this to throw.
        dump_html(html_string)
        raise
    return root
def set_encodedContent(self, value):
    """
    Normalise incoming content and store it as clean XHTML.

    Every named HTML entity listed in ``name2codepoint`` is replaced by its
    character, then the text is fed through ``ClozeHTMLParser``.  The
    parser's ``result`` (a sequence of ``(shown, hidden)`` pairs) is kept in
    ``self.parts``; ``self._encodedContent`` is those pairs joined back
    together with each non-empty hidden part wrapped as `` <u>...</u> ``.
    """
    for entity_name, codepoint in name2codepoint.items():
        value = value.replace("&%s;" % entity_name, unichr(codepoint))

    html_parser = ClozeHTMLParser()
    html_parser.feed(value)
    html_parser.close()
    self.parts = html_parser.result

    # Collect the pieces and join once rather than growing a string.
    fragments = []
    for shown, hidden in html_parser.result:
        fragments.append(shown)
        if hidden:
            fragments.extend((" <u>", hidden, "</u> "))
    self._encodedContent = "".join(fragments)
def set_encodedContent(self, value):
    """
    Normalise incoming content and store it as clean XHTML.

    Named HTML entities from ``name2codepoint`` are replaced by their
    characters and double-quoted ``font-family`` values are unquoted before
    the text is fed through ``ClozeHTMLParser``.  The parser's ``result``
    (a sequence of ``(shown, hidden)`` pairs) is kept in ``self.parts``;
    ``self._encodedContent`` is those pairs joined back together with each
    non-empty hidden part wrapped as `` <u>...</u> ``.
    """
    for entity_name, codepoint in name2codepoint.items():
        value = value.replace('&%s;' % entity_name, unichr(codepoint))

    # Drop the double quotes around a quoted font-family value.
    value = re.sub(r'font-family:\s*"([^"]+)"', r'font-family: \1', value)

    html_parser = ClozeHTMLParser()
    html_parser.feed(value)
    html_parser.close()
    self.parts = html_parser.result

    # Collect the pieces and join once rather than growing a string.
    fragments = []
    for shown, hidden in html_parser.result:
        fragments.append(shown)
        if hidden:
            fragments.extend((' <u>', hidden, '</u> '))
    self._encodedContent = ''.join(fragments)
def set_encodedContent(self, value):
    """
    Normalise incoming content and store it as clean XHTML.

    Named HTML entities from ``name2codepoint`` are replaced by their
    characters and double-quoted ``font-family`` values are unquoted before
    the text is fed through ``ListaHTMLParser``.  The parser's ``result``
    (a sequence of ``(shown, hidden)`` pairs) is kept in ``self.parts``;
    ``self._encodedContent`` is those pairs joined back together with each
    non-empty hidden part wrapped as `` <u>...</u> ``.
    """
    for entity_name, codepoint in name2codepoint.items():
        value = value.replace('&%s;' % entity_name, unichr(codepoint))

    # workaround for Microsoft Word which incorrectly quotes font names
    value = re.sub(r'font-family:\s*"([^"]+)"', r'font-family: \1', value)

    html_parser = ListaHTMLParser()
    html_parser.feed(value)
    html_parser.close()
    self.parts = html_parser.result

    # Collect the pieces and join once rather than growing a string.
    fragments = []
    for shown, hidden in html_parser.result:
        fragments.append(shown)
        if hidden:
            fragments.extend((' <u>', hidden, '</u> '))
    self._encodedContent = ''.join(fragments)
MASTER_TABLE_NAME = "sesql_index"

# Type map, associating Django classes to SeSQL tables
TYPE_MAP = (
    (models.Map, "sesql_default"),
)

# Additional indexes to create
CROSS_INDEXES = ()

#
# Cleanup configuration
#

from htmlentitydefs import name2codepoint
from xml.sax import saxutils

# Maps each named entity reference ("&name;") to its character, encoded
# with config.CHARSET.
html_entities = dict(
    ('&%s;' % name, unichr(codepoint).encode(config.CHARSET))
    for name, codepoint in name2codepoint.items()
)


def _unescape_html_entities(value):
    """Unescape ``value`` via saxutils, also expanding named HTML entities."""
    return saxutils.unescape(value, html_entities)


ADDITIONAL_CLEANUP_FUNCTION = _unescape_html_entities

#
# Query configuration
#

# General condition to skip indexing content
SKIP_CONDITION = None

# Default sort order for queries
DEFAULT_ORDER = ('-modified_at',)

# Default LIMIT in short queries
DEFAULT_LIMIT = 20
except ImportError: # Python 3... imap = map unichr = chr str = str unicode = str bytes = bytes basestring = (str, bytes) try: from htmlentitydefs import name2codepoint except ImportError: # Python 3... from html.entities import name2codepoint html_entities = {k: unichr(v) for k, v in name2codepoint.items()} html_entities_re = re.compile(r"&#?\w+;") emoji_ranges = ((u'\U0001f300', u'\U0001f5ff'), (u'\U0001f600', u'\U0001f64f'), (u'\U0001f680', u'\U0001f6c5'), (u'\u2600', u'\u26ff'), (u'\U0001f170', u'\U0001f19a')) def _converthtmlentities(msg): def replace_entities(s): s = s.group(0)[1:-1] # remove & and ; if s[0] == '#': try: return unichr(int(s[2:], 16) if s[1] in 'xX' else int(s[1:])) except ValueError: return '&#' + s + ';' else:
ascii_white_hexent = ''.join('&#x{:x};'.format(ord(_)) for _ in string.whitespace) x00_1f_decent = chars['�_1f'] x00_1f_hexent = chars['�_1f'] x00_1f = chars.x00_1f x80_9f_decent = chars['P_9f'] x80_9f_hexent = chars['€_9f'] x80_9f = chars.x80_9f xa0_ff_decent = chars['&#a0_ff'] xa0_ff_hexent = chars[' _ff'] xa0_ff = chars.xa0_ff known_entities_ref, known_entities = map( ''.join, zip(*[ ('&{};'.format(k), unichr(v)) for k, v in name2codepoint.items() if k not in {'lang', 'rang'} # Python maps have an error here ])) maxunicodeoverflow_dechex = '&#{0};&#x{0:x};'.format(sys.maxunicode + 1) int32t_dechex = '&#{0};&#x{0:x};'.format((2 << 30) - 1) int32t_overflow_dechex = '&#{0};&#x{0:x};'.format(2 << 30) data_decode_map = ( ('ascii_letters_decent', ascii_letters_decent, ascii_letters), ('ascii_letters_hexent', ascii_letters_hexent, ascii_letters), ('ascii_digit_decent', ascii_digit_decent, string.digits), ('ascii_digit_hexent', ascii_digit_hexent, string.digits), ('ascii_punct_decent', ascii_punct_decent, string.punctuation), ('ascii_punct_hexent', ascii_punct_hexent, string.punctuation), ('ascii_white_decent', ascii_white_decent, string.whitespace), ('ascii_white_hexent', ascii_white_hexent, string.whitespace),
MASTER_TABLE_NAME = "sesql_index"

# Type map, associating Django classes to SeSQL tables
TYPE_MAP = (
    (models.Map, "sesql_default"),
)

# Additional indexes to create
CROSS_INDEXES = ()

#
# Cleanup configuration
#

from htmlentitydefs import name2codepoint
from xml.sax import saxutils

# Maps each named entity reference ("&name;") to its character, encoded
# with CHARSET (expected to be defined earlier in this module -- not
# visible from here).
html_entities = dict(
    ('&%s;' % name, unichr(codepoint).encode(CHARSET))
    for name, codepoint in name2codepoint.items()
)


def _unescape_html_entities(value):
    """Unescape ``value`` via saxutils, also expanding named HTML entities."""
    return saxutils.unescape(value, html_entities)


ADDITIONAL_CLEANUP_FUNCTION = _unescape_html_entities

#
# Query configuration
#

# General condition to skip indexing content
SKIP_CONDITION = None

# Default sort order for queries
DEFAULT_ORDER = ('-modified_at',)

# Default LIMIT in short queries
DEFAULT_LIMIT = 20