Example #1
0
def get_xml_tree(html_string):
    """Parse ``html_string`` and return the result of ``ET.fromstring``.

    The parser is switched to foreign-DTD mode and its entity table is
    filled with every name from ``name2codepoint`` mapped to the matching
    character.  If anything in the setup or the parse raises, the input is
    handed to ``dump_html`` and the original exception is re-raised.
    """
    try:
        xml_parser = ET.XMLParser()
        xml_parser.parser.UseForeignDTD(True)
        # Register the named HTML entities one by one on the parser.
        for entity_name, codepoint in name2codepoint.items():
            xml_parser.entity[entity_name] = unichr(codepoint)
        return ET.fromstring(html_string, parser=xml_parser)
    except:  # noqa FIXME: figure out what we expect this to throw.
        # Dump the offending markup, then let the caller see the error.
        dump_html(html_string)
        raise
Example #2
0
 def set_encodedContent(self, value):
     """
     Normalise incoming content and store it as clean XHTML.

     Named HTML entities in ``value`` are replaced by their characters and
     the text is fed through ``ClozeHTMLParser``.  The parser result is
     kept in ``self.parts``; the rebuilt markup, with every non-empty
     hidden part wrapped in ``<u>`` tags, goes to ``self._encodedContent``.
     """
     # NOTE(review): replacements run one entity at a time, so text produced
     # by an earlier replacement can be matched again by a later one.
     for entity_name, codepoint in name2codepoint.items():
         entity_ref = "&%s;" % entity_name
         value = value.replace(entity_ref, unichr(codepoint))

     cloze_parser = ClozeHTMLParser()
     cloze_parser.feed(value)
     cloze_parser.close()
     self.parts = cloze_parser.result

     # Collect the output fragments in order and join them once at the end.
     pieces = []
     for shown, hidden in cloze_parser.result:
         pieces.append(shown)
         if hidden:
             pieces.append(" <u>")
             pieces.append(hidden)
             pieces.append("</u> ")
     self._encodedContent = "".join(pieces)
Example #3
0
 def set_encodedContent(self, value):
     """
     Sanitise the content handed in and keep it as clean XHTML.

     Steps: replace named HTML entities with their characters, drop the
     double quotes around ``font-family`` values, parse the text with
     ``ClozeHTMLParser``, remember the parser result in ``self.parts`` and
     store the reassembled markup in ``self._encodedContent`` (non-empty
     hidden parts are wrapped in ``<u>`` tags).
     """
     # NOTE(review): replacements run one entity at a time, so text produced
     # by an earlier replacement can be matched again by a later one.
     for entity_name, codepoint in name2codepoint.items():
         value = value.replace('&%s;' % entity_name, unichr(codepoint))

     # Unquote font names: font-family: "Foo" -> font-family: Foo
     value = re.sub(r'font-family:\s*"([^"]+)"', r'font-family: \1', value)

     html_parser = ClozeHTMLParser()
     html_parser.feed(value)
     html_parser.close()
     self.parts = html_parser.result

     chunks = []
     for shown, hidden in html_parser.result:
         if hidden:
             chunks.extend((shown, ' <u>', hidden, '</u> '))
         else:
             chunks.append(shown)
     self._encodedContent = ''.join(chunks)
Example #4
0
 def set_encodedContent(self, value):
     """
     Clean up the supplied content and store it as tidy XHTML.

     Named HTML entities are expanded, quoted ``font-family`` values are
     unquoted, and the text is parsed by ``ListaHTMLParser``.  The parser
     result is saved in ``self.parts`` and the regenerated markup, with
     each non-empty hidden part wrapped in ``<u>`` tags, is saved in
     ``self._encodedContent``.
     """
     # NOTE(review): replacements run one entity at a time, so text produced
     # by an earlier replacement can be matched again by a later one.
     for entity_name, codepoint in name2codepoint.items():
         value = value.replace('&%s;' % entity_name, unichr(codepoint))
     # workaround for Microsoft Word which incorrectly quotes font names
     value = re.sub(r'font-family:\s*"([^"]+)"', r'font-family: \1', value)

     lista_parser = ListaHTMLParser()
     lista_parser.feed(value)
     lista_parser.close()
     self.parts = lista_parser.result

     def fragments():
         # Yield the output pieces in document order.
         for shown, hidden in lista_parser.result:
             yield shown
             if hidden:
                 yield ' <u>'
                 yield hidden
                 yield '</u> '

     self._encodedContent = ''.join(fragments())
Example #5
0
MASTER_TABLE_NAME = "sesql_index"

# Type map, associating Django classes to SeSQL tables
TYPE_MAP = ((models.Map, "sesql_default"), )

# Additional indexes to create
CROSS_INDEXES = ()

#
# Cleanup configuration
#

from htmlentitydefs import name2codepoint
from xml.sax import saxutils

# Named entity references ("&name;") mapped to their character, encoded with
# config.CHARSET.  A generator expression is used on purpose: under Python 2
# a list comprehension would leave its loop variables behind as attributes
# of this configuration module.
html_entities = dict(
    ('&%s;' % name, unichr(codepoint).encode(config.CHARSET))
    for name, codepoint in name2codepoint.items()
)


def ADDITIONAL_CLEANUP_FUNCTION(value):
    """Return ``value`` with HTML named entities replaced.

    Delegates to ``saxutils.unescape`` with ``html_entities`` as the extra
    replacement table.
    """
    return saxutils.unescape(value, html_entities)


#
# Query configuration
#

# General condition to skip indexing content
SKIP_CONDITION = None

# Default sort order for queries
DEFAULT_ORDER = ('-modified_at',)

# Default LIMIT in short queries
DEFAULT_LIMIT = 20
except ImportError:
    # Python 3...
    imap = map
    unichr = chr
    str = str
    unicode = str
    bytes = bytes
    basestring = (str, bytes)

# The entity table lives in htmlentitydefs on Python 2 and in html.entities
# on Python 3; try the old location first and fall back to the new one.
try:
    from htmlentitydefs import name2codepoint
except ImportError:
    from html.entities import name2codepoint

# Entity name (without "&" and ";") -> the character it stands for.
html_entities = dict(
    (entity_name, unichr(codepoint))
    for entity_name, codepoint in name2codepoint.items()
)

# Matches "&", an optional "#", one or more word characters, then ";".
html_entities_re = re.compile(r"&#?\w+;")

# Inclusive (first, last) character pairs, one range per entry.
emoji_ranges = (
    (u'\U0001f300', u'\U0001f5ff'),
    (u'\U0001f600', u'\U0001f64f'),
    (u'\U0001f680', u'\U0001f6c5'),
    (u'\u2600', u'\u26ff'),
    (u'\U0001f170', u'\U0001f19a'),
)


def _converthtmlentities(msg):
    def replace_entities(s):
        s = s.group(0)[1:-1]  # remove & and ;
        if s[0] == '#':
            try:
                return unichr(int(s[2:], 16) if s[1] in 'xX' else int(s[1:]))
            except ValueError:
                return '&#' + s + ';'
        else:
# Hexadecimal character references ("&#x..;") for every character in
# string.whitespace, concatenated into a single string.
ascii_white_hexent = ''.join('&#x{:x};'.format(ord(_))
                             for _ in string.whitespace)
# Fixtures looked up in `chars`, which is defined outside this view.
# NOTE(review): the key names suggest decimal ("&#") and hexadecimal ("&#x")
# reference forms plus the raw characters for the 0x00-0x1f, 0x80-0x9f and
# 0xa0-0xff ranges -- confirm against the definition of `chars`.
x00_1f_decent = chars['&#00_1f']
x00_1f_hexent = chars['&#x00_1f']
x00_1f = chars.x00_1f
x80_9f_decent = chars['&#80_9f']
x80_9f_hexent = chars['&#x80_9f']
x80_9f = chars.x80_9f
xa0_ff_decent = chars['&#a0_ff']
xa0_ff_hexent = chars['&#xa0_ff']
xa0_ff = chars.xa0_ff

# Two parallel strings built from name2codepoint: all named references
# ("&name;") concatenated, and the matching characters concatenated in the
# same order.  The names 'lang' and 'rang' are left out.
known_entities_ref, known_entities = map(
    ''.join,
    zip(*[
        ('&{};'.format(k), unichr(v)) for k, v in name2codepoint.items()
        if k not in {'lang', 'rang'}  # Python maps have an error here
    ]))

# Decimal and hex references for the code point one past sys.maxunicode.
maxunicodeoverflow_dechex = '&#{0};&#x{0:x};'.format(sys.maxunicode + 1)
# Decimal and hex references for 2**31 - 1.
int32t_dechex = '&#{0};&#x{0:x};'.format((2 << 30) - 1)
# Decimal and hex references for 2**31, one past the value above.
int32t_overflow_dechex = '&#{0};&#x{0:x};'.format(2 << 30)
data_decode_map = (
    ('ascii_letters_decent', ascii_letters_decent, ascii_letters),
    ('ascii_letters_hexent', ascii_letters_hexent, ascii_letters),
    ('ascii_digit_decent', ascii_digit_decent, string.digits),
    ('ascii_digit_hexent', ascii_digit_hexent, string.digits),
    ('ascii_punct_decent', ascii_punct_decent, string.punctuation),
    ('ascii_punct_hexent', ascii_punct_hexent, string.punctuation),
    ('ascii_white_decent', ascii_white_decent, string.whitespace),
    ('ascii_white_hexent', ascii_white_hexent, string.whitespace),
Example #8
0
MASTER_TABLE_NAME = "sesql_index"

# Type map, associating Django classes to SeSQL tables
TYPE_MAP = ((models.Map, "sesql_default"), )

# Additional indexes to create
CROSS_INDEXES = ()

#
# Cleanup configuration
#

from htmlentitydefs import name2codepoint
from xml.sax import saxutils

# Named entity references ("&name;") mapped to their character, encoded with
# CHARSET.  A generator expression is used on purpose: under Python 2 a list
# comprehension would leave its loop variables behind as attributes of this
# configuration module.
html_entities = dict(
    ('&%s;' % name, unichr(codepoint).encode(CHARSET))
    for name, codepoint in name2codepoint.items()
)


def ADDITIONAL_CLEANUP_FUNCTION(value):
    """Return ``value`` with HTML named entities replaced.

    Delegates to ``saxutils.unescape`` with ``html_entities`` as the extra
    replacement table.
    """
    return saxutils.unescape(value, html_entities)


#
# Query configuration
#

# General condition to skip indexing content
SKIP_CONDITION = None

# Default sort order for queries
DEFAULT_ORDER = ('-modified_at',)

# Default LIMIT in short queries
DEFAULT_LIMIT = 20