Python UnescapeHtml Examples, grit.util.UnescapeHtml Python Examples

Example #1

0

Show file

 def testEscapeUnescaped(self):
   '''Round-trips a string through util.UnescapeHtml and util.EscapeHtml.

   Named entities are expected to be decoded to their characters, and
   re-escaping is expected to encode only the markup-significant characters
   (ampersand, quotes, angle brackets) while leaving the copyright sign and
   the non-breaking space as literal characters.
   '''
   text = '&copy;&nbsp; & &quot;&lt;hello&gt;&quot;'
   unescaped = util.UnescapeHtml(text)
   # assertEqual reports both operands on failure, unlike failUnless(a == b).
   self.assertEqual(unescaped, u'\u00a9\u00a0 & "<hello>"')
   # NOTE(review): the second argument presumably turns on quote escaping
   # (the expected value contains &quot;) -- confirm against util.EscapeHtml.
   escaped_unescaped = util.EscapeHtml(unescaped, True)
   self.assertEqual(escaped_unescaped,
                    u'\u00a9\u00a0 &amp; &quot;&lt;hello&gt;&quot;')

Example #2

0

Show file

File: tr_html.py Project: 2002a1116/naiveproxy

def HtmlToMessage(html, include_block_tags=False, description=''):
    '''Takes a bit of HTML, which must contain only "inline" HTML elements,
    and changes it into a tclib.Message.  HTML elements, [REPLACEABLES] and
    non-breaking spaces become placeholders; the text between placeholders is
    passed through util.UnescapeHtml (with replace_nbsp=False).

    If include_block_tags is true, no error will be given if block tags (e.g.
    <p> or <br>) are included in the HTML.  If it is false, a block tag is only
    tolerated when an earlier, not yet consumed, comment matching
    _MESSAGE_NO_BREAK_COMMENT was seen; each such comment excuses one tag.

    Args:
      html: 'Hello <b>[USERNAME]</b>, how&nbsp;<i>are</i> you?'
      include_block_tags: False
      description: passed through unchanged as the message description

    Return:
      A tclib.Message.  Sketch from the original author (illustrative only:
      the names really produced follow the rules in the body -- SPACE for
      non-breaking spaces, X_<NAME>_X for replaceables, BEGIN_/END_ prefixes
      for non-empty elements -- and every placeholder is created with the
      description '(HTML code)'):

      tclib.Message('Hello START_BOLD1USERNAMEEND_BOLD, '
                    'howNBSPSTART_ITALICareEND_ITALIC you?',
                    [ Placeholder('START_BOLD', '<b>', ''),
                      Placeholder('USERNAME', '[USERNAME]', ''),
                      Placeholder('END_BOLD', '</b>', ''),
                      Placeholder('START_ITALIC', '<i>', ''),
                      Placeholder('END_ITALIC', '</i>', ''), ])

    Raises:
      exception.BlockTagInTranslateableChunk: a block-level tag was found,
        include_block_tags is false and no no-break comment excuses it.
    '''
    # Approach is:
    # - first placeholderize, finding <elements>, [REPLACEABLES] and &nbsp;
    # - then unescape all character entities in text in-between placeholders

    # List of strings (for text chunks) and tuples (name closure, original)
    # for placeholders.
    parts = []

    count_names = {}  # Map of placeholder names to number of times used
    end_names = {}  # Map of base names to stack of end tags (for nesting)

    def MakeNameClosure(base, kind=''):
        '''Returns a closure that can be called once all names have been
        allocated to return the final name of the placeholder.  This allows us
        to minimally number placeholders for non-overlap.

        Also ensures that END_XXX_Y placeholders have the same Y as the
        corresponding BEGIN_XXX_Y placeholder when we have nested tags of the
        same type.

        Args:
          base: 'phname'
          kind: '' | 'begin' | 'end'

        Return:
          Closure()
        '''
        name = base.upper()
        if kind != '':
            name = ('%s_%s' % (kind, base)).upper()

        count_names[name] = count_names.get(name, 0) + 1

        # The defaults freeze this occurrence's name and 0-based ordinal now;
        # count_names[name] itself keeps growing until the closure is called.
        def MakeFinalName(name_=name, index=count_names[name] - 1):
            if kind.lower() == 'end' and end_names.get(base):
                return end_names[base].pop(-1)  # For correct nesting
            if count_names[name_] != 1:
                name_ = '%s_%s' % (name_, _SUFFIXES[index])
                # We need to use a stack to ensure that the end-tag suffixes
                # match the begin-tag suffixes.  Only needed when more than one
                # tag of the same type.
                if kind == 'begin':
                    end_name = ('END_%s_%s' % (base, _SUFFIXES[index])).upper()
                    end_names.setdefault(base, []).append(end_name)

            return name_

        return MakeFinalName

    current = 0
    last_nobreak = False  # True while a no-break comment is still unconsumed

    while current < len(html):
        # Every pattern is tried against the same unconsumed tail, so slice
        # it once per iteration.
        remainder = html[current:]

        m = _MESSAGE_NO_BREAK_COMMENT.match(remainder)
        if m:
            last_nobreak = True
            current += m.end()
            continue

        m = _NBSP.match(remainder)
        if m:
            parts.append((MakeNameClosure('SPACE'), m.group()))
            current += m.end()
            continue

        m = _REPLACEABLE.match(remainder)
        if m:
            # Replaceables allow - but placeholders don't, so replace - with _
            ph_name = MakeNameClosure('X_%s_X' %
                                      m.group('name').replace('-', '_'))
            parts.append((ph_name, m.group()))
            current += m.end()
            continue

        m = _SPECIAL_ELEMENT.match(remainder)
        if m:
            if not include_block_tags:
                if last_nobreak:
                    last_nobreak = False
                else:
                    raise exception.BlockTagInTranslateableChunk(html)
            element_name = 'block'  # for simplification
            # Get the appropriate group name: the first named group that
            # matched something non-empty (or the last one iterated if none).
            groups = m.groupdict()
            for group in groups:
                if groups[group]:
                    break
            # Markup before the group, the group's text, markup after it.
            parts.append((MakeNameClosure(element_name, 'begin'),
                          html[current:current + m.start(group)]))
            parts.append(m.group(group))
            parts.append((MakeNameClosure(element_name, 'end'),
                          html[current + m.end(group):current + m.end()]))
            current += m.end()
            continue

        m = _ELEMENT.match(remainder)
        if m:
            element_name = m.group('element').lower()
            if not include_block_tags and element_name not in _INLINE_TAGS:
                if last_nobreak:
                    last_nobreak = False
                else:
                    raise exception.BlockTagInTranslateableChunk(remainder)
            if element_name in _HTML_PLACEHOLDER_NAMES:  # use meaningful names
                element_name = _HTML_PLACEHOLDER_NAMES[element_name]

            # Make a name for the placeholder; empty elements get no prefix.
            tag_kind = ''
            if not m.group('empty'):
                if m.group('closing'):
                    tag_kind = 'end'
                else:
                    tag_kind = 'begin'
            parts.append((MakeNameClosure(element_name, tag_kind), m.group()))
            current += m.end()
            continue

        # Plain character: extend the current text chunk or start a new one.
        if parts and isinstance(parts[-1], types.StringTypes):
            parts[-1] += html[current]
        else:
            parts.append(html[current])
        current += 1

    # Resolve the names strictly in document order: the end_names stack relies
    # on each begin tag being resolved before its matching end tag.
    text_pieces = []
    placeholders = []
    for part in parts:
        if isinstance(part, types.TupleType):
            final_name = part[0]()
            text_pieces.append(final_name)
            placeholders.append(
                tclib.Placeholder(final_name, part[1], '(HTML code)'))
        else:
            text_pieces.append(part)
    msg_text = ''.join(text_pieces)

    msg = tclib.Message(text=msg_text,
                        placeholders=placeholders,
                        description=description)
    # NOTE(review): presumably GetContent() returns the message's own list, so
    # that this in-place edit is visible through msg -- confirm in tclib.
    content = msg.GetContent()
    for ix, item in enumerate(content):
        if isinstance(item, types.StringTypes):
            content[ix] = util.UnescapeHtml(item, replace_nbsp=False)

    return msg

Example #3

0

Show file

File: util_unittest.py Project: sz21/WTL-DUI

 def testUnescapeHtml(self):
     '''Checks that util.UnescapeHtml decodes numeric character references.'''
     # assertEqual reports both operands on failure, unlike failUnless(a == b).
     # Decimal reference.
     self.assertEqual(util.UnescapeHtml('&#1010;'), unichr(1010))
     # Hexadecimal reference with mixed-case digits: 0xABCD == 43981.
     self.assertEqual(util.UnescapeHtml('&#xABcd;'), unichr(43981))