def testEscapeUnescaped(self):
    '''Checks util.UnescapeHtml followed by util.EscapeHtml on one string.

    The input is first passed through util.UnescapeHtml and compared with
    the expected unicode text; that result is then passed through
    util.EscapeHtml (second argument True) and compared again.
    '''
    # NOTE(review): the input literal and the second expected value below
    # contain raw characters rather than HTML entity text, and the first
    # expected value contains U+00A0, which is not visibly present in the
    # input.  This looks like entity text (e.g. for the copyright sign and
    # the non-breaking space) was decoded somewhere along the way - confirm
    # against the upstream version of this test before relying on it.
    text = '© & "<hello>"'
    unescaped = util.UnescapeHtml(text)
    # assertEqual reports both values on failure, unlike failUnless(a == b).
    self.assertEqual(unescaped, u'\u00a9\u00a0 & "<hello>"')
    escaped_unescaped = util.EscapeHtml(unescaped, True)
    self.assertEqual(escaped_unescaped, u'\u00a9\u00a0 & "<hello>"')
def HtmlToMessage(html, include_block_tags=False, description=''):
    '''Converts a fragment of HTML into a tclib.Message.

    The HTML is scanned from left to right.  At each position the
    module-level patterns _MESSAGE_NO_BREAK_COMMENT, _NBSP, _REPLACEABLE,
    _SPECIAL_ELEMENT and _ELEMENT are tried in that order.  Matches (other
    than the no-break comment, which is only remembered) become placeholders;
    any other character is accumulated as plain text.  Finally the plain-text
    pieces of the resulting message are passed through util.UnescapeHtml.

    Placeholder names are built as follows:
      - _NBSP matches use the base name 'SPACE'.
      - _REPLACEABLE matches use 'X_<name>_X', where <name> is the pattern's
        'name' group with '-' replaced by '_'.
      - _SPECIAL_ELEMENT matches produce a 'BEGIN_BLOCK' placeholder, the
        matched inner group as plain text, and an 'END_BLOCK' placeholder.
      - _ELEMENT matches use the lowercased element name (or its entry in
        _HTML_PLACEHOLDER_NAMES when there is one), prefixed with 'BEGIN_'
        for opening tags and 'END_' for closing tags; tags whose 'empty'
        group matched get no prefix.
    All names are uppercased.  When the same name is allocated more than
    once, an entry of _SUFFIXES is appended to tell the occurrences apart.

    Example (illustrative only; the patterns, _HTML_PLACEHOLDER_NAMES and
    _SUFFIXES are defined elsewhere, so the exact names may differ):
      html: 'Hello <b>[USERNAME]</b>, how are you?'
      text, presumably: 'Hello BEGIN_BOLDX_USERNAME_XEND_BOLD, how are you?'

    Args:
      html: The HTML text to convert.
      include_block_tags: If false, a _SPECIAL_ELEMENT match, or an element
        whose name is not in _INLINE_TAGS, raises an error unless a no-break
        comment was seen since the last such tag.  If true, these tags are
        accepted.
      description: Passed to tclib.Message as the message description.

    Returns:
      The tclib.Message built from the text and placeholders.

    Raises:
      exception.BlockTagInTranslateableChunk: A block-level tag was found
        while include_block_tags is false and no no-break comment exempted
        it.
    '''
    # Approach is:
    # - first placeholderize, finding <elements>, [REPLACEABLES] and
    #   non-breaking spaces (whatever _NBSP matches)
    # - then unescape character entities in the text in-between placeholders
    parts = []  # List of strings (for text chunks) and tuples
                # (name closure, original) for placeholders
    count_names = {}  # Map of full placeholder names to number of times used
    end_names = {}  # Map of base names to stack of end tags (for correct nesting)

    def MakeNameClosure(base, type=''):
        '''Returns a closure that yields the final name of a placeholder.

        The closure must only be called once all names have been allocated,
        because whether a suffix is needed depends on the final usage count
        of the name.  This allows placeholders to be numbered only when a
        name is used more than once.  It also makes a suffixed end
        placeholder reuse the suffix of the matching begin placeholder when
        tags of the same type are nested.

        Args:
          base: Base name of the placeholder, e.g. 'phname'.
          type: '' | 'begin' | 'end'.  When non-empty it is used as a
            prefix of the name.

        Returns:
          A callable taking no required arguments that returns the name.
        '''
        name = base.upper()
        if type != '':
            name = ('%s_%s' % (type, base)).upper()
        # Count this allocation; the count is read again when the closure runs.
        if name in count_names.keys():
            count_names[name] += 1
        else:
            count_names[name] = 1

        # The defaults capture this allocation's name and its zero-based
        # occurrence index at definition time.
        def MakeFinalName(name_=name, index=count_names[name] - 1):
            # An end tag takes the most recently pushed end name for its
            # base, if any, so that it pairs with the innermost open begin tag.
            if (type.lower() == 'end' and base in end_names.keys()
                    and len(end_names[base])):
                return end_names[base].pop(-1)  # For correct nesting
            if count_names[name_] != 1:
                name_ = '%s_%s' % (name_, _SUFFIXES[index])
                # We need to use a stack to ensure that the end-tag suffixes
                # match the begin-tag suffixes.  Only needed when more than
                # one tag of the same type.
                if type == 'begin':
                    end_name = ('END_%s_%s' % (base, _SUFFIXES[index])).upper()
                    if base in end_names.keys():
                        end_names[base].append(end_name)
                    else:
                        end_names[base] = [end_name]
            return name_

        return MakeFinalName

    current = 0  # Index into html of the next character to process
    # True after a no-break comment, until a block-level tag consumes it.
    last_nobreak = False
    while current < len(html):
        # Each pattern is matched against the remaining text, so match
        # offsets are relative to 'current'.
        m = _MESSAGE_NO_BREAK_COMMENT.match(html[current:])
        if m:
            # The comment itself produces no output; it only exempts the
            # next block-level tag from the check below.
            last_nobreak = True
            current += m.end()
            continue
        m = _NBSP.match(html[current:])
        if m:
            parts.append((MakeNameClosure('SPACE'), m.group()))
            current += m.end()
            continue
        m = _REPLACEABLE.match(html[current:])
        if m:
            # Replaceables allow - but placeholders don't, so replace - with _
            ph_name = MakeNameClosure('X_%s_X' % m.group('name').replace('-', '_'))
            parts.append((ph_name, m.group()))
            current += m.end()
            continue
        m = _SPECIAL_ELEMENT.match(html[current:])
        if m:
            if not include_block_tags:
                if last_nobreak:
                    last_nobreak = False
                else:
                    raise exception.BlockTagInTranslateableChunk(html)
            element_name = 'block'  # for simplification
            # Get the appropriate group name: the first named group with a
            # non-empty match.  If none is found, 'group' is left at the
            # last key iterated.
            # NOTE(review): presumably exactly one named group matches -
            # confirm against the definition of _SPECIAL_ELEMENT.
            for group in m.groupdict().keys():
                if m.groupdict()[group]:
                    break
            # Text before the group becomes the begin placeholder, the group
            # itself stays as text, and the text after it becomes the end
            # placeholder.
            parts.append((MakeNameClosure(element_name, 'begin'),
                          html[current:current + m.start(group)]))
            parts.append(m.group(group))
            parts.append((MakeNameClosure(element_name, 'end'),
                          html[current + m.end(group):current + m.end()]))
            current += m.end()
            continue
        m = _ELEMENT.match(html[current:])
        if m:
            element_name = m.group('element').lower()
            if not include_block_tags and not element_name in _INLINE_TAGS:
                if last_nobreak:
                    last_nobreak = False
                else:
                    raise exception.BlockTagInTranslateableChunk(
                        html[current:])
            if element_name in _HTML_PLACEHOLDER_NAMES:  # use meaningful names
                element_name = _HTML_PLACEHOLDER_NAMES[element_name]
            # Make a name for the placeholder
            type = ''
            if not m.group('empty'):
                if m.group('closing'):
                    type = 'end'
                else:
                    type = 'begin'
            parts.append((MakeNameClosure(element_name, type), m.group()))
            current += m.end()
            continue
        # Nothing matched here: treat the character as plain text, merging
        # it into the previous text chunk when there is one.
        if len(parts) and isinstance(parts[-1], types.StringTypes):
            parts[-1] += html[current]
        else:
            parts.append(html[current])
        current += 1
    # All names are allocated now, so the closures can be resolved, in
    # document order.
    msg_text = ''
    placeholders = []
    for part in parts:
        if isinstance(part, types.TupleType):
            final_name = part[0]()
            original = part[1]
            msg_text += final_name
            placeholders.append(
                tclib.Placeholder(final_name, original, '(HTML code)'))
        else:
            msg_text += part
    msg = tclib.Message(text=msg_text, placeholders=placeholders,
                        description=description)
    # Unescape entities in the text items of the message content.
    # NOTE(review): this assumes GetContent() returns the message's own list
    # so that the in-place assignment affects msg - confirm in tclib.
    content = msg.GetContent()
    for ix in range(len(content)):
        if isinstance(content[ix], types.StringTypes):
            content[ix] = util.UnescapeHtml(content[ix], replace_nbsp=False)
    return msg
def testUnescapeHtml(self):
    '''Checks util.UnescapeHtml against two expected single characters.'''
    # NOTE(review): the inputs are the raw characters U+03F2 (decimal 1010)
    # and U+ABCD (decimal 43981), i.e. already equal to the expected output.
    # Presumably the inputs were meant to be numeric character references
    # for those code points - confirm against the upstream version of this
    # test.
    # assertEqual reports both values on failure, unlike failUnless(a == b).
    self.assertEqual(util.UnescapeHtml('ϲ'), unichr(1010))
    self.assertEqual(util.UnescapeHtml('ꯍ'), unichr(43981))