def clean_xhtml(string, p_wrap=True, _cleaner_settings=None):
    """Convert the given plain text or HTML into valid XHTML.

    If there is no markup in the string, apply paragraph formatting.

    The input is normalized first (carriage returns dropped, non-breaking
    spaces -- both the U+00A0 character and the ``&nbsp;`` entity -- turned
    into plain spaces, matches of the module-level ``blank_line`` pattern
    replaced by ``<br/>``) and is then run through the sanitizer twice;
    see the inline FIXME for the reason behind the second pass.

    :param string: XHTML input string
    :type string: unicode
    :param p_wrap: Wrap the output in <p></p> tags? Wrapping only happens
        when the cleaned document consists of a single text node.
    :type p_wrap: bool
    :param _cleaner_settings: Constructor kwargs for
        :class:`mediadrop.lib.htmlsanitizer.Cleaner`. Falls back to the
        module-level ``cleaner_settings`` when ``None``.
    :type _cleaner_settings: dict
    :returns: XHTML, or an empty string if the input is ``None``, empty
        or contains only whitespace
    :rtype: unicode
    """
    if not string or not string.strip():
        # If the string is none, or empty, or whitespace
        return u""

    if _cleaner_settings is None:
        _cleaner_settings = cleaner_settings

    # remove carriage return chars; FIXME: is this necessary?
    string = string.replace(u"\r", u"")

    # remove non-breaking-space characters. FIXME: is this necessary?
    # Both spellings have to be covered: the raw U+00A0 character and the
    # named entity. Replacing only the character leaves "&nbsp;" untouched.
    string = string.replace(u"\xa0", u" ")
    string = string.replace(u"&nbsp;", u" ")

    # replace all blank lines with <br> tags
    string = blank_line.sub(u"<br/>", string)

    # initialize and run the cleaner
    string = Cleaner(string, **_cleaner_settings)()

    # FIXME: It's possible that the rename_tags operation creates
    #        some invalid nesting. e.g.
    #        >>> c = Cleaner("", "rename_tags", elem_map={'h2': 'p'})
    #        >>> c('<p><h2>head</h2></p>')
    #        u'<p><p>head</p></p>'
    #        This is undesirable, so here we... just re-parse the markup.
    #        But this ... could be pretty slow.
    cleaner = Cleaner(string, **_cleaner_settings)
    string = cleaner()

    # Wrap in a <p> tag when no tags are used, and there are no blank
    # lines to trigger automatic <p> creation
    # FIXME: This should trigger any time we don't have a wrapping block tag
    # FIXME: This doesn't wrap orphaned text when it follows a <p> tag, for ex
    contents = cleaner.root.contents
    if p_wrap and len(contents) == 1 and isinstance(contents[0], basestring):
        string = u"<p>%s</p>" % string.strip()

    # strip all whitespace from immediately before/after block-level elements
    string = block_spaces.sub(u"\\1", string)

    return string.strip()
def truncate_xhtml(string, size, _strip_xhtml=False, _decode_entities=False):
    """Truncate a XHTML string to roughly a given size (full words).

    Entities are always decoded before measuring, so the length check and
    the truncation operate on the decoded text. Unless ``_decode_entities``
    is set they are encoded again before returning, regardless of whether
    the string actually had to be shortened.

    :param string: XHTML
    :type string: unicode
    :param size: Max length
    :param _strip_xhtml: Flag to strip out all XHTML
    :param _decode_entities: Flag to convert XHTML entities to unicode chars
    :returns: The (possibly shortened) string with surrounding whitespace
        removed, or an empty string for empty/``None`` input
    :rtype: unicode
    """
    if not string:
        return u''

    if _strip_xhtml:
        # Insert whitespace after block elements.
        # So they are separated when we strip the xhtml.
        string = block_spaces.sub(u"\\1 ", string)
        string = strip_xhtml(string)

    # Work on decoded text so that an entity counts as a single character.
    string = decode_entities(string)

    if len(string) > size:
        string = text.truncate(string, length=size, whole_word=True)

    # Restoring the entities must not depend on whether truncation happened:
    # they were decoded unconditionally above, so skipping this step for
    # short strings would hand back decoded text to callers that asked for
    # entities to be preserved.
    if _strip_xhtml:
        if not _decode_entities:
            # re-encode the entities, if we have to.
            string = encode_entities(string)
    else:
        # Markup was kept, so re-parse it through the Cleaner with the
        # truncation filters.
        filters = list(truncate_filters)
        if not _decode_entities:
            # re-encode the entities, if we have to.
            filters.insert(0, 'encode_xml_specials')
        string = Cleaner(string, *filters, **cleaner_settings)()

    return string.strip()