def process_html(cls, text: str, environment: TypographEnvironment = None, encoding: str = 'utf-8', autolink: bool = True) -> str:
    """
    Parse an HTML string, run the typograph over the tree and serialize it back.

    :param text: HTML text to process
    :param environment: typograph environment; when ``None`` the result of
        ``get_default_environment()`` is used instead
    :param encoding: encoding used both to serialize the tree and to decode
        the serialized bytes back into ``str``
    :param autolink: when true, in-text URL addresses are converted into links
        via ``clean.autolink`` using ``cls.AUTOLINK_REGEX``
    :return: the processed markup, with every occurrence of ``cls.VERBATIM``
        replaced by ``&``
    """
    env = get_default_environment() if environment is None else environment

    root = html.fromstring(text)
    cls._process_node(root, env)

    if autolink:
        # An empty avoid_hosts list is passed explicitly, overriding the
        # library default for that argument.
        clean.autolink(root, [cls.AUTOLINK_REGEX], avoid_hosts=[])

    serialized = html.tostring(root, encoding=encoding)
    return serialized.decode(encoding).replace(cls.VERBATIM, '&')
def _to_python(self, value, state):
    """Sanitize an HTML fragment and return its serialized form.

    Line breaks in ``value`` are turned into ``<br>`` tags, the text is parsed
    as a fragment wrapped in a parent ``div``, every attribute outside a fixed
    whitelist is removed, and the tree is then passed through lxml's
    ``Cleaner``, ``autolink`` and ``word_break``.

    :param value: the raw HTML text to validate
    :param state: validator state, forwarded to ``self.message`` and to the
        raised ``formencode.Invalid``
    :return: the result of ``lxml.etree.tostring`` on the cleaned fragment
    :raises formencode.Invalid: with the ``invalidHTML`` message when any
        step of the conversion fails
    """
    try:
        # Imported lazily inside the try block, so a missing lxml is also
        # reported as a validation failure (as in the original code).
        from lxml.html.clean import Cleaner, autolink, word_break
        from lxml.html import fragment_fromstring
        from lxml.etree import tostring

        allowed_attributes = ['href', 'target', 'rel', 'name', 'title',
                              'src', 'width', 'height', 'alt']
        cleaner = Cleaner(
            style=True,
            add_nofollow=True,
            allow_tags=['div', 'p', 'br', 'a', 'strong', 'b', 'blockquote',
                        'em', 'i', 'img', 'u', 's', 'del'],
            remove_unknown_tags=False,
        )

        # "\r\n" must be replaced first so it yields one <br>, not two.
        markup = (value.replace("\r\n", "<br>")
                       .replace("\n", "<br>")
                       .replace("\r", "<br>"))
        fragment = fragment_fromstring(markup, create_parent='div')

        for element in fragment.xpath('//*[@*]'):
            # Iterate over a snapshot of the keys: attributes are deleted
            # inside the loop, and mutating the mapping while walking its
            # live iterator is unsafe.
            for attr_name in list(element.attrib.keys()):
                if attr_name not in allowed_attributes:
                    del element.attrib[attr_name]

        cleaner(fragment)
        autolink(fragment)
        word_break(fragment)
        return tostring(fragment)
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # are no longer turned into a validation error.
        raise formencode.Invalid(self.message('invalidHTML', state), value, state)
def fix_links(doc):
    """Autolink ``doc`` and make every anchor with an ``href`` open in a new window.

    ``autolink(doc)`` is called first; afterwards each element matched by the
    XPath ``//a[@href]`` gets ``target="_blank"``. The document is modified in
    place and also returned.
    """
    autolink(doc)
    anchors = doc.xpath('//a[@href]')
    for anchor in anchors:
        anchor.attrib['target'] = '_blank'
    return doc
def autolink(doc):
    """Turn plain-text URLs (http:// strings) in ``doc`` into HTML links.

    Delegates to ``clean.autolink``; see the "autolink" section of
    http://lxml.de/lxmlhtml.html. The same ``doc`` object that was passed in
    is returned.
    """
    clean.autolink(doc)
    return doc