def plaintext(text, keeplinebreaks=True):
    """Return the plain-text content of `text`.

    :param text: a `Fragment`, which is rendered with the ``'text'`` method,
                 or any other value, which is passed through `striptags`
                 and then `stripentities`
    :param keeplinebreaks: when false, every newline in the result is
                           replaced by a single space
    """
    if isinstance(text, Fragment):
        extracted = text.generate().render('text', encoding=None)
    else:
        extracted = stripentities(striptags(text))

    if keeplinebreaks:
        return extracted
    return extracted.replace(u'\n', u' ')
def handle_starttag(self, tag, attrib):
    """Queue the events for an opening tag.

    A START event carrying the tag and its attributes is enqueued. Tags
    listed in ``self._EMPTY_ELEMS`` get a matching END event right away;
    every other tag is recorded in ``self._open_tags``.

    :param tag: the tag name
    :param attrib: iterable of ``(name, value)`` attribute pairs
    """
    # A minimized attribute (value is None) uses its own name as its value.
    attrs = [
        (QName(name), stripentities(name if value is None else value))
        for name, value in attrib
    ]
    self._enqueue(START, (QName(tag), Attrs(attrs)))

    if tag not in self._EMPTY_ELEMS:
        self._open_tags.append(tag)
    else:
        self._enqueue(END, QName(tag))
def plaintext(text, keeplinebreaks=True):
    """Extract the text elements from (X)HTML content.

    A `Fragment` is rendered with the ``'text'`` method; any other input
    is run through `striptags` followed by `stripentities`.

    :param text: `unicode` or `genshi.builder.Fragment`
    :param keeplinebreaks: optionally keep linebreaks; when false, each
                           newline is replaced by a space
    """
    content = (
        text.generate().render('text', encoding=None)
        if isinstance(text, Fragment)
        else stripentities(striptags(text))
    )
    return content if keeplinebreaks else content.replace(u'\n', u' ')
def handle_starttag(self, tag, attrib):
    """Queue the events for an opening tag.

    Attribute values are normalized to `str` before entities are stripped.
    A START event is then enqueued; tags in ``self._EMPTY_ELEMS`` are
    closed immediately with an END event, all others are pushed onto
    ``self._open_tags``.

    :param tag: the tag name
    :param attrib: iterable of ``(name, value)`` attribute pairs
    """
    normalized = []
    for name, raw in attrib:
        if raw is None:
            # Minimized attribute: the name doubles as the value.
            text = str(name)
        elif isinstance(raw, str):
            text = raw
        else:
            # Non-str values are decoded, replacing undecodable input.
            text = raw.decode(self.encoding, 'replace')
        normalized.append((QName(name), stripentities(text)))

    self._enqueue(START, (QName(tag), Attrs(normalized)))

    if tag not in self._EMPTY_ELEMS:
        self._open_tags.append(tag)
    else:
        self._enqueue(END, QName(tag))
def __call__(self, stream):
    """Apply the filter to the given stream.

    Elements rejected by ``self.is_safe_elem`` are dropped together with
    every event that follows, up to the first END event carrying the same
    tag. On accepted elements, attributes are kept only when they are in
    ``self.safe_attrs``; URI attributes must also pass ``self.is_safe_uri``
    and ``style`` attributes are reduced to the declarations returned by
    ``self.sanitize_css``. COMMENT events are never emitted.

    :param stream: the markup event stream to filter
    """
    skip_until = None

    for kind, data, pos in stream:
        if kind is START:
            if skip_until:
                # Still inside a rejected element.
                continue
            tag, attrs = data
            if not self.is_safe_elem(tag, attrs):
                skip_until = tag
                continue

            kept = []
            for attr, raw in attrs:
                value = stripentities(raw)
                if attr not in self.safe_attrs:
                    continue
                if attr in self.uri_attrs:
                    # Don't allow URI schemes such as "javascript:"
                    if not self.is_safe_uri(value):
                        continue
                elif attr == 'style':
                    # Remove dangerous CSS declarations from inline styles
                    safe_decls = self.sanitize_css(value)
                    if not safe_decls:
                        continue
                    value = '; '.join(safe_decls)
                kept.append((attr, value))

            yield kind, (tag, Attrs(kept)), pos

        elif kind is END:
            if not skip_until:
                yield kind, data, pos
            elif skip_until == data:
                # Closing tag of the rejected element: resume output.
                skip_until = None

        elif kind is not COMMENT and not skip_until:
            yield kind, data, pos
def _to_python(self, value, state=None):
    """Convert `value` via the parent class, then clean up the markup.

    The parent's ``_to_python`` result, when truthy, has its entities
    stripped with `stripentities`, is wrapped in `Markup` and unescaped,
    and is finally passed to ``self.cleaner`` with ``''`` as first argument.

    :param value: the value to convert
    :param state: passed through unchanged to the parent ``_to_python``
    """
    value = super(MarkupConverter, self)._to_python(value, state)
    if value:
        value = Markup(stripentities(value)).unescape()
        # NOTE(review): the source layout was flattened, so it is unclear
        # whether this return belongs inside or after the `if`; as laid out
        # here a falsy value falls through and yields None -- confirm.
        return self.cleaner('', value)
def __call__(self, stream):
    """Apply the filter to the given stream.

    Elements whose tag is not in ``self.safe_tags`` are dropped together
    with every event that follows, up to the first END event carrying the
    same tag. On accepted elements, attributes are kept only when they are
    in ``self.safe_attrs``; URI attributes must use a scheme listed in
    ``self.safe_schemes``, and inline ``style`` attributes are reduced to
    the declarations that contain neither ``expression`` nor a ``url(...)``
    reference with a disallowed scheme. All other events are passed
    through unless they fall inside a dropped element.

    :param stream: the markup event stream to filter
    """
    waiting_for = None

    def _get_scheme(href):
        # Only alphanumeric characters count, so schemes obfuscated with
        # whitespace or punctuation (e.g. "java\tscript:") are still
        # recognized.
        if ':' not in href:
            return None
        chars = [char for char in href.split(':', 1)[0] if char.isalnum()]
        return ''.join(chars).lower()

    # CSS function names are case-insensitive, so "URL(...)" has to be
    # matched just like "url(...)".
    find_urls = re.compile(r'url\s*\(([^)]+)', re.IGNORECASE).finditer

    def _is_evil(decl):
        # "expression" is likewise matched case-insensitively.
        if 'expression' in decl.lower():
            return True
        return any(_get_scheme(m.group(1)) not in self.safe_schemes
                   for m in find_urls(decl))

    for kind, data, pos in stream:
        if kind is START:
            if waiting_for:
                continue
            tag, attrs = data
            if tag not in self.safe_tags:
                waiting_for = tag
                continue

            new_attrs = []
            for attr, value in attrs:
                value = stripentities(value)
                if attr not in self.safe_attrs:
                    continue
                elif attr in self.uri_attrs:
                    # Don't allow URI schemes such as "javascript:"
                    if _get_scheme(value) not in self.safe_schemes:
                        continue
                elif attr == 'style':
                    # Remove dangerous CSS declarations from inline styles
                    value = self._replace_unicode_escapes(value)
                    # Strip before filtering so that whitespace-only
                    # declarations do not survive as empty strings.
                    stripped = (decl.strip() for decl in value.split(';'))
                    decls = [decl for decl in stripped
                             if decl and not _is_evil(decl)]
                    if not decls:
                        continue
                    value = '; '.join(decls)
                new_attrs.append((attr, value))

            yield kind, (tag, Attrs(new_attrs)), pos

        elif kind is END:
            tag = data
            if waiting_for:
                if waiting_for == tag:
                    waiting_for = None
            else:
                yield kind, data, pos

        else:
            if not waiting_for:
                yield kind, data, pos