def title(soup, source_url=None):
    """
    Extract meta title.

    Each tag in TITLE_TAGS is tried in order via `_extract_tag_data`;
    the first truthy result is passed through `text.prepare` and returned.
    When no tag yields data, the page's own title (`soup.title`) is used.

    :param soup: parsed document exposing a `title` attribute
        (presumably a BeautifulSoup tree -- confirm against callers).
    :param source_url: unused here; kept for a signature parallel to the
        other extractors.
    :returns: the prepared title, or None when the document has neither a
        matching meta tag nor a title element.
    """
    for tag in TITLE_TAGS:
        data = _extract_tag_data(soup, tag)
        if data:
            return text.prepare(data)

    # fallback on page title; a document without a title element yields
    # None instead of raising AttributeError on `None.text`.
    page_title = soup.title
    if page_title is None:
        return None
    return text.prepare(page_title.text.strip())
def strip_tags(htmlstring):
    """
    Strip tags from an html string and clean the remaining text.

    The string is fed to an `MLStripper`, and whatever it collected is
    passed through `text.prepare` before being returned.
    """
    stripper = MLStripper()
    stripper.feed(htmlstring)
    return text.prepare(stripper.get_data())
def description(soup, source_url=None):
    """
    Extract meta description.

    Tags in DESC_TAGS are tried in order; the first one for which
    `_extract_tag_data` returns something truthy is passed through
    `text.prepare` and returned. Returns None when no tag matches.
    `source_url` is accepted but not used.
    """
    # lazy generator: extraction stops at the first tag that yields data
    candidates = (_extract_tag_data(soup, tag) for tag in DESC_TAGS)
    found = next((value for value in candidates if value), None)
    if found is None:
        return None
    return text.prepare(found)
def _prepare_str(o, field, source_url=None): """ Prepare text/html field """ if field not in o: return None if o[field] is None: return None if html.is_html(o[field]): return html.prepare(o[field], source_url) return text.prepare(o[field])
def prepare_str(o, field, source_url=None):
    """
    Prepare text/html field

    Public counterpart of `_prepare_str`; it delegates to that helper so
    the two cannot drift apart.

    :param o: mapping-like object holding the field.
    :param field: key to look up in `o`.
    :param source_url: forwarded to `html.prepare` for html values.
    :returns: None when `field` is absent from `o` or its value is None;
        otherwise the value prepared by `html.prepare` (if `html.is_html`
        reports html) or by `text.prepare`.
    """
    return _prepare_str(o, field, source_url)
def prepare(htmlstring, source_url=None, safe_attrs=('src', 'href')):
    """
    Cleanse an htmlstring of its attributes, absolutify images and links,
    and run the result through `text.prepare`.

    :param htmlstring: raw html; a falsy value (None, empty string)
        short-circuits to None.
    :param source_url: passed to `make_abs` together with the cleansed
        html (presumably the base for resolving relative links -- confirm).
    :param safe_attrs: iterable of attribute names the cleaner keeps; all
        other attributes are dropped (`safe_attrs_only=True`). The default
        is an immutable tuple so it cannot be altered between calls.
    :returns: the prepared html string, or None for falsy input.
    """
    if not htmlstring:
        return None

    # the cleaner receives a fresh set, so the caller's iterable is
    # never mutated
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=set(safe_attrs))
    cleansed = cleaner.clean_html(htmlstring)
    soup = make_abs(cleansed, source_url)
    cleansed = get_inner(soup)
    return text.prepare(cleansed)