Exemple #1
0
def title(soup, source_url=None):
    """
    Extract meta title.

    Each tag in TITLE_TAGS is tried in order; the first one for which
    ``_extract_tag_data`` yields a truthy value wins. If none matches,
    the page's ``<title>`` element is used instead.

    :param soup: parsed document exposing a ``title`` attribute
        (presumably a BeautifulSoup object -- confirm against callers).
    :param source_url: not used by this function.
    :return: the value produced by ``text.prepare`` for the first match,
        or None when there is neither a matching tag nor a ``<title>``.
    """
    for tag in TITLE_TAGS:
        data = _extract_tag_data(soup, tag)
        if data:
            return text.prepare(data)

    # fallback on page title. A document without a <title> element gives
    # None here, so return None rather than raising AttributeError.
    page_title = soup.title
    if page_title is None:
        return None
    return text.prepare(page_title.text.strip())
Exemple #2
0
def title(soup, source_url=None):
    """
    Extract meta title.

    Walks TITLE_TAGS in order and returns the prepared text of the first
    tag for which ``_extract_tag_data`` returns something truthy. When no
    tag matches, falls back on the document's ``<title>`` element.

    :param soup: parsed document exposing a ``title`` attribute
        (presumably a BeautifulSoup object -- confirm against callers).
    :param source_url: not used by this function.
    :return: the result of ``text.prepare``, or None if the document has
        no matching tag and no ``<title>`` element.
    """
    for tag in TITLE_TAGS:
        data = _extract_tag_data(soup, tag)
        if data:
            return text.prepare(data)

    # fallback on page title; guard against documents that have no
    # <title> element at all instead of failing on ``None.text``.
    fallback = soup.title
    if fallback is None:
        return None
    return text.prepare(fallback.text.strip())
Exemple #3
0
def strip_tags(htmlstring):
    """
    Strip tags from an html string and clean the remaining text.

    The string is fed to an ``MLStripper`` and whatever it collected is
    passed through ``text.prepare``.
    """
    stripper = MLStripper()
    stripper.feed(htmlstring)
    return text.prepare(stripper.get_data())
Exemple #4
0
def description(soup, source_url=None):
    """
    Extract meta description.

    Returns the prepared text of the first tag in DESC_TAGS that yields
    data, or None when no tag does. ``source_url`` is not used.
    """
    # Lazy generator: extraction stops at the first truthy value.
    extracted = (_extract_tag_data(soup, tag) for tag in DESC_TAGS)
    first_hit = next((value for value in extracted if value), None)
    if not first_hit:
        return None
    return text.prepare(first_hit)
Exemple #5
0
def strip_tags(htmlstring):
    """
    Remove tags from html and return the cleaned text.

    Runs the input through an ``MLStripper`` and hands the collected
    data to ``text.prepare``.
    """
    parser = MLStripper()
    parser.feed(htmlstring)
    collected = parser.get_data()
    cleaned = text.prepare(collected)
    return cleaned
Exemple #6
0
def description(soup, source_url=None):
    """
    Extract meta description.

    Tries every tag in DESC_TAGS in order; the first one with data is
    prepared and returned. Returns None if none has data.
    ``source_url`` is not used.
    """
    for candidate in DESC_TAGS:
        found = _extract_tag_data(soup, candidate)
        if not found:
            continue
        return text.prepare(found)
    return None
Exemple #7
0
def _prepare_str(o, field, source_url=None):
    """
    Prepare the text/html value stored under ``field`` in ``o``.

    Returns None when the field is missing or holds None. Values that
    ``html.is_html`` recognises go through ``html.prepare`` (together
    with ``source_url``); anything else goes through ``text.prepare``.
    """
    value = o[field] if field in o else None
    if value is None:
        return None
    return (
        html.prepare(value, source_url)
        if html.is_html(value)
        else text.prepare(value)
    )
def prepare_str(o, field, source_url=None):
    """
    Prepare the text/html value stored under ``field`` in ``o``.

    A missing field and a field set to None both give None. Otherwise
    the value is cleaned by ``html.prepare`` (with ``source_url``) when
    ``html.is_html`` says it is html, and by ``text.prepare`` if not.
    """
    raw = o[field] if field in o else None
    if raw is None:
        return None
    if not html.is_html(raw):
        return text.prepare(raw)
    return html.prepare(raw, source_url)
Exemple #9
0
def prepare(htmlstring, source_url=None, safe_attrs=('src', 'href')):
    """
    Cleanse an htmlstring of its attributes, absolutify images and
    links, ascii-dammify it, and clean whitespace.

    The string is run through ``clean.Cleaner`` keeping only the
    attributes named in ``safe_attrs``, then through ``make_abs`` with
    ``source_url``, ``get_inner`` and finally ``text.prepare``.

    :param htmlstring: the html to clean; any falsy value gives None.
    :param source_url: passed to ``make_abs`` (presumably the base for
        resolving relative links -- confirm against ``make_abs``).
    :param safe_attrs: iterable of attribute names to keep. The default
        is an immutable tuple so it cannot be altered between calls.
    :return: the value produced by ``text.prepare``, or None for empty
        input.
    """
    if not htmlstring:
        return None
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=set(safe_attrs))
    cleansed = cleaner.clean_html(htmlstring)
    soup = make_abs(cleansed, source_url)
    cleansed = get_inner(soup)
    return text.prepare(cleansed)
Exemple #10
0
def prepare(htmlstring, source_url=None, safe_attrs=('src', 'href')):
    """
    Cleanse an htmlstring of its attributes, absolutify images and
    links, ascii-dammify it, and clean whitespace.

    Pipeline: ``clean.Cleaner`` (only attributes in ``safe_attrs`` are
    kept) -> ``make_abs`` with ``source_url`` -> ``get_inner`` ->
    ``text.prepare``.

    :param htmlstring: the html to clean; any falsy value gives None.
    :param source_url: forwarded to ``make_abs`` (presumably used as the
        base url for relative links -- confirm against ``make_abs``).
    :param safe_attrs: iterable of attribute names to keep. An immutable
        tuple is used as the default to avoid a shared mutable default.
    :return: the value produced by ``text.prepare``, or None for empty
        input.
    """
    if not htmlstring:
        return None
    # Copy into a set so the caller's iterable is never handed to Cleaner.
    allowed = set(safe_attrs)
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=allowed)
    cleansed = cleaner.clean_html(htmlstring)
    soup = make_abs(cleansed, source_url)
    cleansed = get_inner(soup)
    return text.prepare(cleansed)