Python clean Examples

Programming Language: Python

Namespace/Package Name: htmls

Method/Function: clean

Examples at hotexamples.com: 6

Python clean - 6 examples found. These are the top-rated real-world Python examples of htmls.clean, extracted from open-source projects. You can rate examples to help us improve their quality.

Example #1

Show file

File: readability.py Project: yishh/lxml-readability

def score_paragraphs(doc, options):
    """Score the parents and grandparents of text-bearing elements in ``doc``.

    Every element yielded by ``tags(doc, "p", "pre", "td")`` is examined.
    Elements without a parent, or whose cleaned text is shorter than
    ``options['min_text_len']``, are skipped.  For the remaining elements the
    parent (and the grandparent, when there is one) is registered through
    ``score_node`` the first time it is seen, and the element's content score
    is added to the parent's ``'content_score'``; the grandparent receives
    half of it.  Finally every candidate's ``'content_score'`` is multiplied
    by ``1 - get_link_density(node)``.

    Returns the dict mapping each candidate node to its ``score_node`` entry.
    """
    candidates = {}
    ordered = []  # candidate nodes in first-seen order

    def register(node):
        # Create the score entry only once per node.
        if node not in candidates:
            candidates[node] = score_node(node)
            ordered.append(node)

    for elem in tags(doc, "p", "pre", "td"):
        logging.debug('Scoring %s' % describe(elem))

        parent = elem.getparent()
        if parent is None:
            continue
        grandparent = parent.getparent()

        text = clean(elem.text_content() or "")
        text_len = len(text)

        # Too little text to be worth counting.
        if text_len < options['min_text_len']:
            continue

        register(parent)
        if grandparent is not None:
            register(grandparent)

        # One base point, plus one per comma-separated piece, plus up to
        # three points for length.
        # NOTE(review): `/` floors for ints on Python 2 but is true division
        # on Python 3 -- confirm which behaviour is intended.
        content_score = 1 + len(text.split(',')) + min(text_len / 100, 3)

        # Only the ancestors are credited; the element itself never becomes
        # a candidate.
        candidates[parent]['content_score'] += content_score
        if grandparent is not None:
            candidates[grandparent]['content_score'] += content_score / 2.0

    # Scale each candidate by (1 - link density): the larger the link
    # density, the more of the score is removed.
    for node in ordered:
        entry = candidates[node]
        density = get_link_density(node)
        before = entry['content_score']
        logging.debug(
            "Candid: %6.3f %s link density %.3f -> %6.3f" %
            (before, describe(node), density, before * (1 - density)))
        entry['content_score'] *= (1 - density)

    return candidates

Example #2

Show file

File: readability.py Project: yinhm/lxml-readability

def score_paragraphs(doc, options):
    """Build content scores for the ancestors of ``p``/``pre``/``td`` elements.

    For each element from ``tags(doc, "p", "pre", "td")`` that has a parent
    and whose cleaned text is at least ``options['min_text_len']`` long, the
    parent and (if present) the grandparent are given an entry created by
    ``score_node`` on first encounter.  The element's content score is added
    in full to the parent's ``'content_score'`` and halved for the
    grandparent.  Afterwards each entry's ``'content_score'`` is multiplied
    by ``1 - get_link_density(node)``.

    Returns the dict of candidate node -> score entry.
    """
    candidates = {}
    ordered = []  # insertion order of candidates, used for the final pass

    for elem in tags(doc, "p", "pre", "td"):
        logging.debug('Scoring %s' % describe(elem))

        parent = elem.getparent()
        if parent is None:
            continue
        grandparent = parent.getparent()

        inner = clean(elem.text_content() or "")
        inner_len = len(inner)
        if inner_len < options['min_text_len']:
            # Below the configured minimum text length: ignore the element.
            continue

        # Parent first, then grandparent, so `ordered` keeps that sequence.
        for ancestor in (parent, grandparent):
            if ancestor is not None and ancestor not in candidates:
                candidates[ancestor] = score_node(ancestor)
                ordered.append(ancestor)

        # Base point + comma-separated pieces + length bonus capped at 3.
        # NOTE(review): `/` floors for ints on Python 2 but is true division
        # on Python 3 -- confirm which behaviour is intended.
        comma_pieces = len(inner.split(','))
        length_bonus = min((inner_len / 100), 3)
        content_score = 1 + comma_pieces + length_bonus

        # The element itself is not scored, only its ancestors.
        candidates[parent]['content_score'] += content_score
        if grandparent is not None:
            candidates[grandparent]['content_score'] += content_score / 2.0

    # Multiply every candidate's score by (1 - link density).
    for node in ordered:
        entry = candidates[node]
        link_density = get_link_density(node)
        current = entry['content_score']
        scaled = current * (1 - link_density)
        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (current, describe(node), link_density, scaled))
        entry['content_score'] *= (1 - link_density)

    return candidates

Example #3

Show file

File: readability.py Project: yinhm/lxml-readability

def text_length(i):
    """Return the length of ``i``'s text content after it is run through ``clean``.

    A falsy ``text_content()`` result is treated as the empty string.
    """
    raw_text = i.text_content() or ""
    cleaned = clean(raw_text)
    return len(cleaned)

Example #4

Show file

File: multi_page.py Project: yishh/lxml-readability

def eval_link_text(link):
    """Return ``(text, ok)`` for ``link``'s cleaned text content.

    ``ok`` is ``False`` when the text matches ``REGEXES['extraneous']`` or is
    longer than 25 characters, and ``True`` otherwise.
    """
    text = clean(link.text_content() or '')
    # De Morgan form of "matches extraneous OR too long" -> rejected.
    acceptable = not REGEXES['extraneous'].search(text) and len(text) <= 25
    return text, acceptable

Example #5

Show file

File: readability.py Project: yishh/lxml-readability

def text_length(i):
    """Measure the cleaned text content of ``i``.

    Returns ``len(clean(...))`` of the element's text content, substituting
    an empty string when ``text_content()`` yields a falsy value.
    """
    content = i.text_content()
    if not content:
        content = ""
    return len(clean(content))

Example #6

Show file

File: multi_page.py Project: Sadhanandh/Chat-thumbnailer

def eval_link_text(link):
    """Clean ``link``'s text content and flag whether it passes both checks.

    Returns a ``(text, flag)`` tuple.  ``flag`` is ``False`` if the cleaned
    text matches ``REGEXES['extraneous']`` or exceeds 25 characters;
    otherwise it is ``True``.
    """
    text = clean(link.text_content() or '')

    # Guard clauses: either condition alone rejects the text.
    if REGEXES['extraneous'].search(text):
        return text, False
    if len(text) > 25:
        return text, False
    return text, True