Beispiel #1
0
def headings(doc):
    """Collect the non-blank text of the heading elements found in ``doc``.

    Looks up ``h1`` through ``h5`` elements (``h6`` is not part of the
    lookup) via ``parser.get_elements_by_tags`` and returns their ``text``
    values with surrounding whitespace stripped. Elements whose ``text`` is
    missing, empty, or whitespace-only are skipped.

    Returns:
        A list of stripped heading strings, in the order the parser
        returned the elements.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5']
    raw_texts = (
        element.text
        for element in parser.get_elements_by_tags(doc, heading_tags)
    )
    # ``raw`` may be None or blank; only keep entries with real content.
    return [raw.strip() for raw in raw_texts if raw and raw.strip()]
Beispiel #2
0
def convert_div_to_p(doc, dom_type):
    """Rework every ``dom_type`` element of ``doc`` and return ``doc``.

    For each element returned by ``parser.get_elements_by_tag(doc,
    tag=dom_type)``:

    * if it contains none of the tags in ``block_tags`` it is handed to
      ``replace_elements_with_p(doc, element)`` (presumably turning it into
      a paragraph -- confirm against that helper);
    * otherwise its content is cleared and refilled, in order, with the
      nodes produced by ``get_replacement_nodes(doc, element)``.

    Args:
        doc: The document tree to process; it is modified in place.
        dom_type: Tag name of the elements to process (e.g. ``'div'``).

    Returns:
        The same ``doc`` object that was passed in.
    """
    block_tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre',
                  'table', 'ul']
    for div in parser.get_elements_by_tag(doc, tag=dom_type):
        # Guard before the element is used at all; previously the None
        # check only happened after ``div`` had been passed to the parser.
        if div is None:
            continue

        if len(parser.get_elements_by_tags(div, block_tags)) == 0:
            replace_elements_with_p(doc, div)
            continue

        # Fetch the replacements first: ``clear()`` empties the element, so
        # they have to be computed from its original content.
        replace_nodes = get_replacement_nodes(doc, div)
        div.clear()
        for position, node in enumerate(replace_nodes):
            div.insert(position, node)
    return doc
Beispiel #3
0
def remove_paragraphs_with_few_words(top_node):
    """
    Drop elements under ``top_node`` that carry too little real text.

    Every element matched by the ``'*'`` lookup is visited in reverse of the
    order the parser returned them. An element is removed when either:

    * its text yields fewer than 3 stop words and no ``object`` or
      ``embed`` element is found for it (such short text is taken to be
      some sort of link), or
    * its text starts with "(" and ends with ")".
    """
    def _finds_no(node, tag_name):
        # True when the parser finds no ``tag_name`` element for ``node``.
        return len(parser.get_elements_by_tag(node, tag=tag_name)) == 0

    candidates = parser.get_elements_by_tags(top_node, ['*'])
    for el in reversed(candidates):
        text = parser.get_text(el)
        word_stats = StopWords().get_stop_word_count(text)
        if (word_stats.get_stop_word_count() < 3
                and _finds_no(el, 'object')
                and _finds_no(el, 'embed')):
            parser.remove(el)
            continue

        # TODO: Check if it is in the right place.
        trimmed = parser.get_text(el)
        if trimmed.startswith("(") and trimmed.endswith(")"):
            parser.remove(el)