def remove_nodes_via_regex(doc, pattern):
    for selector in ['id', 'class']:
        reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
        naughty_list = doc.xpath(reg, namespaces={'re': REGEX_NS})
        for node in naughty_list:
            parser.remove(node)
    return doc
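
# A minimal usage sketch for remove_nodes_via_regex. It assumes lxml.html
# documents and that REGEX_NS is the EXSLT regexp namespace
# ('http://exslt.org/regular-expressions'), which lxml supports natively:
#
#     import lxml.html
#     doc = lxml.html.fromstring(
#         '<div id="sidebar-ads">junk</div><p class="story">body</p>')
#     doc = remove_nodes_via_regex(doc, 'sidebar|ad')
#     # the div is gone; only <p class="story"> remains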
def get_replacement_nodes(doc, div):
    replacement_text = []
    nodes_to_return = []
    nodes_to_remove = []
    childs = parser.child_nodes_with_text(div)
    for kid in childs:
        # node is a p and we already have some replacement text:
        # flush the buffer into a new paragraph before keeping the p
        if parser.get_tag(kid) == 'p' and len(replacement_text) > 0:
            new_node = get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
            nodes_to_return.append(kid)
        # node is a text node
        elif parser.is_text_node(kid):
            kid_text_node = kid
            kid_text = parser.get_text(kid)
            replace_text = kid_text
            for p, w in TABS_AND_NEWLINES:
                replace_text = replace_text.replace(p, w)
            if len(replace_text) > 1:
                # fold in any not-yet-used <a> siblings preceding the text node
                prev_sib_node = parser.previous_sibling(kid_text_node)
                while prev_sib_node is not None \
                        and parser.get_tag(prev_sib_node) == "a" \
                        and parser.get_attribute(prev_sib_node, 'usedalready') != 'yes':
                    outer = " " + parser.outer_html(prev_sib_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(prev_sib_node)
                    parser.set_attribute(prev_sib_node, attr='usedalready', value='yes')
                    prev_sib_node = parser.previous_sibling(prev_sib_node)
                # append replace_text
                replacement_text.append(replace_text)
                # fold in any not-yet-used <a> siblings following the text node
                next_sib_node = parser.next_sibling(kid_text_node)
                while next_sib_node is not None \
                        and parser.get_tag(next_sib_node) == "a" \
                        and parser.get_attribute(next_sib_node, 'usedalready') != 'yes':
                    outer = " " + parser.outer_html(next_sib_node) + " "
                    replacement_text.append(outer)
                    nodes_to_remove.append(next_sib_node)
                    parser.set_attribute(next_sib_node, attr='usedalready', value='yes')
                    next_sib_node = parser.next_sibling(next_sib_node)
        # otherwise keep the child as-is
        else:
            nodes_to_return.append(kid)
    # flush out anything still remaining
    if len(replacement_text) > 0:
        new_node = get_flushed_buffer(''.join(replacement_text), doc)
        nodes_to_return.append(new_node)
        replacement_text = []
    for n in nodes_to_remove:
        parser.remove(n)
    return nodes_to_return
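
# Illustration of what get_replacement_nodes does to a div whose content is
# loose text mixed with inline links (hypothetical markup; the exact output
# node depends on get_flushed_buffer):
#
#     <div>Some text <a href="/x">a link</a> more text<p>real para</p></div>
#
# The loose text and its neighboring, not-yet-used <a> siblings are gathered
# into the replacement buffer and flushed as a new paragraph node, so the
# returned nodes are roughly:
#
#     <p>Some text <a href="/x">a link</a> more text</p>, <p>real para</p>
#
# The consumed <a> elements are queued in nodes_to_remove and detached at the
# end, and the 'usedalready' marker prevents inlining the same link twice.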
def is_table_tag_and_no_paragraphs_exist(e):
    # strip out short paragraphs first, then report whether the element is a
    # table-like node with no real paragraphs left (td cells are spared)
    sub_paragraphs = parser.get_elements_by_tag(e, tag='p')
    for p in sub_paragraphs:
        txt = parser.get_text(p)
        if len(txt) < 25:
            parser.remove(p)
    sub_paragraphs2 = parser.get_elements_by_tag(e, tag='p')
    if len(sub_paragraphs2) == 0 and e.tag != 'td':
        return True
    return False
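
# Example of the intent (hypothetical markup): a wrapper whose only
# paragraphs are short boilerplate counts as paragraph-free, because every
# <p> under 25 characters is stripped before the check:
#
#     <table><tr><td><p>short caption</p></td></tr></table>
#
# After the short paragraph is removed, no <p> remains and the tag is not
# 'td', so the function returns True and cleanup() drops the element.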
def cleanup(target_node):
    """
    Remove any divs that look like non-content, clusters of links,
    or paragraphs with no gusto.
    """
    node = add_siblings(target_node)
    for e in node.getchildren():
        if e.tag != 'p':
            if is_high_link_density(e) \
                    or is_table_tag_and_no_paragraphs_exist(e) \
                    or not node_score_threshold_met(node, e):
                parser.remove(e)
    return node
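
# Sketch of how cleanup is meant to be driven (add_siblings,
# is_high_link_density and node_score_threshold_met are assumed to be
# defined elsewhere in this module; the scorer name is hypothetical):
#
#     top_node = calculate_best_node(doc)   # hypothetical scoring step
#     article_node = cleanup(top_node)
#
# A non-<p> child is removed when any single check fails: too link-dense,
# a paragraph-free table-like block, or a score below the sibling threshold.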
def remove_paragraphs_with_few_words(top_node):
    """
    Remove paragraphs that have fewer than x words; such nodes
    usually turn out to be some sort of link.
    """
    all_nodes = parser.get_elements_by_tags(top_node, ['*'])
    all_nodes.reverse()
    for el in all_nodes:
        text = parser.get_text(el)
        # get_stop_word_count returns a stats object, not a bare count
        word_stats = StopWords().get_stop_word_count(text)
        if word_stats.get_stop_word_count() < 3 \
                and len(parser.get_elements_by_tag(el, tag='object')) == 0 \
                and len(parser.get_elements_by_tag(el, tag='embed')) == 0:
            parser.remove(el)
        # TODO: check whether this parenthetical test is in the right place.
        else:
            trimmed = parser.get_text(el)
            if trimmed.startswith("(") and trimmed.endswith(")"):
                parser.remove(el)
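
# Worked example of the stop-word gate (assuming an English stop-word list):
#
#     "Click here now"                       -> ~1 stop word  -> removed
#     "It was the best of times, it was..."  -> well over 3   -> kept
#
# Nodes containing <object> or <embed> are spared regardless, since media
# containers legitimately carry little text.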
def remove_script_and_style(doc):
    # remove scripts
    scripts = parser.get_elements_by_tag(doc, tag='script')
    for item in scripts:
        parser.remove(item)
    # remove styles
    styles = parser.get_elements_by_tag(doc, tag='style')
    for item in styles:
        parser.remove(item)
    # remove comments
    comments = parser.get_comments(doc)
    for item in comments:
        parser.remove(item)
    return doc
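
# Usage sketch, assuming an lxml document:
#
#     import lxml.html
#     doc = lxml.html.fromstring(
#         '<html><head><style>p{}</style></head>'
#         '<body><script>x()</script><!-- ad slot --><p>text</p></body></html>')
#     doc = remove_script_and_style(doc)
#     # only <p>text</p> survives in <body>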
def clean_bad_tags(doc):
    # ids
    naughty_list = doc.xpath(QUERY_IDS, namespaces={'re': REGEX_NS})
    for node in naughty_list:
        parser.remove(node)
    # classes
    naughty_classes = doc.xpath(QUERY_CLASSES, namespaces={'re': REGEX_NS})
    for node in naughty_classes:
        parser.remove(node)
    # names
    naughty_names = doc.xpath(QUERY_NAMES, namespaces={'re': REGEX_NS})
    for node in naughty_names:
        parser.remove(node)
    return doc
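
# QUERY_IDS, QUERY_CLASSES and QUERY_NAMES are module-level constants not
# shown here; by analogy with remove_nodes_via_regex they are presumably
# EXSLT-regex XPath queries over a shared blacklist, along the lines of:
#
#     RE_NAUGHTY = 'comment|community|footer|sidebar|sponsor'   # assumed
#     QUERY_IDS = "//*[re:test(@id, '%s', 'i')]" % RE_NAUGHTY
#     QUERY_CLASSES = "//*[re:test(@class, '%s', 'i')]" % RE_NAUGHTY
#     QUERY_NAMES = "//*[re:test(@name, '%s', 'i')]" % RE_NAUGHTY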