class DocumentCleaner(object):
    """Strip boilerplate (navigation, social widgets, scripts, comments)
    from ``article.doc`` before text extraction.

    Each helper takes the lxml document, mutates it through ``self.parser``
    and returns it, so the steps in :meth:`clean` can be chained.
    """

    def __init__(self, config, article):
        # extraction configuration
        self.config = config
        # parser adapter (lxml wrapper) supplied by the configuration
        self.parser = self.config.get_parser()
        # the article whose ``doc`` will be cleaned
        self.article = article
        # id/class/name fragments that mark a node as boilerplate
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|"
            "navbar|storytopbar-bucket|utility-bar|inline-share-tools"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt"
            "|^links$|meta$|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies")
        self.regexp_namespace = "http://exslt.org/regular-expressions"
        # XPath queries matching nodes whose id/class/name hits the regex
        self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_classes_re = "//*[re:test(@class, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_names_re = "//*[re:test(@name, '%s', 'i')]" % self.remove_nodes_re
        self.div_to_p_re = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.caption_re = "^caption$"
        self.google_re = " google "
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_braodcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        # collapses tabs and whitespace-only lines when flattening text nodes
        self.tablines_replacements = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")

    def clean(self):
        """Run the full cleaning pipeline over ``self.article.doc``."""
        doc_to_clean = self.article.doc
        doc_to_clean = self.clean_body_classes(doc_to_clean)
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.clean_em_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.caption_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.google_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.entries_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean,
                                               self.facebook_braodcasting_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.twitter_re)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        doc_to_clean = self.convert_wanted_tags_to_paragraphs(
            doc_to_clean, ARTICLE_ROOT_TAGS)
        return doc_to_clean

    def clean_body_classes(self, doc):
        """Drop the class attribute from <body>.

        In case it matches an unwanted class, the whole document would
        otherwise be removed by the bad-tag queries.
        """
        elements = self.parser.getElementsByTag(doc, tag="body")
        if elements:
            self.parser.delAttribute(elements[0], attr="class")
        return doc

    def clean_article_tags(self, doc):
        """Strip id/name/class attributes from <article> elements."""
        articles = self.parser.getElementsByTag(doc, tag='article')
        for article in articles:
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def clean_em_tags(self, doc):
        """Unwrap <em> tags that do not contain an image."""
        ems = self.parser.getElementsByTag(doc, tag='em')
        for node in ems:
            images = self.parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                self.parser.drop_tag(node)
        return doc

    def remove_drop_caps(self, doc):
        """Unwrap decorative drop-cap spans, keeping their text."""
        items = self.parser.css_select(
            doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            self.parser.drop_tag(item)
        return doc

    def remove_scripts_styles(self, doc):
        """Remove <script>, <style> and HTML comment nodes."""
        # remove scripts
        scripts = self.parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            self.parser.remove(item)
        # remove styles
        styles = self.parser.getElementsByTag(doc, tag='style')
        for item in styles:
            self.parser.remove(item)
        # remove comments
        comments = self.parser.getComments(doc)
        for item in comments:
            self.parser.remove(item)
        return doc

    def clean_bad_tags(self, doc):
        """Remove nodes whose id, class or name matches the boilerplate regex."""
        # ids
        naughty_list = self.parser.xpath_re(doc, self.nauthy_ids_re)
        for node in naughty_list:
            self.parser.remove(node)
        # class
        naughty_classes = self.parser.xpath_re(doc, self.nauthy_classes_re)
        for node in naughty_classes:
            self.parser.remove(node)
        # name
        naughty_names = self.parser.xpath_re(doc, self.nauthy_names_re)
        for node in naughty_names:
            self.parser.remove(node)
        return doc

    def remove_nodes_regex(self, doc, pattern):
        """Remove nodes whose id or class matches ``pattern`` (case-insensitive)."""
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughty_list = self.parser.xpath_re(doc, reg)
            for node in naughty_list:
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        """Unwrap <span> elements found inside paragraphs."""
        spans = self.parser.css_select(doc, 'p span')
        for item in spans:
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        """Turn accumulated text fragments into a new paragraph node."""
        return self.parser.textToPara(replacement_text)

    def get_replacement_nodes(self, doc, div):
        """Flatten ``div``'s children into paragraph nodes.

        Text children are buffered (absorbing adjacent <a> siblings) and
        flushed into <p> nodes; existing <p> children pass through unchanged.
        """
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        childs = self.parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text: flush it
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                newNode = self.get_flushed_buffer(''.join(replacement_text), doc)
                nodes_to_return.append(newNode)
                replacement_text = []
                nodes_to_return.append(kid)
            # node is a text node
            elif self.parser.isTextNode(kid):
                kid_text_node = kid
                kid_text = self.parser.getText(kid)
                replace_text = self.tablines_replacements.replaceAll(kid_text)
                if len(replace_text) > 1:
                    # absorb the chain of preceding, not-yet-used <a> siblings
                    previous_sibling_node = self.parser.previousSibling(
                        kid_text_node)
                    while previous_sibling_node is not None \
                            and self.parser.getTag(previous_sibling_node) == "a" \
                            and self.parser.getAttribute(previous_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(
                            previous_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(previous_sibling_node)
                        self.parser.setAttribute(previous_sibling_node,
                                                 attr='grv-usedalready',
                                                 value='yes')
                        previous_sibling_node = self.parser.previousSibling(
                            previous_sibling_node)
                    # append the cleaned text itself
                    replacement_text.append(replace_text)
                    # absorb the chain of following, not-yet-used <a> siblings
                    next_sibling_node = self.parser.nextSibling(kid_text_node)
                    while next_sibling_node is not None \
                            and self.parser.getTag(next_sibling_node) == "a" \
                            and self.parser.getAttribute(next_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(
                            next_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(next_sibling_node)
                        self.parser.setAttribute(next_sibling_node,
                                                 attr='grv-usedalready',
                                                 value='yes')
                        # BUG FIX: advance next_sibling_node; the original
                        # reassigned previous_sibling_node here, so only the
                        # first trailing <a> was ever absorbed.
                        next_sibling_node = self.parser.nextSibling(
                            next_sibling_node)
            # otherwise keep the child as-is
            else:
                nodes_to_return.append(kid)
        # flush out anything still remaining
        if len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
        for n in nodes_to_remove:
            self.parser.remove(n)
        return nodes_to_return

    def replace_with_para(self, doc, div):
        """Convert ``div`` in place into a <p> element."""
        self.parser.replaceTag(div, 'p')

    def convert_wanted_tags_to_paragraphs(self, doc, wanted_tags):
        """Turn container tags into paragraphs.

        Containers without block-level descendants are renamed to <p>;
        others have their children flattened via get_replacement_nodes.
        """
        selected = self.parser.getElementsByTags(doc, wanted_tags)
        for elem in selected:
            if not self.parser.getElementsByTags(elem, BLOCK_ELEMENT_TAGS):
                self.replace_with_para(doc, elem)
            else:
                replaceNodes = self.get_replacement_nodes(doc, elem)
                elem.clear()
                for c, n in enumerate(replaceNodes):
                    elem.insert(c, n)
        return doc
class DocumentCleaner(object):
    """Strip boilerplate (navigation, social widgets, scripts, comments)
    from an article document before text extraction.

    Each helper takes the lxml document, mutates it through ``self.parser``
    and returns it, so the steps in :meth:`clean` can be chained.
    """

    def __init__(self, config):
        # extraction configuration
        self.config = config
        # parser adapter (lxml wrapper) supplied by the configuration
        self.parser = self.config.get_parser()
        # id/class/name fragments that mark a node as boilerplate
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies"
        )
        self.regexp_namespace = "http://exslt.org/regular-expressions"
        # XPath queries matching nodes whose id/class/name hits the regex
        self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_classes_re = "//*[re:test(@class, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_names_re = "//*[re:test(@name, '%s', 'i')]" % self.remove_nodes_re
        self.div_to_p_re = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.caption_re = "^caption$"
        self.google_re = " google "
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_braodcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        # collapses tabs and whitespace-only lines when flattening text nodes
        self.tablines_replacements = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")

    def clean(self, article):
        """Run the full cleaning pipeline over ``article.doc``."""
        doc_to_clean = article.doc
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.clean_em_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.caption_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.google_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.entries_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean,
                                               self.facebook_braodcasting_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.twitter_re)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        doc_to_clean = self.div_to_para(doc_to_clean, 'div')
        doc_to_clean = self.div_to_para(doc_to_clean, 'span')
        return doc_to_clean

    def clean_article_tags(self, doc):
        """Strip id/name/class attributes from <article> elements."""
        articles = self.parser.getElementsByTag(doc, tag='article')
        for article in articles:
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def clean_em_tags(self, doc):
        """Unwrap <em> tags that do not contain an image."""
        ems = self.parser.getElementsByTag(doc, tag='em')
        for node in ems:
            images = self.parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                self.parser.drop_tag(node)
        return doc

    def remove_drop_caps(self, doc):
        """Unwrap decorative drop-cap spans, keeping their text."""
        items = self.parser.css_select(
            doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            self.parser.drop_tag(item)
        return doc

    def remove_scripts_styles(self, doc):
        """Remove <script>, <style> and HTML comment nodes."""
        # remove scripts
        scripts = self.parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            self.parser.remove(item)
        # remove styles
        styles = self.parser.getElementsByTag(doc, tag='style')
        for item in styles:
            self.parser.remove(item)
        # remove comments
        comments = self.parser.getComments(doc)
        for item in comments:
            self.parser.remove(item)
        return doc

    def clean_bad_tags(self, doc):
        """Remove nodes whose id, class or name matches the boilerplate regex."""
        # ids
        naughty_list = self.parser.xpath_re(doc, self.nauthy_ids_re)
        for node in naughty_list:
            self.parser.remove(node)
        # class
        naughty_classes = self.parser.xpath_re(doc, self.nauthy_classes_re)
        for node in naughty_classes:
            self.parser.remove(node)
        # name
        naughty_names = self.parser.xpath_re(doc, self.nauthy_names_re)
        for node in naughty_names:
            self.parser.remove(node)
        return doc

    def remove_nodes_regex(self, doc, pattern):
        """Remove nodes whose id or class matches ``pattern`` (case-insensitive)."""
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughty_list = self.parser.xpath_re(doc, reg)
            for node in naughty_list:
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        """Unwrap <span> elements that are direct children of paragraphs."""
        spans = self.parser.css_select(doc, 'p > span')
        for item in spans:
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        """Turn accumulated text fragments into a new paragraph node."""
        return self.parser.textToPara(replacement_text)

    def get_replacement_nodes(self, doc, div):
        """Flatten ``div``'s children into paragraph nodes.

        Text children are buffered (absorbing adjacent <a> siblings) and
        flushed into <p> nodes; existing <p> children pass through unchanged.
        """
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        childs = self.parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text: flush it
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                newNode = self.get_flushed_buffer(''.join(replacement_text), doc)
                nodes_to_return.append(newNode)
                replacement_text = []
                nodes_to_return.append(kid)
            # node is a text node
            elif self.parser.isTextNode(kid):
                kid_text_node = kid
                kid_text = self.parser.getText(kid)
                replace_text = self.tablines_replacements.replaceAll(kid_text)
                if len(replace_text) > 1:
                    # absorb the chain of preceding, not-yet-used <a> siblings
                    previous_sibling_node = self.parser.previousSibling(
                        kid_text_node)
                    while previous_sibling_node is not None \
                            and self.parser.getTag(previous_sibling_node) == "a" \
                            and self.parser.getAttribute(previous_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(
                            previous_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(previous_sibling_node)
                        self.parser.setAttribute(previous_sibling_node,
                                                 attr='grv-usedalready',
                                                 value='yes')
                        previous_sibling_node = self.parser.previousSibling(
                            previous_sibling_node)
                    # append the cleaned text itself
                    replacement_text.append(replace_text)
                    # absorb the chain of following, not-yet-used <a> siblings
                    next_sibling_node = self.parser.nextSibling(kid_text_node)
                    while next_sibling_node is not None \
                            and self.parser.getTag(next_sibling_node) == "a" \
                            and self.parser.getAttribute(next_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(
                            next_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(next_sibling_node)
                        self.parser.setAttribute(next_sibling_node,
                                                 attr='grv-usedalready',
                                                 value='yes')
                        # BUG FIX: advance next_sibling_node; the original
                        # reassigned previous_sibling_node here, so only the
                        # first trailing <a> was ever absorbed.
                        next_sibling_node = self.parser.nextSibling(
                            next_sibling_node)
            # otherwise keep the child as-is
            else:
                nodes_to_return.append(kid)
        # flush out anything still remaining
        if len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
        for n in nodes_to_remove:
            self.parser.remove(n)
        return nodes_to_return

    def replace_with_para(self, doc, div):
        """Convert ``div`` in place into a <p> element."""
        self.parser.replaceTag(div, 'p')

    def div_to_para(self, doc, dom_type):
        """Turn ``dom_type`` containers into paragraphs.

        Containers without block-level descendants are renamed to <p>;
        others have their children flattened via get_replacement_nodes.
        """
        bad_divs = 0
        else_divs = 0
        divs = self.parser.getElementsByTag(doc, tag=dom_type)
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre',
                'table', 'ul']
        for div in divs:
            items = self.parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replace_with_para(doc, div)
                bad_divs += 1
            elif div is not None:
                replaceNodes = self.get_replacement_nodes(doc, div)
                div.clear()
                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                else_divs += 1
        return doc
class DocumentCleaner(object):
    """Strip boilerplate nodes from an article document.

    This variant uses the static ``Parser`` helper and raw lxml calls
    (``doc.xpath`` / ``doc.cssselect``). Each helper mutates the document
    and returns it so :meth:`clean` can chain the steps.
    """

    def __init__(self):
        # id/class/name fragments that mark a node as boilerplate
        self.regExRemoveNodes = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|source|legende|ajoutVideo|timestamp"
        )
        # EXSLT namespace enabling re:test() inside XPath
        self.regexpNS = "http://exslt.org/regular-expressions"
        self.queryNaughtyIDs = "//*[re:test(@id, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyClasses = "//*[re:test(@class, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyNames = "//*[re:test(@name, '%s', 'i')]" % self.regExRemoveNodes
        self.divToPElementsPattern = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.captionPattern = "^caption$"
        self.googlePattern = " google "
        self.entriesPattern = "^[^entry-]more.*$"
        self.facebookPattern = "[^-]facebook"
        self.facebookBroadcastingPattern = "facebook-broadcasting"
        self.twitterPattern = "[^-]twitter"
        # collapses tabs and whitespace-only lines when flattening text nodes
        self.tabsAndNewLinesReplcesments = ReplaceSequence().create("\n", "\n\n").append("\t").append("^\\s+$")

    def clean(self, article):
        """Run the full cleaning pipeline over ``article.doc``."""
        docToClean = article.doc
        docToClean = self.cleanEmTags(docToClean)
        docToClean = self.removeDropCaps(docToClean)
        docToClean = self.removeScriptsAndStyles(docToClean)
        docToClean = self.cleanBadTags(docToClean)
        docToClean = self.removeNodesViaRegEx(docToClean, self.captionPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.googlePattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.entriesPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookPattern)
        docToClean = self.removeNodesViaRegEx(docToClean,
                                              self.facebookBroadcastingPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.twitterPattern)
        docToClean = self.cleanUpSpanTagsInParagraphs(docToClean)
        docToClean = self.convertDivsToParagraphs(docToClean, "div")
        docToClean = self.convertDivsToParagraphs(docToClean, "span")
        return docToClean

    def cleanEmTags(self, doc):
        """Unwrap <em> tags that do not contain an image."""
        ems = Parser.getElementsByTag(doc, tag="em")
        for node in ems:
            images = Parser.getElementsByTag(node, tag="img")
            if len(images) == 0:
                node.drop_tag()
        return doc

    def removeDropCaps(self, doc):
        """Unwrap decorative drop-cap spans, keeping their text."""
        items = doc.cssselect("span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            item.drop_tag()
        return doc

    def removeScriptsAndStyles(self, doc):
        """Remove <script>, <style> and HTML comment nodes."""
        # remove scripts
        scripts = Parser.getElementsByTag(doc, tag="script")
        for item in scripts:
            Parser.remove(item)
        # remove styles
        styles = Parser.getElementsByTag(doc, tag="style")
        for item in styles:
            Parser.remove(item)
        # remove comments
        comments = Parser.getComments(doc)
        for item in comments:
            Parser.remove(item)
        return doc

    def cleanBadTags(self, doc):
        """Remove nodes whose id, class or name matches the boilerplate regex."""
        # ids
        naughtyList = doc.xpath(self.queryNaughtyIDs,
                                namespaces={"re": self.regexpNS})
        for node in naughtyList:
            Parser.remove(node)
        # class
        naughtyClasses = doc.xpath(self.queryNaughtyClasses,
                                   namespaces={"re": self.regexpNS})
        for node in naughtyClasses:
            Parser.remove(node)
        # name
        naughtyNames = doc.xpath(self.queryNaughtyNames,
                                 namespaces={"re": self.regexpNS})
        for node in naughtyNames:
            Parser.remove(node)
        return doc

    def removeNodesViaRegEx(self, doc, pattern):
        """Remove nodes whose id or class matches ``pattern`` (case-insensitive)."""
        for selector in ["id", "class"]:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughtyList = doc.xpath(reg, namespaces={"re": self.regexpNS})
            for node in naughtyList:
                Parser.remove(node)
        return doc

    def cleanUpSpanTagsInParagraphs(self, doc):
        """Unwrap <span> elements that are direct children of paragraphs."""
        spans = doc.cssselect("p > span")
        for item in spans:
            item.drop_tag()
        return doc

    def getFlushedBuffer(self, replacementText, doc):
        """Turn accumulated text fragments into a new paragraph node."""
        return Parser.textToPara(replacementText)

    def getReplacementNodes(self, doc, div):
        """Flatten ``div``'s children into paragraph nodes.

        Text children are buffered (absorbing adjacent <a> siblings) and
        flushed into <p> nodes; existing <p> children pass through unchanged.
        """
        replacementText = []
        nodesToReturn = []
        nodesToRemove = []
        childs = Parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text: flush it
            if Parser.getTag(kid) == "p" and len(replacementText) > 0:
                newNode = self.getFlushedBuffer("".join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
                nodesToReturn.append(kid)
            # node is a text node
            elif Parser.isTextNode(kid):
                kidTextNode = kid
                kidText = Parser.getText(kid)
                replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
                if len(replaceText) > 1:
                    # absorb the chain of preceding, not-yet-used <a> siblings
                    prevSibNode = Parser.previousSibling(kidTextNode)
                    while (
                        prevSibNode is not None
                        and Parser.getTag(prevSibNode) == "a"
                        and Parser.getAttribute(prevSibNode, "grv-usedalready") != "yes"
                    ):
                        outer = " " + Parser.outerHtml(prevSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(prevSibNode)
                        Parser.setAttribute(prevSibNode, attr="grv-usedalready",
                                            value="yes")
                        prevSibNode = Parser.previousSibling(prevSibNode)
                    # append the cleaned text itself
                    replacementText.append(replaceText)
                    # absorb the chain of following, not-yet-used <a> siblings
                    nextSibNode = Parser.nextSibling(kidTextNode)
                    while (
                        nextSibNode is not None
                        and Parser.getTag(nextSibNode) == "a"
                        and Parser.getAttribute(nextSibNode, "grv-usedalready") != "yes"
                    ):
                        outer = " " + Parser.outerHtml(nextSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(nextSibNode)
                        Parser.setAttribute(nextSibNode, attr="grv-usedalready",
                                            value="yes")
                        # BUG FIX: advance nextSibNode; the original reassigned
                        # prevSibNode here, so only the first trailing <a> was
                        # ever absorbed.
                        nextSibNode = Parser.nextSibling(nextSibNode)
            # otherwise keep the child as-is
            else:
                nodesToReturn.append(kid)
        # flush out anything still remaining
        if len(replacementText) > 0:
            newNode = self.getFlushedBuffer("".join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
        for n in nodesToRemove:
            Parser.remove(n)
        return nodesToReturn

    def replaceElementsWithPara(self, doc, div):
        """Convert ``div`` in place into a <p> element."""
        Parser.replaceTag(div, "p")

    def convertDivsToParagraphs(self, doc, domType):
        """Turn ``domType`` containers into paragraphs.

        Containers without block-level descendants are renamed to <p>;
        others have their children flattened via getReplacementNodes.
        """
        badDivs = 0
        elseDivs = 0
        divs = Parser.getElementsByTag(doc, tag=domType)
        tags = ["a", "blockquote", "dl", "div", "img", "ol", "p", "pre",
                "table", "ul"]
        for div in divs:
            items = Parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replaceElementsWithPara(doc, div)
                badDivs += 1
            elif div is not None:
                replaceNodes = self.getReplacementNodes(doc, div)
                div.clear()
                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                elseDivs += 1
        return doc
class DocumentCleaner(object):
    """Strip boilerplate nodes from an article document.

    This variant routes XPath/CSS queries through the ``cache`` helper and
    the static ``Parser`` class. Each helper mutates the document and
    returns it so :meth:`clean` can chain the steps.
    """

    def __init__(self):
        # id/class/name fragments that mark a node as boilerplate
        self.regExRemoveNodes = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|source|legende|ajoutVideo|timestamp")
        # EXSLT namespace enabling re:test() inside XPath
        self.regexpNS = "http://exslt.org/regular-expressions"
        self.queryNaughtyIDs = "//*[re:test(@id, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyClasses = "//*[re:test(@class, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyNames = "//*[re:test(@name, '%s', 'i')]" % self.regExRemoveNodes
        self.divToPElementsPattern = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.captionPattern = "^caption$"
        self.googlePattern = " google "
        self.entriesPattern = "^[^entry-]more.*$"
        self.facebookPattern = "[^-]facebook"
        self.twitterPattern = "[^-]twitter"
        # collapses tabs and whitespace-only lines when flattening text nodes
        self.tabsAndNewLinesReplcesments = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")

    def clean(self, article):
        """Run the full cleaning pipeline over ``article.doc``."""
        docToClean = article.doc
        docToClean = self.cleanEmTags(docToClean)
        docToClean = self.removeDropCaps(docToClean)
        docToClean = self.removeScriptsAndStyles(docToClean)
        docToClean = self.cleanBadTags(docToClean)
        docToClean = self.removeNodesViaRegEx(docToClean, self.captionPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.googlePattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.entriesPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.twitterPattern)
        docToClean = self.cleanUpSpanTagsInParagraphs(docToClean)
        docToClean = self.convertDivsToParagraphs(docToClean, 'div')
        docToClean = self.convertDivsToParagraphs(docToClean, 'span')
        return docToClean

    def cleanEmTags(self, doc):
        """Unwrap <em> tags that do not contain an image."""
        ems = Parser.getElementsByTag(doc, tag='em')
        for node in ems:
            images = Parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                node.drop_tag()
        return doc

    def removeDropCaps(self, doc):
        """Unwrap decorative drop-cap spans, keeping their text."""
        items = cache.cssselect("span[class~=dropcap], span[class~=drop_cap]",
                                doc)
        for item in items:
            item.drop_tag()
        return doc

    def removeScriptsAndStyles(self, doc):
        """Remove <script>, <style> and HTML comment nodes."""
        # remove scripts
        scripts = Parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            Parser.remove(item)
        # remove styles
        styles = Parser.getElementsByTag(doc, tag='style')
        for item in styles:
            Parser.remove(item)
        # remove comments
        comments = Parser.getComments(doc)
        for item in comments:
            Parser.remove(item)
        return doc

    def cleanBadTags(self, doc):
        """Remove nodes whose id, class or name matches the boilerplate regex."""
        # ids
        naughtyList = cache.xpath(self.queryNaughtyIDs, doc,
                                  namespaces={'re': self.regexpNS})
        for node in naughtyList:
            Parser.remove(node)
        # class
        naughtyClasses = cache.xpath(self.queryNaughtyClasses, doc,
                                     namespaces={'re': self.regexpNS})
        for node in naughtyClasses:
            Parser.remove(node)
        # name
        naughtyNames = cache.xpath(self.queryNaughtyNames, doc,
                                   namespaces={'re': self.regexpNS})
        for node in naughtyNames:
            Parser.remove(node)
        return doc

    def removeNodesViaRegEx(self, doc, pattern):
        """Remove nodes whose id or class matches ``pattern`` (case-insensitive)."""
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughtyList = cache.xpath(reg, doc,
                                      namespaces={'re': self.regexpNS})
            for node in naughtyList:
                Parser.remove(node)
        return doc

    def cleanUpSpanTagsInParagraphs(self, doc):
        """Unwrap <span> elements that are direct children of paragraphs."""
        spans = cache.cssselect('p > span', doc)
        for item in spans:
            item.drop_tag()
        return doc

    def getFlushedBuffer(self, replacementText, doc):
        """Turn accumulated text fragments into a new paragraph node."""
        return Parser.textToPara(replacementText)

    def getReplacementNodes(self, doc, div):
        """Flatten ``div``'s children into paragraph nodes.

        Text children are buffered (absorbing adjacent <a> siblings) and
        flushed into <p> nodes; existing <p> children pass through unchanged.
        """
        replacementText = []
        nodesToReturn = []
        nodesToRemove = []
        childs = Parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text: flush it
            if Parser.getTag(kid) == 'p' and len(replacementText) > 0:
                newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
                nodesToReturn.append(kid)
            # node is a text node
            elif Parser.isTextNode(kid):
                kidTextNode = kid
                kidText = Parser.getText(kid)
                replaceText = self.tabsAndNewLinesReplcesments.replaceAll(
                    kidText)
                if len(replaceText) > 1:
                    # absorb the chain of preceding, not-yet-used <a> siblings
                    prevSibNode = Parser.previousSibling(kidTextNode)
                    while prevSibNode is not None \
                            and Parser.getTag(prevSibNode) == "a" \
                            and Parser.getAttribute(prevSibNode,
                                                    'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(prevSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(prevSibNode)
                        Parser.setAttribute(prevSibNode,
                                            attr='grv-usedalready',
                                            value='yes')
                        prevSibNode = Parser.previousSibling(prevSibNode)
                    # append the cleaned text itself
                    replacementText.append(replaceText)
                    # absorb the chain of following, not-yet-used <a> siblings
                    nextSibNode = Parser.nextSibling(kidTextNode)
                    while nextSibNode is not None \
                            and Parser.getTag(nextSibNode) == "a" \
                            and Parser.getAttribute(nextSibNode,
                                                    'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(nextSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(nextSibNode)
                        Parser.setAttribute(nextSibNode,
                                            attr='grv-usedalready',
                                            value='yes')
                        # BUG FIX: advance nextSibNode; the original reassigned
                        # prevSibNode here, so only the first trailing <a> was
                        # ever absorbed.
                        nextSibNode = Parser.nextSibling(nextSibNode)
            # otherwise keep the child as-is
            else:
                nodesToReturn.append(kid)
        # flush out anything still remaining
        if len(replacementText) > 0:
            newNode = self.getFlushedBuffer(''.join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
        for n in nodesToRemove:
            Parser.remove(n)
        return nodesToReturn

    def replaceElementsWithPara(self, doc, div):
        """Convert ``div`` in place into a <p> element."""
        Parser.replaceTag(div, 'p')

    def convertDivsToParagraphs(self, doc, domType):
        """Turn ``domType`` containers into paragraphs.

        Containers without block-level descendants are renamed to <p>;
        others have their children flattened via getReplacementNodes.
        """
        badDivs = 0
        elseDivs = 0
        divs = Parser.getElementsByTag(doc, tag=domType)
        tags = [
            'a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre',
            'table', 'ul'
        ]
        for div in divs:
            items = Parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replaceElementsWithPara(doc, div)
                badDivs += 1
            elif div is not None:
                replaceNodes = self.getReplacementNodes(doc, div)
                div.clear()
                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                elseDivs += 1
        return doc
class DocumentCleaner(object):
    """Strip boilerplate from ``article.doc`` before text extraction.

    This variant extends the generic boilerplate regex with site-specific
    fragments (Daily Mail, NYTimes, Wikimedia projects, ...) and supports a
    per-host override list of CSS selectors: when such selectors are known
    for the article's domain, only those nodes are removed.
    """

    def __init__(self, config, article):
        # extraction configuration
        self.config = config
        # parser adapter (lxml wrapper) supplied by the configuration
        self.parser = self.config.get_parser()
        # the article whose ``doc`` will be cleaned
        self.article = article
        # id/class/name fragments that mark a node as boilerplate
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|"
            "navbar|storytopbar-bucket|utility-bar|inline-share-tools"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt"
            "|^links$|meta$|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies|printfriendly|share"
        )
        # dailymail remove nodes
        self.remove_nodes_re += "|related-carousel|xwv-related-videos-container"
        # nytimes remove nodes
        self.remove_nodes_re += "|visually-hidden|robots-nocontent"
        # *.wikipedia.org
        self.remove_nodes_re += "|mw-editsection|^cite_ref|noprint|References|siteSub"
        self.remove_nodes_re += "|collapsed|mw-headline-anchor|filetoc|noviewer"
        # *.wiktionary.org
        self.remove_nodes_re += "|ib-brac"
        # *.wikibooks.org
        self.remove_nodes_re += "|status-icon"
        # www.wikidata.org
        self.remove_nodes_re += "|wikibase-edittoolbar-container"
        # http://www.dailymail.co.uk/news/article-2742786/Complacent-Home-Office-loses-175-000-illegal-immigrants-Fresh-humiliation-officials-admit-went-missing-refused-permission-stay.html
        self.remove_nodes_re += "|most-read-news-wrapper|most-watched-videos-wrapper"
        self.regexp_namespace = "http://exslt.org/regular-expressions"
        # XPath queries matching nodes whose id/class/name hits the regex
        self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_classes_re = "//*[re:test(@class, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_names_re = "//*[re:test(@name, '%s', 'i')]" % self.remove_nodes_re
        # tags removed wholesale by clean_bad_tags (images inside survive)
        self.nauthy_tags = ["noscript"]
        self.google_re = " google "
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_braodcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        # collapses tabs and whitespace-only lines when flattening text nodes
        self.tablines_replacements = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")

    def set_known_host_remove_selectors(self):
        """Look up host-specific removal selectors for the article's domain."""
        self.known_host_remove_selectors = HostUtils.host_selectors(
            _Const().get_known_host_remove_selectors, self.article.domain)

    def clean(self):
        """Run the cleaning pipeline over ``self.article.doc``.

        If host-specific selectors exist for this domain, only those nodes
        are removed (after scripts/styles); otherwise the generic pipeline
        runs.
        """
        doc_to_clean = self.article.doc
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        self.set_known_host_remove_selectors()
        if self.known_host_remove_selectors:
            return self.remove_host_specific_nodes(doc_to_clean)
        doc_to_clean = self.clean_body_classes(doc_to_clean)
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.google_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.entries_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean,
                                               self.facebook_braodcasting_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.twitter_re)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        doc_to_clean = self.div_to_para(doc_to_clean, 'div')
        doc_to_clean = self.div_to_para(doc_to_clean, 'span')
        return doc_to_clean

    def clean_body_classes(self, doc):
        """Drop the class attribute from <body>.

        In case it matches an unwanted class, the whole document would
        otherwise be removed by the bad-tag queries.
        """
        elements = self.parser.getElementsByTag(doc, tag="body")
        if elements:
            self.parser.delAttribute(elements[0], attr="class")
        return doc

    def clean_article_tags(self, doc):
        """Strip id/name/class attributes from <article> elements."""
        articles = self.parser.getElementsByTag(doc, tag='article')
        for article in articles:
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def remove_drop_caps(self, doc):
        """Unwrap decorative drop-cap spans, keeping their text."""
        items = self.parser.css_select(
            doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            self.parser.drop_tag(item)
        return doc

    def remove_scripts_styles(self, doc):
        """Remove <script>, <style> and HTML comment nodes."""
        # remove scripts
        scripts = self.parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            self.parser.remove(item)
        # remove styles
        styles = self.parser.getElementsByTag(doc, tag='style')
        for item in styles:
            self.parser.remove(item)
        # remove comments
        comments = self.parser.getComments(doc)
        for item in comments:
            self.parser.remove(item)
        return doc

    def clean_bad_tags(self, doc):
        """Remove nodes matching the boilerplate regex and nauthy tags.

        For tags in ``self.nauthy_tags`` (e.g. <noscript>), any <img>
        descendants are hoisted to the parent before the tag is handled,
        so lazy-load fallback images are preserved.
        """
        # ids
        naughty_list = self.parser.xpath_re(doc, self.nauthy_ids_re)
        for node in naughty_list:
            self.parser.remove(node)
        # class
        naughty_classes = self.parser.xpath_re(doc, self.nauthy_classes_re)
        for node in naughty_classes:
            self.parser.remove(node)
        # name
        naughty_names = self.parser.xpath_re(doc, self.nauthy_names_re)
        for node in naughty_names:
            self.parser.remove(node)
        for nauthy_tag in self.nauthy_tags:
            nodes = self.parser.getElementsByTag(doc, tag=nauthy_tag)
            for node in nodes:
                images = self.parser.getElementsByTag(node, tag='img')
                if images:
                    # hoist the images next to the nauthy node
                    # NOTE(review): the nauthy node itself is kept in this
                    # branch — confirm whether it should also be removed.
                    parent = node.getparent()
                    parent_index = parent.index(node)
                    for image in images:
                        parent.insert(parent_index, image)
                else:
                    self.parser.remove(node)
        return doc

    def remove_host_specific_nodes(self, doc):
        """Remove nodes matching the host-specific CSS selectors."""
        nodes = self.parser.css_select(doc, self.known_host_remove_selectors)
        for node in nodes:
            self.parser.remove(node)
        return doc

    def remove_nodes_regex(self, doc, pattern):
        """Remove nodes whose id or class matches ``pattern`` (case-insensitive)."""
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughty_list = self.parser.xpath_re(doc, reg)
            for node in naughty_list:
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        """Unwrap <span> elements found inside paragraphs."""
        spans = self.parser.css_select(doc, 'p span')
        for item in spans:
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        """Turn accumulated text fragments into a new paragraph node."""
        return self.parser.textToPara(replacement_text)

    def get_replacement_nodes(self, doc, div):
        """Flatten ``div``'s children into paragraph nodes.

        Text children are buffered (absorbing adjacent <a> siblings) and
        flushed into <p> nodes; existing <p> children pass through unchanged.
        """
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        childs = self.parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text: flush it
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                newNode = self.get_flushed_buffer(''.join(replacement_text), doc)
                nodes_to_return.append(newNode)
                replacement_text = []
                nodes_to_return.append(kid)
            # node is a text node
            elif self.parser.isTextNode(kid):
                kid_text_node = kid
                kid_text = self.parser.getText(kid)
                replace_text = self.tablines_replacements.replaceAll(kid_text)
                if len(replace_text) > 1:
                    # absorb the chain of preceding, not-yet-used <a> siblings
                    previous_sibling_node = self.parser.previousSibling(
                        kid_text_node)
                    while previous_sibling_node is not None \
                            and self.parser.getTag(previous_sibling_node) == "a" \
                            and self.parser.getAttribute(previous_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(
                            previous_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(previous_sibling_node)
                        self.parser.setAttribute(previous_sibling_node,
                                                 attr='grv-usedalready',
                                                 value='yes')
                        previous_sibling_node = self.parser.previousSibling(
                            previous_sibling_node)
                    # append the cleaned text itself
                    replacement_text.append(replace_text)
                    # absorb the chain of following, not-yet-used <a> siblings
                    next_sibling_node = self.parser.nextSibling(kid_text_node)
                    while next_sibling_node is not None \
                            and self.parser.getTag(next_sibling_node) == "a" \
                            and self.parser.getAttribute(next_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(
                            next_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(next_sibling_node)
                        self.parser.setAttribute(next_sibling_node,
                                                 attr='grv-usedalready',
                                                 value='yes')
                        # BUG FIX: advance next_sibling_node; the original
                        # reassigned previous_sibling_node here, so only the
                        # first trailing <a> was ever absorbed.
                        next_sibling_node = self.parser.nextSibling(
                            next_sibling_node)
            # otherwise keep the child as-is
            else:
                nodes_to_return.append(kid)
        # flush out anything still remaining
        if len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
        for n in nodes_to_remove:
            self.parser.remove(n)
        return nodes_to_return

    def replace_with_para(self, doc, div):
        """Convert ``div`` in place into a <p> element."""
        self.parser.replaceTag(div, 'p')

    def div_to_para(self, doc, dom_type):
        """Turn ``dom_type`` containers into paragraphs.

        Containers without block-level descendants are renamed to <p>;
        others have their children replaced by the flattened node list
        (children are removed one by one so tail text of the container
        is kept, unlike ``clear()``).
        """
        bad_divs = 0
        else_divs = 0
        divs = self.parser.getElementsByTag(doc, tag=dom_type)
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre',
                'table', 'ul']
        for div in divs:
            items = self.parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replace_with_para(doc, div)
                bad_divs += 1
            elif div is not None:
                replaceNodes = self.get_replacement_nodes(doc, div)
                for child in self.parser.childNodes(div):
                    div.remove(child)
                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                else_divs += 1
        return doc
class DocumentCleaner(object):
    """Strips boilerplate (navigation, comments, share widgets, ads, ...)
    from a parsed lxml article document and normalizes what remains into
    paragraph-shaped content.

    NOTE(review): this appears to be a divergent variant of the cleaner
    above — it uses a static ``Parser`` facade instead of an injected
    parser, and takes the article at ``clean()`` time rather than in the
    constructor.
    """

    def __init__(self):
        # Regex alternation of id/class/name fragments that mark a node as
        # boilerplate to delete (matched case-insensitively via self.todel).
        self.regExRemoveNodes = (
            "^side$|combx|retweet|fontresize|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers|rating"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|articlead"
            "|date|^print$|popup|author-dropdown|tools|socialtools"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|source|legende|ajoutVideo|timestamp|menu"
        )
        # Fragments that veto deletion (e.g. "article", "body", "main").
        self.regExNotRemoveNodes = ("and|no|article|body|column|main|shadow")
        # EXSLT namespace enabling re:test() in XPath expressions.
        self.regexpNS = "http://exslt.org/regular-expressions"
        self.divToPElementsPattern = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        # Patterns fed to removeNodesViaRegEx in clean().
        self.captionPattern = "^caption$"
        self.googlePattern = " google "
        self.entriesPattern = "^[^entry-]more.*$"
        self.facebookPattern = "[^-]facebook"
        self.facebookBroadcastingPattern = "facebook-broadcasting"
        self.twitterPattern = "[^-]twitter"
        # Collapses tabs and whitespace-only lines, doubles newlines.
        self.tabsAndNewLinesReplcesments = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")
        # Pre-split lowercase word lists used by getNodesToDelete for
        # plain substring matching (NOT regex matching, despite the source
        # strings containing ^/$ anchors).
        self.todel = self.regExRemoveNodes.lower().split('|')
        self.notdel = self.regExNotRemoveNodes.lower().split('|')

    def clean(self, article):
        """Run the full cleaning pipeline over article.doc and return it.

        Mutates the document in place at every step; each helper returns
        the same doc for chaining.
        """
        docToClean = article.doc
        # delete scripts/styles/comments and boilerplate-named nodes first
        nodelist = self.getNodesToDelete(docToClean)
        for node in nodelist:
            Parser.remove(node)
        docToClean = self.removeListsWithLinks(docToClean)
        docToClean = self.dropTags(docToClean, ['em', 'strong'])
        docToClean = self.removeDropCaps(docToClean)
        docToClean = self.removeNodesViaRegEx(docToClean, self.captionPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.googlePattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.entriesPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookBroadcastingPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.twitterPattern)
        docToClean = self.cleanUpSpanTagsInParagraphs(docToClean)
        docToClean = self.keepLineBreaks(docToClean)
        docToClean = self.convertDivsToParagraphs(docToClean, 'div')
        docToClean = self.convertDivsToParagraphs(docToClean, 'span')
        return docToClean

    def getNodesToDelete(self, doc):
        """Recursively collect nodes to delete under `doc`.

        A node is flagged when it is a script/style/comment, or when its
        class/id/name contains a bad word from self.todel that is not
        excused by a good word from self.notdel. Children of flagged
        nodes are not descended into.
        """
        nodelist = []
        for node in doc:
            # always drop non-content nodes outright
            if node.tag in ['script', 'noscript', 'style', 'option'] or isinstance(node, lxml.html.HtmlComment):
                nodelist.append(node)
                continue
            # leave childless inline/heading nodes alone (no recursion needed)
            if node.tag in ['p', 'span', 'b', 'h1', 'h2', 'h3', 'h4', 'h5'] and len(node) == 0:
                continue;
            # build a lowercase haystack of the identifying attributes
            # NOTE(review): dict.has_key is Python-2-only; this module will
            # not run on Python 3 as written.
            ids = ''
            if node.attrib.has_key('class'):
                ids += ' ' + node.attrib['class'].lower()
            if node.attrib.has_key('id'):
                ids += ' ' + node.attrib['id'].lower()
            if node.attrib.has_key('name'):
                ids += ' ' + node.attrib['name'].lower()
            # last matching good word wins (loop continues to the end)
            good_word = ''
            for word in self.notdel:
                if ids.find(word) >= 0:
                    good_word = word
                    continue
            # first matching bad word wins
            bad_word = ''
            for word in self.todel:
                if ids.find(word) >= 0:
                    bad_word = word
                    break
            # delete when bad with no excuse, or when the bad word itself
            # contains the good word (good match was incidental)
            if (bad_word != '' and good_word == '') or (bad_word != '' and bad_word.find(good_word) >= 0):
                nodelist.append(node)
                continue
            nodelist += self.getNodesToDelete(node)
        return nodelist

    def keepLineBreaks(self, doc):
        """Replace <br> and <p> boundaries with U+FFFC markers.

        The object-replacement character preserves line-break positions in
        extracted text after the tags themselves are dropped.
        """
        items = Parser.getElementsByTag(doc, tag='br')
        for n in items:
            if n.tail is not None:
                n.tail = u'\ufffc ' + n.tail
            else:
                n.tail = u'\ufffc'
            n.drop_tag()
        items = Parser.getElementsByTag(doc, tag='p')
        for n in items:
            if n.tail is not None:
                n.tail = u'\ufffc ' + n.tail
            else:
                n.tail = u'\ufffc'
            # if n.text is None: n.drop_tag() # drop empty p
        return doc

    def removeWrapedLinks(self, e):
        """Return [e] + following siblings when `e` wraps exactly one <a>
        with no surrounding visible text; otherwise []. Recurses along
        next-siblings to gather a whole run of link-only wrappers.
        """
        if e is None or len(e) != 1 or e[0].tag != 'a':
            return []
        # gather all text around the single child link
        text = ''
        if e.text is not None:
            text += e.text
        if e[0].tail is not None:
            text += e[0].tail
        if e.tail is not None:
            text += e.tail
        # any non-whitespace text means this is real content, not a bare link
        if re.search('[^ \t\r\n]', text):
            return []
        toRemove = [e] + self.removeWrapedLinks(Parser.nextSibling(e))
        return toRemove

    def removeListsWithLinks(self, doc):
        """Remove link-farm structures: <ol>/<ul> lists whose items are
        mostly links, and parents whose children are link-delimited runs.

        NOTE(review): indentation reconstructed from a collapsed source
        line; the nesting of the inner `if fa > 2` / `else: fa = 0` pair
        is the most plausible reading (count of consecutive link-bearing
        <li>s, reset on a non-link item) — verify against upstream.
        """
        for tag in ['ol', 'ul']:
            items = Parser.getElementsByTag(doc, tag=tag)
            for item in items:
                fa = 0  # consecutive <li>s that contain a link
                for li in item:
                    if Parser.getElementsByTag(li, tag='a'):
                        fa += 1
                        if fa > 2:
                            # three link-items in a row: treat the whole
                            # list as navigation and remove it (and a
                            # now-nearly-empty parent)
                            parent = item.getparent()
                            Parser.remove(item)
                            if parent is not None:
                                if len(parent) == 0 or len(Parser.getText(parent).split()) < 4:
                                    Parser.remove(parent)
                            break
                    else:
                        fa = 0
        # second pass: examine parents of anchors and remove elements whose
        # children are separated only by trivial delimiter text
        items = Parser.getElementsByTag(doc, tag='a')
        for a in items:
            e = a.getparent()
            if e is None:
                continue
            text = Parser.getText(e)
            ldels = []      # text fragments found between consecutive links
            textcount = 0   # NOTE(review): never used
            for link in e:
                ltext = Parser.getText(link)
                if link.tag != 'a' and len(ltext) <= 2:
                    continue
                if link.tag != 'a' and len(ltext) > 2:
                    # substantial non-link child: this is real content, abort
                    ldels = []
                    break
                if ltext == '':
                    continue
                # split off the text preceding this link's text
                ldel = text.split(ltext, 1)
                ld = ldel[0].strip()
                ldels.append(ld)
                if len(ldel) == 1:
                    break
                text = ldel[1]
            if len(ldels) == 0 or ldels[0] == ',':
                continue
            else:
                del ldels[0]
            # flag  : delimiters equal to the first one (repetition)
            # flag1 : some delimiter is long or contains a comma (sentence-like)
            # flag2 : any non-empty delimiter exists
            # flag3 : any delimiter longer than one char
            flag = 0; flag1 = 0; flag2 = 0; flag3 = 0
            for ldel in ldels:
                if ldel == ldels[0]:
                    flag += 1
                if len(ldel) > 3 or ldel.find(',') >= 0:
                    flag1 = 1
                if ldel != '':
                    flag2 = 1
                if len(ldel) > 1:
                    flag3 = 1
            # heuristics: uniform/empty/short delimiters between many links
            # indicate a navigation block -> remove the parent element
            if flag2 == 0 and len(ldels) > 1:
                Parser.remove(e)
                continue
            if len(ldels) == 2 and ldels[0] == '|' and ldels[1] == '|':
                Parser.remove(e)
                continue
            if len(ldels) > 3 and flag3 == 0:
                Parser.remove(e)
                continue
            if flag <= 2 and (len(ldels) <= 2 or flag1 != 0):
                continue
            Parser.remove(e)
        return doc
        # NOTE(review): everything below this return is unreachable as
        # written (a second pass removing runs of wrapped links via
        # removeWrapedLinks, ending in a second `return doc`). Either the
        # return above is misplaced or this is abandoned code — confirm
        # intent before deleting.
        items = Parser.getElementsByTag(doc, tag='a')
        for a in items:
            e = a.getparent()
            if e is None:
                continue
            if len(e) == 1:
                toRemove = self.removeWrapedLinks(e)
                if len(toRemove) > 2:
                    for bn in toRemove:
                        Parser.remove(bn)
        return doc

    def dropTags(self, doc, tags):
        """Unwrap (drop_tag) each element whose tag is in `tags`, keeping
        its children/text — unless it contains an <img>, which is kept
        wrapped so the emphasis markup around images survives.
        """
        for tag in tags:
            ems = Parser.getElementsByTag(doc, tag=tag)
            for node in ems:
                images = Parser.getElementsByTag(node, tag='img')
                if len(images) == 0:
                    node.drop_tag()
        return doc

    def removeDropCaps(self, doc):
        """Unwrap decorative drop-cap spans so their letter joins the
        surrounding paragraph text."""
        items = doc.cssselect("span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            item.drop_tag()
        return doc

    def removeNodesViaRegEx(self, doc, pattern):
        """Remove every node whose id or class matches `pattern`
        (case-insensitive, via the EXSLT re:test XPath extension)."""
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughtyList = doc.xpath(reg, namespaces={'re': self.regexpNS})
            for node in naughtyList:
                Parser.remove(node)
        return doc

    def cleanUpSpanTagsInParagraphs(self, doc):
        """Unwrap <span>s that are direct children of <p>, merging their
        text into the paragraph."""
        spans = doc.cssselect('p > span')
        for item in spans:
            item.drop_tag()
        return doc

    def getFlushedBuffer(self, replacementText, doc):
        # Wrap accumulated text/HTML into a paragraph node; `doc` unused.
        return Parser.textToPara(replacementText)

    def getReplacementNodes(self, doc, div):
        """Normalize the children of `div`: loose text nodes (plus their
        immediately adjacent <a> siblings) are merged into synthetic <p>
        nodes; element children pass through in order. Returns the new
        child list; absorbed anchors are detached from the tree.
        """
        replacementText = []    # pending buffer for the next synthetic <p>
        nodesToReturn = []      # final ordered child list
        nodesToRemove = []      # anchors folded into the buffer
        childs = Parser.childNodesWithText(div)
        for kid in childs:
            # node is a p
            # and already have some replacement text
            if Parser.getTag(kid) == 'p' and len(replacementText) > 0:
                newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
                nodesToReturn.append(kid)
            # node is a text node
            elif Parser.isTextNode(kid):
                kidTextNode = kid
                kidText = Parser.getText(kid)
                replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
                # any surviving text counts (threshold 0 here vs 1 in the
                # sibling cleaner variant above)
                if(len(replaceText)) > 0:
                    # absorb the run of preceding unconsumed <a> siblings
                    prevSibNode = Parser.previousSibling(kidTextNode)
                    while prevSibNode is not None \
                        and Parser.getTag(prevSibNode) == "a" \
                        and Parser.getAttribute(prevSibNode, 'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(prevSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(prevSibNode)
                        # mark so an anchor is never absorbed twice
                        Parser.setAttribute(prevSibNode, attr='grv-usedalready', value='yes')
                        prevSibNode = Parser.previousSibling(prevSibNode)
                    # append replaceText
                    replacementText.append(replaceText)
                    #
                    # absorb the run of following unconsumed <a> siblings
                    nextSibNode = Parser.nextSibling(kidTextNode)
                    while nextSibNode is not None \
                        and Parser.getTag(nextSibNode) == "a" \
                        and Parser.getAttribute(nextSibNode, 'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(nextSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(nextSibNode)
                        Parser.setAttribute(nextSibNode, attr='grv-usedalready', value='yes')
                        # NOTE(review): assigns prevSibNode instead of
                        # nextSibNode, so this loop terminates after one
                        # anchor (its 'yes' marker fails the condition).
                        # Looks like a copy/paste bug from the backward
                        # loop — confirm against upstream goose.
                        prevSibNode = Parser.nextSibling(nextSibNode)
            # otherwise
            else:
                # skip anchors already folded into a synthetic paragraph
                if Parser.getTag(kid) == "a" and Parser.getAttribute(kid, 'grv-usedalready') == 'yes':
                    continue
                # flush pending text before emitting the element child
                if(len(replacementText) > 0):
                    newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                    nodesToReturn.append(newNode)
                    replacementText = []
                nodesToReturn.append(kid)
        # flush out anything still remaining
        if(len(replacementText) > 0):
            newNode = self.getFlushedBuffer(''.join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
        for n in nodesToRemove:
            Parser.remove(n)
        return nodesToReturn

    def replaceElementsWithPara(self, doc, div):
        # Re-tag `div` in place as a <p>; `doc` unused, kept for symmetry.
        Parser.replaceTag(div, 'p')

    def convertDivsToParagraphs(self, doc, domType):
        """Convert <domType> wrappers into paragraphs (see div_to_para in
        the cleaner above). This variant preserves the element's tail text
        across the child rebuild, since lxml's clear() wipes .tail too.
        """
        badDivs = 0     # counters kept from upstream; not used for control flow
        elseDivs = 0
        divs = Parser.getElementsByTag(doc, tag=domType)
        # block-level tags whose presence means the element is "structural"
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre', 'table', 'ul']
        for div in divs:
            items = Parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                # no block children: the element is really a paragraph
                self.replaceElementsWithPara(doc, div)
                badDivs += 1
            elif div is not None:
                # mixed content: rebuild children from the normalized list
                replaceNodes = self.getReplacementNodes(doc, div)
                text = div.tail      # clear() would drop the tail; save it
                div.clear()
                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                div.tail = text
                elseDivs += 1
        return doc