from goose.utils import ReplaceSequence

# ARTICLE_ROOT_TAGS and BLOCK_ELEMENT_TAGS are module-level tag lists
# defined elsewhere in this revision.


class DocumentCleaner(object):

    def __init__(self, config, article):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = article
        # nodes to remove regexp
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|"
            "navbar|storytopbar-bucket|utility-bar|inline-share-tools"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt"
            "|^links$|meta$|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies")
        self.regexp_namespace = "http://exslt.org/regular-expressions"
        self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_classes_re = "//*[re:test(@class, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_names_re = "//*[re:test(@name, '%s', 'i')]" % self.remove_nodes_re
        self.div_to_p_re = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.caption_re = "^caption$"
        self.google_re = " google "
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_braodcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        self.tablines_replacements = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")

    def clean(self):
        doc_to_clean = self.article.doc
        doc_to_clean = self.clean_body_classes(doc_to_clean)
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.clean_em_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.caption_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.google_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.entries_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean,
                                               self.facebook_braodcasting_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.twitter_re)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        doc_to_clean = self.convert_wanted_tags_to_paragraphs(
            doc_to_clean, ARTICLE_ROOT_TAGS)
        return doc_to_clean

    def clean_body_classes(self, doc):
        # we don't need body classes: if the body class happened to match
        # an unwanted pattern, the whole document would be emptied
        elements = self.parser.getElementsByTag(doc, tag="body")
        if elements:
            self.parser.delAttribute(elements[0], attr="class")
        return doc

    def clean_article_tags(self, doc):
        articles = self.parser.getElementsByTag(doc, tag='article')
        for article in articles:
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def clean_em_tags(self, doc):
        ems = self.parser.getElementsByTag(doc, tag='em')
        for node in ems:
            images = self.parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                self.parser.drop_tag(node)
        return doc

    def remove_drop_caps(self, doc):
        items = self.parser.css_select(
            doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            self.parser.drop_tag(item)
        return doc

    def remove_scripts_styles(self, doc):
        # remove scripts
        scripts = self.parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            self.parser.remove(item)
        # remove styles
        styles = self.parser.getElementsByTag(doc, tag='style')
        for item in styles:
            self.parser.remove(item)
        # remove comments
        comments = self.parser.getComments(doc)
        for item in comments:
            self.parser.remove(item)
        return doc

    def clean_bad_tags(self, doc):
        # ids
        naughty_list = self.parser.xpath_re(doc, self.nauthy_ids_re)
        for node in naughty_list:
            self.parser.remove(node)
        # class
        naughty_classes = self.parser.xpath_re(doc, self.nauthy_classes_re)
        for node in naughty_classes:
            self.parser.remove(node)
        # name
        naughty_names = self.parser.xpath_re(doc, self.nauthy_names_re)
        for node in naughty_names:
            self.parser.remove(node)
        return doc

    def remove_nodes_regex(self, doc, pattern):
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughty_list = self.parser.xpath_re(doc, reg)
            for node in naughty_list:
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        spans = self.parser.css_select(doc, 'p span')
        for item in spans:
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        return self.parser.textToPara(replacement_text)

    def get_replacement_nodes(self, doc, div):
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        childs = self.parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                newNode = self.get_flushed_buffer(''.join(replacement_text), doc)
                nodes_to_return.append(newNode)
                replacement_text = []
                nodes_to_return.append(kid)
            # node is a text node
            elif self.parser.isTextNode(kid):
                kid_text_node = kid
                kid_text = self.parser.getText(kid)
                replace_text = self.tablines_replacements.replaceAll(kid_text)
                if len(replace_text) > 1:
                    # absorb any preceding <a> siblings not yet used
                    previous_sibling_node = self.parser.previousSibling(kid_text_node)
                    while previous_sibling_node is not None \
                            and self.parser.getTag(previous_sibling_node) == "a" \
                            and self.parser.getAttribute(previous_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(previous_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(previous_sibling_node)
                        self.parser.setAttribute(previous_sibling_node,
                                                 attr='grv-usedalready', value='yes')
                        previous_sibling_node = self.parser.previousSibling(
                            previous_sibling_node)
                    # append replace_text
                    replacement_text.append(replace_text)
                    # absorb any following <a> siblings not yet used
                    next_sibling_node = self.parser.nextSibling(kid_text_node)
                    while next_sibling_node is not None \
                            and self.parser.getTag(next_sibling_node) == "a" \
                            and self.parser.getAttribute(next_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(next_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(next_sibling_node)
                        self.parser.setAttribute(next_sibling_node,
                                                 attr='grv-usedalready', value='yes')
                        # advance the next-sibling cursor (the original
                        # mistakenly reassigned previous_sibling_node here)
                        next_sibling_node = self.parser.nextSibling(next_sibling_node)
            # otherwise
            else:
                nodes_to_return.append(kid)
        # flush out anything still remaining
        if len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
        for n in nodes_to_remove:
            self.parser.remove(n)
        return nodes_to_return

    def replace_with_para(self, doc, div):
        self.parser.replaceTag(div, 'p')

    def convert_wanted_tags_to_paragraphs(self, doc, wanted_tags):
        selected = self.parser.getElementsByTags(doc, wanted_tags)
        for elem in selected:
            if not self.parser.getElementsByTags(elem, BLOCK_ELEMENT_TAGS):
                self.replace_with_para(doc, elem)
            else:
                replaceNodes = self.get_replacement_nodes(doc, elem)
                elem.clear()
                for c, n in enumerate(replaceNodes):
                    elem.insert(c, n)
        return doc
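# Minimal self-contained sketch of the tag-replacement primitive behind
# replace_with_para() above. goose routes it through parser.replaceTag();
# with the lxml backend it amounts to assigning .tag. Markup is illustrative.
import lxml.html

node = lxml.html.fromstring('<div>plain text block</div>')
node.tag = 'p'  # a node with no block-level children becomes a paragraph
print(lxml.html.tostring(node))  # "<p>plain text block</p>"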
from goose.utils import ReplaceSequence


class DocumentCleaner(object):

    def __init__(self, config):
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt"
            "|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies"
        )
        self.regexp_namespace = "http://exslt.org/regular-expressions"
        self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_classes_re = "//*[re:test(@class, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_names_re = "//*[re:test(@name, '%s', 'i')]" % self.remove_nodes_re
        self.div_to_p_re = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.caption_re = "^caption$"
        self.google_re = " google "
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_braodcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        self.tablines_replacements = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")

    def clean(self, article):
        doc_to_clean = article.doc
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.clean_em_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.caption_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.google_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.entries_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean,
                                               self.facebook_braodcasting_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.twitter_re)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        doc_to_clean = self.div_to_para(doc_to_clean, 'div')
        doc_to_clean = self.div_to_para(doc_to_clean, 'span')
        return doc_to_clean

    def clean_article_tags(self, doc):
        articles = self.parser.getElementsByTag(doc, tag='article')
        for article in articles:
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def clean_em_tags(self, doc):
        ems = self.parser.getElementsByTag(doc, tag='em')
        for node in ems:
            images = self.parser.getElementsByTag(node, tag='img')
            if len(images) == 0:
                self.parser.drop_tag(node)
        return doc

    def remove_drop_caps(self, doc):
        items = self.parser.css_select(
            doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            self.parser.drop_tag(item)
        return doc

    def remove_scripts_styles(self, doc):
        # remove scripts
        scripts = self.parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            self.parser.remove(item)
        # remove styles
        styles = self.parser.getElementsByTag(doc, tag='style')
        for item in styles:
            self.parser.remove(item)
        # remove comments
        comments = self.parser.getComments(doc)
        for item in comments:
            self.parser.remove(item)
        return doc

    def clean_bad_tags(self, doc):
        # ids
        naughty_list = self.parser.xpath_re(doc, self.nauthy_ids_re)
        for node in naughty_list:
            self.parser.remove(node)
        # class
        naughty_classes = self.parser.xpath_re(doc, self.nauthy_classes_re)
        for node in naughty_classes:
            self.parser.remove(node)
        # name
        naughty_names = self.parser.xpath_re(doc, self.nauthy_names_re)
        for node in naughty_names:
            self.parser.remove(node)
        return doc

    def remove_nodes_regex(self, doc, pattern):
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughty_list = self.parser.xpath_re(doc, reg)
            for node in naughty_list:
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        spans = self.parser.css_select(doc, 'p > span')
        for item in spans:
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        return self.parser.textToPara(replacement_text)

    def get_replacement_nodes(self, doc, div):
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        childs = self.parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                newNode = self.get_flushed_buffer(''.join(replacement_text), doc)
                nodes_to_return.append(newNode)
                replacement_text = []
                nodes_to_return.append(kid)
            # node is a text node
            elif self.parser.isTextNode(kid):
                kid_text_node = kid
                kid_text = self.parser.getText(kid)
                replace_text = self.tablines_replacements.replaceAll(kid_text)
                if len(replace_text) > 1:
                    # absorb any preceding <a> siblings not yet used
                    previous_sibling_node = self.parser.previousSibling(kid_text_node)
                    while previous_sibling_node is not None \
                            and self.parser.getTag(previous_sibling_node) == "a" \
                            and self.parser.getAttribute(previous_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(previous_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(previous_sibling_node)
                        self.parser.setAttribute(previous_sibling_node,
                                                 attr='grv-usedalready', value='yes')
                        previous_sibling_node = self.parser.previousSibling(
                            previous_sibling_node)
                    # append replace_text
                    replacement_text.append(replace_text)
                    # absorb any following <a> siblings not yet used
                    next_sibling_node = self.parser.nextSibling(kid_text_node)
                    while next_sibling_node is not None \
                            and self.parser.getTag(next_sibling_node) == "a" \
                            and self.parser.getAttribute(next_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(next_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(next_sibling_node)
                        self.parser.setAttribute(next_sibling_node,
                                                 attr='grv-usedalready', value='yes')
                        # advance the next-sibling cursor (the original
                        # mistakenly reassigned previous_sibling_node here)
                        next_sibling_node = self.parser.nextSibling(next_sibling_node)
            # otherwise
            else:
                nodes_to_return.append(kid)
        # flush out anything still remaining
        if len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
        for n in nodes_to_remove:
            self.parser.remove(n)
        return nodes_to_return

    def replace_with_para(self, doc, div):
        self.parser.replaceTag(div, 'p')

    def div_to_para(self, doc, dom_type):
        bad_divs = 0
        else_divs = 0
        divs = self.parser.getElementsByTag(doc, tag=dom_type)
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre',
                'table', 'ul']
        for div in divs:
            items = self.parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replace_with_para(doc, div)
                bad_divs += 1
            elif div is not None:
                replaceNodes = self.get_replacement_nodes(doc, div)
                div.clear()
                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                else_divs += 1
        return doc
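# Minimal self-contained sketch of the EXSLT regex mechanism behind the
# nauthy_* XPath queries above (parser.xpath_re wraps this). Uses lxml
# only; the HTML snippet and the '^side' pattern are illustrative.
import lxml.html

REGEXP_NS = "http://exslt.org/regular-expressions"
doc = lxml.html.fromstring(
    '<body><div id="sidebar">junk</div><div id="story">keep</div></body>')
# case-insensitive regex test on @id, same shape as nauthy_ids_re
for node in doc.xpath("//*[re:test(@id, '^side', 'i')]",
                      namespaces={"re": REGEXP_NS}):
    node.getparent().remove(node)
print(lxml.html.tostring(doc))  # only the "story" div remains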
See the License for the specific language governing permissions and
limitations under the License.
"""

import re
from copy import deepcopy
from urlparse import urlparse, urljoin

from goose.utils import StringSplitter
from goose.utils import StringReplacement
from goose.utils import ReplaceSequence
from goose.text import StopWords
from goose.parsers import Parser

MOTLEY_REPLACEMENT = StringReplacement("�", "")
ESCAPED_FRAGMENT_REPLACEMENT = StringReplacement(u"#!", u"?_escaped_fragment_=")
TITLE_REPLACEMENTS = ReplaceSequence().create(u"&raquo;").append(u"»")
PIPE_SPLITTER = StringSplitter("\\|")
DASH_SPLITTER = StringSplitter(" - ")
ARROWS_SPLITTER = StringSplitter("»")
COLON_SPLITTER = StringSplitter(":")
SPACE_SPLITTER = StringSplitter(' ')
NO_STRINGS = set()
# TODO
# A_REL_TAG_SELECTOR = "a[rel=tag], a[href*=/tag/]"
A_REL_TAG_SELECTOR = "a[rel=tag]"
RE_LANG = r'^[A-Za-z]{2}$'


class ContentExtractor(object):

    def __init__(self, config):
        self.config = config
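# Plain-Python sketch of what ESCAPED_FRAGMENT_REPLACEMENT does for
# AJAX-crawling URLs (goose's StringReplacement behaves roughly like
# str.replace here); the URL is illustrative.
url = u"http://example.com/page#!section=1"
print(url.replace(u"#!", u"?_escaped_fragment_="))
# http://example.com/page?_escaped_fragment_=section=1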
from goose.utils import ReplaceSequence
from goose.parsers import Parser


class DocumentCleaner(object):

    def __init__(self):
        self.regExRemoveNodes = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|source|legende|ajoutVideo|timestamp"
        )
        self.regexpNS = "http://exslt.org/regular-expressions"
        self.queryNaughtyIDs = "//*[re:test(@id, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyClasses = "//*[re:test(@class, '%s', 'i')]" % self.regExRemoveNodes
        self.queryNaughtyNames = "//*[re:test(@name, '%s', 'i')]" % self.regExRemoveNodes
        self.divToPElementsPattern = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.captionPattern = "^caption$"
        self.googlePattern = " google "
        self.entriesPattern = "^[^entry-]more.*$"
        self.facebookPattern = "[^-]facebook"
        self.facebookBroadcastingPattern = "facebook-broadcasting"
        self.twitterPattern = "[^-]twitter"
        self.tabsAndNewLinesReplcesments = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")

    def clean(self, article):
        docToClean = article.doc
        docToClean = self.cleanEmTags(docToClean)
        docToClean = self.removeDropCaps(docToClean)
        docToClean = self.removeScriptsAndStyles(docToClean)
        docToClean = self.cleanBadTags(docToClean)
        docToClean = self.removeNodesViaRegEx(docToClean, self.captionPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.googlePattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.entriesPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookPattern)
        docToClean = self.removeNodesViaRegEx(docToClean,
                                              self.facebookBroadcastingPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.twitterPattern)
        docToClean = self.cleanUpSpanTagsInParagraphs(docToClean)
        docToClean = self.convertDivsToParagraphs(docToClean, "div")
        docToClean = self.convertDivsToParagraphs(docToClean, "span")
        return docToClean

    def cleanEmTags(self, doc):
        ems = Parser.getElementsByTag(doc, tag="em")
        for node in ems:
            images = Parser.getElementsByTag(node, tag="img")
            if len(images) == 0:
                node.drop_tag()
        return doc

    def removeDropCaps(self, doc):
        items = doc.cssselect("span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            item.drop_tag()
        return doc

    def removeScriptsAndStyles(self, doc):
        # remove scripts
        scripts = Parser.getElementsByTag(doc, tag="script")
        for item in scripts:
            Parser.remove(item)
        # remove styles
        styles = Parser.getElementsByTag(doc, tag="style")
        for item in styles:
            Parser.remove(item)
        # remove comments
        comments = Parser.getComments(doc)
        for item in comments:
            Parser.remove(item)
        return doc

    def cleanBadTags(self, doc):
        # ids
        naughtyList = doc.xpath(self.queryNaughtyIDs,
                                namespaces={"re": self.regexpNS})
        for node in naughtyList:
            Parser.remove(node)
        # class
        naughtyClasses = doc.xpath(self.queryNaughtyClasses,
                                   namespaces={"re": self.regexpNS})
        for node in naughtyClasses:
            Parser.remove(node)
        # name
        naughtyNames = doc.xpath(self.queryNaughtyNames,
                                 namespaces={"re": self.regexpNS})
        for node in naughtyNames:
            Parser.remove(node)
        return doc

    def removeNodesViaRegEx(self, doc, pattern):
        for selector in ["id", "class"]:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughtyList = doc.xpath(reg, namespaces={"re": self.regexpNS})
            for node in naughtyList:
                Parser.remove(node)
        return doc

    def cleanUpSpanTagsInParagraphs(self, doc):
        spans = doc.cssselect("p > span")
        for item in spans:
            item.drop_tag()
        return doc

    def getFlushedBuffer(self, replacementText, doc):
        return Parser.textToPara(replacementText)

    def getReplacementNodes(self, doc, div):
        replacementText = []
        nodesToReturn = []
        nodesToRemove = []
        childs = Parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text
            if Parser.getTag(kid) == "p" and len(replacementText) > 0:
                newNode = self.getFlushedBuffer("".join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
                nodesToReturn.append(kid)
            # node is a text node
            elif Parser.isTextNode(kid):
                kidTextNode = kid
                kidText = Parser.getText(kid)
                replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
                if len(replaceText) > 1:
                    # absorb any preceding <a> siblings not yet used
                    prevSibNode = Parser.previousSibling(kidTextNode)
                    while (
                        prevSibNode is not None
                        and Parser.getTag(prevSibNode) == "a"
                        and Parser.getAttribute(prevSibNode, "grv-usedalready") != "yes"
                    ):
                        outer = " " + Parser.outerHtml(prevSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(prevSibNode)
                        Parser.setAttribute(prevSibNode, attr="grv-usedalready",
                                            value="yes")
                        prevSibNode = Parser.previousSibling(prevSibNode)
                    # append replaceText
                    replacementText.append(replaceText)
                    # absorb any following <a> siblings not yet used
                    nextSibNode = Parser.nextSibling(kidTextNode)
                    while (
                        nextSibNode is not None
                        and Parser.getTag(nextSibNode) == "a"
                        and Parser.getAttribute(nextSibNode, "grv-usedalready") != "yes"
                    ):
                        outer = " " + Parser.outerHtml(nextSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(nextSibNode)
                        Parser.setAttribute(nextSibNode, attr="grv-usedalready",
                                            value="yes")
                        # advance the next-sibling cursor (the original
                        # mistakenly reassigned prevSibNode here)
                        nextSibNode = Parser.nextSibling(nextSibNode)
            # otherwise
            else:
                nodesToReturn.append(kid)
        # flush out anything still remaining
        if len(replacementText) > 0:
            newNode = self.getFlushedBuffer("".join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
        for n in nodesToRemove:
            Parser.remove(n)
        return nodesToReturn

    def replaceElementsWithPara(self, doc, div):
        Parser.replaceTag(div, "p")

    def convertDivsToParagraphs(self, doc, domType):
        badDivs = 0
        elseDivs = 0
        divs = Parser.getElementsByTag(doc, tag=domType)
        tags = ["a", "blockquote", "dl", "div", "img", "ol", "p", "pre",
                "table", "ul"]
        for div in divs:
            items = Parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replaceElementsWithPara(doc, div)
                badDivs += 1
            elif div is not None:
                replaceNodes = self.getReplacementNodes(doc, div)
                div.clear()
                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                elseDivs += 1
        return doc
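# Self-contained sketch of lxml's drop_tag(), the primitive behind
# cleanEmTags()/removeDropCaps() above: the element is deleted but its
# text and children are merged back into the parent. Snippet is
# illustrative.
import lxml.html

p = lxml.html.fromstring('<p>An <em>emphasised</em> word</p>')
for em in p.findall('.//em'):
    em.drop_tag()
print(lxml.html.tostring(p))  # "<p>An emphasised word</p>"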
from goose.utils import ReplaceSequence

# HostUtils and _Const are helpers specific to this fork, defined
# elsewhere in the codebase.


class DocumentCleaner(object):

    def __init__(self, config, article):
        # config
        self.config = config
        # parser
        self.parser = self.config.get_parser()
        # article
        self.article = article
        # nodes to remove regexp
        self.remove_nodes_re = (
            "^side$|combx|retweet|mediaarticlerelated|menucontainer|"
            "navbar|storytopbar-bucket|utility-bar|inline-share-tools"
            "|comment|PopularQuestions|contact|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|cnn_html_slideshow|cnn_strylftcntnt"
            "|^links$|meta$|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings"
            "|date|^print$|popup|author-dropdown|tools|socialtools|byline"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|legende|ajoutVideo|timestamp|js_replies|printfriendly|share"
        )
        # dailymail remove nodes
        self.remove_nodes_re += "|related-carousel|xwv-related-videos-container"
        # nytimes remove nodes
        self.remove_nodes_re += "|visually-hidden|robots-nocontent"
        # *.wikipedia.org
        self.remove_nodes_re += "|mw-editsection|^cite_ref|noprint|References|siteSub"
        self.remove_nodes_re += "|collapsed|mw-headline-anchor|filetoc|noviewer"
        # *.wiktionary.org
        self.remove_nodes_re += "|ib-brac"
        # *.wikibooks.org
        self.remove_nodes_re += "|status-icon"
        # www.wikidata.org
        self.remove_nodes_re += "|wikibase-edittoolbar-container"
        # http://www.dailymail.co.uk/news/article-2742786/Complacent-Home-Office-loses-175-000-illegal-immigrants-Fresh-humiliation-officials-admit-went-missing-refused-permission-stay.html
        self.remove_nodes_re += "|most-read-news-wrapper|most-watched-videos-wrapper"
        self.regexp_namespace = "http://exslt.org/regular-expressions"
        self.nauthy_ids_re = "//*[re:test(@id, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_classes_re = "//*[re:test(@class, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_names_re = "//*[re:test(@name, '%s', 'i')]" % self.remove_nodes_re
        self.nauthy_tags = ["noscript"]
        self.google_re = " google "
        self.entries_re = "^[^entry-]more.*$"
        self.facebook_re = "[^-]facebook"
        self.facebook_braodcasting_re = "facebook-broadcasting"
        self.twitter_re = "[^-]twitter"
        self.tablines_replacements = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")

    def set_known_host_remove_selectors(self):
        self.known_host_remove_selectors = HostUtils.host_selectors(
            _Const().get_known_host_remove_selectors, self.article.domain)

    def clean(self):
        doc_to_clean = self.article.doc
        doc_to_clean = self.remove_scripts_styles(doc_to_clean)
        # if we have hand-curated selectors for this host, use only those
        self.set_known_host_remove_selectors()
        if self.known_host_remove_selectors:
            return self.remove_host_specific_nodes(doc_to_clean)
        doc_to_clean = self.clean_body_classes(doc_to_clean)
        doc_to_clean = self.clean_article_tags(doc_to_clean)
        doc_to_clean = self.remove_drop_caps(doc_to_clean)
        doc_to_clean = self.clean_bad_tags(doc_to_clean)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.google_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.entries_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.facebook_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean,
                                               self.facebook_braodcasting_re)
        doc_to_clean = self.remove_nodes_regex(doc_to_clean, self.twitter_re)
        doc_to_clean = self.clean_para_spans(doc_to_clean)
        doc_to_clean = self.div_to_para(doc_to_clean, 'div')
        doc_to_clean = self.div_to_para(doc_to_clean, 'span')
        return doc_to_clean

    def clean_body_classes(self, doc):
        # we don't need body classes: if the body class happened to match
        # an unwanted pattern, the whole document would be emptied
        elements = self.parser.getElementsByTag(doc, tag="body")
        if elements:
            self.parser.delAttribute(elements[0], attr="class")
        return doc

    def clean_article_tags(self, doc):
        articles = self.parser.getElementsByTag(doc, tag='article')
        for article in articles:
            for attr in ['id', 'name', 'class']:
                self.parser.delAttribute(article, attr=attr)
        return doc

    def remove_drop_caps(self, doc):
        items = self.parser.css_select(
            doc, "span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            self.parser.drop_tag(item)
        return doc

    def remove_scripts_styles(self, doc):
        # remove scripts
        scripts = self.parser.getElementsByTag(doc, tag='script')
        for item in scripts:
            self.parser.remove(item)
        # remove styles
        styles = self.parser.getElementsByTag(doc, tag='style')
        for item in styles:
            self.parser.remove(item)
        # remove comments
        comments = self.parser.getComments(doc)
        for item in comments:
            self.parser.remove(item)
        return doc

    def clean_bad_tags(self, doc):
        # ids
        naughty_list = self.parser.xpath_re(doc, self.nauthy_ids_re)
        for node in naughty_list:
            self.parser.remove(node)
        # class
        naughty_classes = self.parser.xpath_re(doc, self.nauthy_classes_re)
        for node in naughty_classes:
            self.parser.remove(node)
        # name
        naughty_names = self.parser.xpath_re(doc, self.nauthy_names_re)
        for node in naughty_names:
            self.parser.remove(node)
        for nauthy_tag in self.nauthy_tags:
            nodes = self.parser.getElementsByTag(doc, tag=nauthy_tag)
            for node in nodes:
                images = self.parser.getElementsByTag(node, tag='img')
                if images:
                    # hoist images out of the bad tag so they survive
                    parent = node.getparent()
                    parent_index = parent.index(node)
                    for image in images:
                        parent.insert(parent_index, image)
                else:
                    self.parser.remove(node)
        return doc

    def remove_host_specific_nodes(self, doc):
        nodes = self.parser.css_select(doc, self.known_host_remove_selectors)
        for node in nodes:
            self.parser.remove(node)
        return doc

    def remove_nodes_regex(self, doc, pattern):
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughty_list = self.parser.xpath_re(doc, reg)
            for node in naughty_list:
                self.parser.remove(node)
        return doc

    def clean_para_spans(self, doc):
        spans = self.parser.css_select(doc, 'p span')
        for item in spans:
            self.parser.drop_tag(item)
        return doc

    def get_flushed_buffer(self, replacement_text, doc):
        return self.parser.textToPara(replacement_text)

    def get_replacement_nodes(self, doc, div):
        replacement_text = []
        nodes_to_return = []
        nodes_to_remove = []
        childs = self.parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text
            if self.parser.getTag(kid) == 'p' and len(replacement_text) > 0:
                newNode = self.get_flushed_buffer(''.join(replacement_text), doc)
                nodes_to_return.append(newNode)
                replacement_text = []
                nodes_to_return.append(kid)
            # node is a text node
            elif self.parser.isTextNode(kid):
                kid_text_node = kid
                kid_text = self.parser.getText(kid)
                replace_text = self.tablines_replacements.replaceAll(kid_text)
                if len(replace_text) > 1:
                    # absorb any preceding <a> siblings not yet used
                    previous_sibling_node = self.parser.previousSibling(kid_text_node)
                    while previous_sibling_node is not None \
                            and self.parser.getTag(previous_sibling_node) == "a" \
                            and self.parser.getAttribute(previous_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(previous_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(previous_sibling_node)
                        self.parser.setAttribute(previous_sibling_node,
                                                 attr='grv-usedalready', value='yes')
                        previous_sibling_node = self.parser.previousSibling(
                            previous_sibling_node)
                    # append replace_text
                    replacement_text.append(replace_text)
                    # absorb any following <a> siblings not yet used
                    next_sibling_node = self.parser.nextSibling(kid_text_node)
                    while next_sibling_node is not None \
                            and self.parser.getTag(next_sibling_node) == "a" \
                            and self.parser.getAttribute(next_sibling_node,
                                                         'grv-usedalready') != 'yes':
                        outer = " " + self.parser.outerHtml(next_sibling_node) + " "
                        replacement_text.append(outer)
                        nodes_to_remove.append(next_sibling_node)
                        self.parser.setAttribute(next_sibling_node,
                                                 attr='grv-usedalready', value='yes')
                        # advance the next-sibling cursor (the original
                        # mistakenly reassigned previous_sibling_node here)
                        next_sibling_node = self.parser.nextSibling(next_sibling_node)
            # otherwise
            else:
                nodes_to_return.append(kid)
        # flush out anything still remaining
        if len(replacement_text) > 0:
            new_node = self.get_flushed_buffer(''.join(replacement_text), doc)
            nodes_to_return.append(new_node)
            replacement_text = []
        for n in nodes_to_remove:
            self.parser.remove(n)
        return nodes_to_return

    def replace_with_para(self, doc, div):
        self.parser.replaceTag(div, 'p')

    def div_to_para(self, doc, dom_type):
        bad_divs = 0
        else_divs = 0
        divs = self.parser.getElementsByTag(doc, tag=dom_type)
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre',
                'table', 'ul']
        for div in divs:
            items = self.parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replace_with_para(doc, div)
                bad_divs += 1
            elif div is not None:
                replaceNodes = self.get_replacement_nodes(doc, div)
                for child in self.parser.childNodes(div):
                    div.remove(child)
                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                else_divs += 1
        return doc
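# Self-contained sketch of the host-specific removal step above: a CSS
# selector list (the shape known_host_remove_selectors is expected to
# supply) run through lxml's cssselect (requires the cssselect package).
# The selectors and markup are illustrative.
import lxml.html

doc = lxml.html.fromstring(
    '<body><div class="related-carousel">x</div><p>story</p></body>')
for node in doc.cssselect('div.related-carousel, div.share'):
    node.getparent().remove(node)
print(lxml.html.tostring(doc))  # "<body><p>story</p></body>"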
import re

import lxml.html

from goose.utils import ReplaceSequence
from goose.parsers import Parser


class DocumentCleaner(object):

    def __init__(self):
        self.regExRemoveNodes = (
            "^side$|combx|retweet|fontresize|mediaarticlerelated|menucontainer|navbar"
            "|comment|PopularQuestions|foot|footer|Footer|footnote"
            "|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor"
            "|tags|socialnetworking|socialNetworking|cnnStryHghLght"
            "|cnn_stryspcvbx|^inset$|pagetools|post-attributes"
            "|welcome_form|contentTools2|the_answers|rating"
            "|communitypromo|runaroundLeft|subscribe|vcard|articleheadings|articlead"
            "|date|^print$|popup|author-dropdown|tools|socialtools"
            "|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
            "|source|legende|ajoutVideo|timestamp|menu"
        )
        # words that protect a node from removal even if a bad word matches
        self.regExNotRemoveNodes = "and|no|article|body|column|main|shadow"
        self.regexpNS = "http://exslt.org/regular-expressions"
        self.divToPElementsPattern = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
        self.captionPattern = "^caption$"
        self.googlePattern = " google "
        self.entriesPattern = "^[^entry-]more.*$"
        self.facebookPattern = "[^-]facebook"
        self.facebookBroadcastingPattern = "facebook-broadcasting"
        self.twitterPattern = "[^-]twitter"
        self.tabsAndNewLinesReplcesments = ReplaceSequence()\
            .create("\n", "\n\n")\
            .append("\t")\
            .append("^\\s+$")
        self.todel = self.regExRemoveNodes.lower().split('|')
        self.notdel = self.regExNotRemoveNodes.lower().split('|')

    def clean(self, article):
        docToClean = article.doc
        nodelist = self.getNodesToDelete(docToClean)
        for node in nodelist:
            Parser.remove(node)
        docToClean = self.removeListsWithLinks(docToClean)
        docToClean = self.dropTags(docToClean, ['em', 'strong'])
        docToClean = self.removeDropCaps(docToClean)
        docToClean = self.removeNodesViaRegEx(docToClean, self.captionPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.googlePattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.entriesPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.facebookPattern)
        docToClean = self.removeNodesViaRegEx(docToClean,
                                              self.facebookBroadcastingPattern)
        docToClean = self.removeNodesViaRegEx(docToClean, self.twitterPattern)
        docToClean = self.cleanUpSpanTagsInParagraphs(docToClean)
        docToClean = self.keepLineBreaks(docToClean)
        docToClean = self.convertDivsToParagraphs(docToClean, 'div')
        docToClean = self.convertDivsToParagraphs(docToClean, 'span')
        return docToClean

    def getNodesToDelete(self, doc):
        # recursively collect nodes whose id/class/name match a bad word
        # and no protecting good word
        nodelist = []
        for node in doc:
            if node.tag in ['script', 'noscript', 'style', 'option'] \
                    or isinstance(node, lxml.html.HtmlComment):
                nodelist.append(node)
                continue
            if node.tag in ['p', 'span', 'b', 'h1', 'h2', 'h3', 'h4', 'h5'] \
                    and len(node) == 0:
                continue
            ids = ''
            if 'class' in node.attrib:
                ids += ' ' + node.attrib['class'].lower()
            if 'id' in node.attrib:
                ids += ' ' + node.attrib['id'].lower()
            if 'name' in node.attrib:
                ids += ' ' + node.attrib['name'].lower()
            good_word = ''
            for word in self.notdel:
                if ids.find(word) >= 0:
                    good_word = word
                    continue
            bad_word = ''
            for word in self.todel:
                if ids.find(word) >= 0:
                    bad_word = word
                    break
            if (bad_word != '' and good_word == '') \
                    or (bad_word != '' and bad_word.find(good_word) >= 0):
                nodelist.append(node)
                continue
            nodelist += self.getNodesToDelete(node)
        return nodelist

    def keepLineBreaks(self, doc):
        # mark <br> and <p> boundaries with U+FFFC so line breaks survive
        # the flattening into text
        items = Parser.getElementsByTag(doc, tag='br')
        for n in items:
            if n.tail is not None:
                n.tail = u'\ufffc ' + n.tail
            else:
                n.tail = u'\ufffc'
            n.drop_tag()
        items = Parser.getElementsByTag(doc, tag='p')
        for n in items:
            if n.tail is not None:
                n.tail = u'\ufffc ' + n.tail
            else:
                n.tail = u'\ufffc'
            # if n.text is None:
            #     n.drop_tag()  # drop empty p
        return doc

    def removeWrapedLinks(self, e):
        # collect a run of siblings that each wrap nothing but a link
        if e is None or len(e) != 1 or e[0].tag != 'a':
            return []
        text = ''
        if e.text is not None:
            text += e.text
        if e[0].tail is not None:
            text += e[0].tail
        if e.tail is not None:
            text += e.tail
        if re.search('[^ \t\r\n]', text):
            return []
        toRemove = [e] + self.removeWrapedLinks(Parser.nextSibling(e))
        return toRemove

    def removeListsWithLinks(self, doc):
        # drop <ol>/<ul> lists that hold more than two links (navigation)
        for tag in ['ol', 'ul']:
            items = Parser.getElementsByTag(doc, tag=tag)
            for item in items:
                fa = 0
                for li in item:
                    if Parser.getElementsByTag(li, tag='a'):
                        fa += 1
                    if fa > 2:
                        parent = item.getparent()
                        Parser.remove(item)
                        if parent is not None:
                            if len(parent) == 0 \
                                    or len(Parser.getText(parent).split()) < 4:
                                Parser.remove(parent)
                        break
                else:
                    fa = 0
        # drop containers that are little more than delimiter-separated links
        items = Parser.getElementsByTag(doc, tag='a')
        for a in items:
            e = a.getparent()
            if e is None:
                continue
            text = Parser.getText(e)
            ldels = []
            textcount = 0
            for link in e:
                ltext = Parser.getText(link)
                if link.tag != 'a' and len(ltext) <= 2:
                    continue
                if link.tag != 'a' and len(ltext) > 2:
                    ldels = []
                    break
                if ltext == '':
                    continue
                ldel = text.split(ltext, 1)
                ld = ldel[0].strip()
                ldels.append(ld)
                if len(ldel) == 1:
                    break
                text = ldel[1]
            if len(ldels) == 0 or ldels[0] == ',':
                continue
            else:
                del ldels[0]
            flag = 0
            flag1 = 0
            flag2 = 0
            flag3 = 0
            for ldel in ldels:
                if ldel == ldels[0]:
                    flag += 1
                if len(ldel) > 3 or ldel.find(',') >= 0:
                    flag1 = 1
                if ldel != '':
                    flag2 = 1
                if len(ldel) > 1:
                    flag3 = 1
            if flag2 == 0 and len(ldels) > 1:
                Parser.remove(e)
                continue
            if len(ldels) == 2 and ldels[0] == '|' and ldels[1] == '|':
                Parser.remove(e)
                continue
            if len(ldels) > 3 and flag3 == 0:
                Parser.remove(e)
                continue
            if flag <= 2 and (len(ldels) <= 2 or flag1 != 0):
                continue
            Parser.remove(e)
        # drop runs of elements that wrap nothing but links (a stray
        # early return in the original made this pass unreachable)
        items = Parser.getElementsByTag(doc, tag='a')
        for a in items:
            e = a.getparent()
            if e is None:
                continue
            if len(e) == 1:
                toRemove = self.removeWrapedLinks(e)
                if len(toRemove) > 2:
                    for bn in toRemove:
                        Parser.remove(bn)
        return doc

    def dropTags(self, doc, tags):
        for tag in tags:
            ems = Parser.getElementsByTag(doc, tag=tag)
            for node in ems:
                images = Parser.getElementsByTag(node, tag='img')
                if len(images) == 0:
                    node.drop_tag()
        return doc

    def removeDropCaps(self, doc):
        items = doc.cssselect("span[class~=dropcap], span[class~=drop_cap]")
        for item in items:
            item.drop_tag()
        return doc

    def removeNodesViaRegEx(self, doc, pattern):
        for selector in ['id', 'class']:
            reg = "//*[re:test(@%s, '%s', 'i')]" % (selector, pattern)
            naughtyList = doc.xpath(reg, namespaces={'re': self.regexpNS})
            for node in naughtyList:
                Parser.remove(node)
        return doc

    def cleanUpSpanTagsInParagraphs(self, doc):
        spans = doc.cssselect('p > span')
        for item in spans:
            item.drop_tag()
        return doc

    def getFlushedBuffer(self, replacementText, doc):
        return Parser.textToPara(replacementText)

    def getReplacementNodes(self, doc, div):
        replacementText = []
        nodesToReturn = []
        nodesToRemove = []
        childs = Parser.childNodesWithText(div)
        for kid in childs:
            # node is a p and we already have some replacement text
            if Parser.getTag(kid) == 'p' and len(replacementText) > 0:
                newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                nodesToReturn.append(newNode)
                replacementText = []
                nodesToReturn.append(kid)
            # node is a text node
            elif Parser.isTextNode(kid):
                kidTextNode = kid
                kidText = Parser.getText(kid)
                replaceText = self.tabsAndNewLinesReplcesments.replaceAll(kidText)
                if len(replaceText) > 0:
                    # absorb any preceding <a> siblings not yet used
                    prevSibNode = Parser.previousSibling(kidTextNode)
                    while prevSibNode is not None \
                            and Parser.getTag(prevSibNode) == "a" \
                            and Parser.getAttribute(prevSibNode, 'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(prevSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(prevSibNode)
                        Parser.setAttribute(prevSibNode, attr='grv-usedalready',
                                            value='yes')
                        prevSibNode = Parser.previousSibling(prevSibNode)
                    # append replaceText
                    replacementText.append(replaceText)
                    # absorb any following <a> siblings not yet used
                    nextSibNode = Parser.nextSibling(kidTextNode)
                    while nextSibNode is not None \
                            and Parser.getTag(nextSibNode) == "a" \
                            and Parser.getAttribute(nextSibNode, 'grv-usedalready') != 'yes':
                        outer = " " + Parser.outerHtml(nextSibNode) + " "
                        replacementText.append(outer)
                        nodesToRemove.append(nextSibNode)
                        Parser.setAttribute(nextSibNode, attr='grv-usedalready',
                                            value='yes')
                        # advance the next-sibling cursor (the original
                        # mistakenly assigned to prevSibNode here)
                        nextSibNode = Parser.nextSibling(nextSibNode)
            # otherwise
            else:
                if Parser.getTag(kid) == "a" \
                        and Parser.getAttribute(kid, 'grv-usedalready') == 'yes':
                    continue
                if len(replacementText) > 0:
                    newNode = self.getFlushedBuffer(''.join(replacementText), doc)
                    nodesToReturn.append(newNode)
                    replacementText = []
                nodesToReturn.append(kid)
        # flush out anything still remaining
        if len(replacementText) > 0:
            newNode = self.getFlushedBuffer(''.join(replacementText), doc)
            nodesToReturn.append(newNode)
            replacementText = []
        for n in nodesToRemove:
            Parser.remove(n)
        return nodesToReturn

    def replaceElementsWithPara(self, doc, div):
        Parser.replaceTag(div, 'p')

    def convertDivsToParagraphs(self, doc, domType):
        badDivs = 0
        elseDivs = 0
        divs = Parser.getElementsByTag(doc, tag=domType)
        tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre',
                'table', 'ul']
        for div in divs:
            items = Parser.getElementsByTags(div, tags)
            if div is not None and len(items) == 0:
                self.replaceElementsWithPara(doc, div)
                badDivs += 1
            elif div is not None:
                replaceNodes = self.getReplacementNodes(doc, div)
                # preserve the tail text that div.clear() would wipe
                text = div.tail
                div.clear()
                for c, n in enumerate(replaceNodes):
                    div.insert(c, n)
                div.tail = text
                elseDivs += 1
        return doc
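# Sketch of the keepLineBreaks() trick above: mark each <br> with the
# object-replacement character (U+FFFC) in its tail, then drop_tag() so
# the marker survives flattening into text. Markup is illustrative.
import lxml.html

p = lxml.html.fromstring('<p>line one<br>line two</p>')
for br in p.findall('.//br'):
    br.tail = (u'\ufffc ' + br.tail) if br.tail else u'\ufffc'
    br.drop_tag()
print(repr(p.text_content()))  # u'line one\ufffc line two'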