def get_body(doc):
    """Return the document body as unicode HTML with attributes stripped.

    Drops every ``<script>``, ``<link>`` and ``<style>`` element from
    *doc* in place, serializes ``doc.body`` (or the whole document when
    there is no body), and runs the markup through ``clean_attributes``.
    Falls back to the un-cleaned markup if cleansing raises.

    :param doc: parsed lxml document (mutated in place).
    :returns: unicode HTML string.
    """
    # Plain loop instead of a side-effect-only list comprehension:
    # the comprehension built and discarded a throwaway list.
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    raw_html = unicode(tostring(doc.body or doc))
    cleaned = clean_attributes(raw_html)
    try:
        # BeautifulSoup(cleaned)  # FIXME do we really need to try loading it?
        return cleaned
    except Exception:  # FIXME find the equivalent lxml error
        # NOTE(review): as written nothing inside the try can raise, so
        # this fallback is currently unreachable — kept for the FIXME.
        return raw_html
def sanitize(node, candidates, options):
    """Remove low-value elements from *node* and return its cleaned HTML.

    Headers with a negative class weight or link density above 0.33 are
    dropped, as are all ``<form>``/``<iframe>``/``<textarea>`` elements.
    ``<table>``/``<ul>``/``<div>`` elements are then removed heuristically
    based on their content score, class weight, link density and the mix
    of child elements they contain.

    :param node: lxml element tree to clean (mutated in place).
    :param candidates: mapping of element -> dict with a 'content_score'.
    :param options: dict; reads ``options['min_text_length']``.
    :returns: unicode HTML of the cleaned tree with attributes stripped.
    """
    for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if class_weight(header) < 0 or get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            logging.debug("Cleaned %s with score %6.3f and weight %-3s" %
                (describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            # Bias heavily against lists; see the <li> vs <p> test below.
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                # Inherit the parent's score for the removal log message.
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0

            to_remove = False
            reason = ""
            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < options['min_text_length'] and (counts["img"] == 0 or counts["img"] > 2):
                reason = "too short content length %s without a single image" % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True

            # Find x non-empty preceding and succeeding siblings: an
            # element surrounded by substantial text is spared.
            # BUGFIX: counters previously used "=+ 1" (assigning +1)
            # instead of "+= 1"; equivalent only while x == 1, but wrong
            # as soon as x grows.
            i, j = 0, 0
            x = 1
            siblings = []
            for sib in el.itersiblings():
                sib_content_length = text_length(sib)
                if sib_content_length:
                    i += 1
                    siblings.append(sib_content_length)
                    if i == x:
                        break
            for sib in el.itersiblings(preceding=True):
                sib_content_length = text_length(sib)
                if sib_content_length:
                    j += 1
                    siblings.append(sib_content_length)
                    if j == x:
                        break
            if siblings and sum(siblings) > 1000:
                to_remove = False
                logging.debug("Allowing %s" % describe(el))
                # Whitelist descendants so they survive later iterations.
                for desnode in tags(el, "table", "ul", "div"):
                    allowed[desnode] = True

            if to_remove:
                logging.debug("Cleaned %6.3f %s with weight %s cause it has %s."
                    % (content_score, describe(el), weight, reason))
                el.drop_tree()

    return clean_attributes(tounicode(node))
def get_clean_html(self):
    """Serialize the parsed document and strip its attributes."""
    serialized = tounicode(self.html)
    return clean_attributes(serialized)
def sanitize(self, node, candidates):
    """Remove low-value elements from *node* and return its cleaned HTML.

    Headers with negative class weight or link density above 0.33 are
    dropped, along with all ``<form>``/``<iframe>``/``<textarea>``
    elements.  ``<table>``/``<ul>``/``<div>`` elements are removed
    heuristically from their content score, class weight, link density
    and child-element counts.

    :param node: lxml element tree to clean (mutated in place).
    :param candidates: mapping of element -> dict with a 'content_score'.
    :returns: unicode HTML of the cleaned tree with attributes stripped.
    """
    MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
    for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in self.tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in self.reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = self.class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            self.debug("Cleaned %s with score %6.3f and weight %-3s" %
                (describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            # Bias heavily against lists; see the <li> vs <p> test below.
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = self.get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                # Inherit the parent's score for the removal log message.
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0

            to_remove = False
            reason = ""
            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
                reason = "too short content length %s without a single image" % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True

            # Find x non-empty preceding and succeeding siblings: an
            # element surrounded by substantial text is spared.
            # BUGFIX: counters previously used "=+ 1" (assigning +1)
            # instead of "+= 1"; equivalent only while x == 1, but wrong
            # as soon as x grows.
            i, j = 0, 0
            x = 1
            siblings = []
            for sib in el.itersiblings():
                sib_content_length = text_length(sib)
                if sib_content_length:
                    i += 1
                    siblings.append(sib_content_length)
                    if i == x:
                        break
            for sib in el.itersiblings(preceding=True):
                sib_content_length = text_length(sib)
                if sib_content_length:
                    j += 1
                    siblings.append(sib_content_length)
                    if j == x:
                        break
            if siblings and sum(siblings) > 1000:
                to_remove = False
                self.debug("Allowing %s" % describe(el))
                # Whitelist descendants so they survive later iterations.
                for desnode in self.tags(el, "table", "ul", "div"):
                    allowed[desnode] = True

            if to_remove:
                self.debug("Cleaned %6.3f %s with weight %s cause it has %s."
                    % (content_score, describe(el), weight, reason))
                el.drop_tree()

    # NOTE(review): a trailing no-op loop over every node (its body was a
    # bare `pass` guarding a commented-out attrib reset) was removed here;
    # it only wasted a full tree walk.
    return clean_attributes(tounicode(node))
def get_clean_html(self):
    """Serialize the document via the HTML serializer and strip attributes."""
    markup = tounicode(self.html, method='html')
    return clean_attributes(markup)
def get_clean_article(self):
    """Return the extracted article HTML with attributes removed."""
    rendered = tounicode(self.article)
    return clean_attributes(rendered)
def sanitize(self, node, candidates):
    """Remove low-value elements from *node* and return its cleaned HTML.

    Headers with negative class weight or link density above 0.33 are
    dropped, along with all ``<form>``/``<iframe>``/``<textarea>``
    elements.  ``<table>``/``<ul>``/``<div>`` elements are removed
    heuristically from their content score, class weight, link density
    and child-element counts.

    :param node: lxml element tree to clean (mutated in place).
    :param candidates: mapping of element -> dict with a 'content_score'.
    :returns: unicode HTML of the cleaned tree with attributes stripped.
    """
    MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
    for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if self.class_weight(header) < 0 or self.get_link_density(
                header) > 0.33:
            header.drop_tree()

    for elem in self.tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in self.reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = self.class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            self.debug("Cleaned %s with score %6.3f and weight %-3s" % (
                describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            # Bias heavily against lists; see the <li> vs <p> test below.
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = self.get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                # Inherit the parent's score for the removal log message.
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0

            to_remove = False
            reason = ""
            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
                reason = "too short content length %s without a single image" % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True

            # Find x non-empty preceding and succeeding siblings: an
            # element surrounded by substantial text is spared.
            # BUGFIX: counters previously used "= +1" (unary plus — it
            # just assigned 1) instead of "+= 1"; equivalent only while
            # x == 1, but wrong as soon as x grows.
            i, j = 0, 0
            x = 1
            siblings = []
            for sib in el.itersiblings():
                sib_content_length = text_length(sib)
                if sib_content_length:
                    i += 1
                    siblings.append(sib_content_length)
                    if i == x:
                        break
            for sib in el.itersiblings(preceding=True):
                sib_content_length = text_length(sib)
                if sib_content_length:
                    j += 1
                    siblings.append(sib_content_length)
                    if j == x:
                        break
            if siblings and sum(siblings) > 1000:
                to_remove = False
                self.debug("Allowing %s" % describe(el))
                # Whitelist descendants so they survive later iterations.
                for desnode in self.tags(el, "table", "ul", "div"):
                    allowed[desnode] = True

            if to_remove:
                self.debug(
                    "Cleaned %6.3f %s with weight %s cause it has %s." %
                    (content_score, describe(el), weight, reason))
                el.drop_tree()

    # NOTE(review): a trailing no-op loop over every node (its body was a
    # bare `pass` guarding a commented-out attrib reset) was removed here;
    # it only wasted a full tree walk.
    return clean_attributes(tounicode(node))
def sanitize(self, node, candidates):
    """Remove low-value elements from *node* and return its cleaned HTML.

    Drops headers with negative class weight or link density > 0.33, all
    <form>/<iframe>/<textarea> elements, and then conditionally removes
    <table>/<ul>/<div> elements based on content score, class weight,
    link density and the mix of child elements they contain.

    :param node: lxml element tree to clean (mutated in place).
    :param candidates: mapping of element -> dict with a 'content_score'.
    :returns: unicode HTML of the cleaned tree with attributes stripped.
    """
    MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)
    for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        class_weight = self.class_weight(header)
        link_density = self.get_link_density(header)
        if class_weight < 0 or link_density > 0.33:
            header.drop_tree()

    for elem in self.tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    # Elements whitelisted (kept) because an ancestor was explicitly allowed.
    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in self.reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = self.class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
            #print '!',el, '-> %6.3f' % content_score
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            # Combined score is negative: remove outright.
            self.debug("Cleaned %s with score %6.3f and weight %-3s" %
                (describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            # Few commas suggests non-prose content; inspect more closely.
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            # Bias heavily against lists; compared to <p> count below.
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = self.get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                # content_score is re-bound to the parent's score here; it
                # is only used for the removal log message below.
                if parent_node in candidates:
                    parent = candidates[parent_node]
                    content_score = parent['content_score']
                else:
                    content_score = 0
            #if parent_node is not None:
                #pweight = self.class_weight(parent_node) + content_score
                #pname = describe(parent_node)
            #else:
                #pweight = 0
                #pname = "no parent"
            to_remove = False
            reason = ""

            #if el.tag == 'div' and counts["img"] >= 1:
            #    continue
            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < (MIN_LEN) and (counts["img"] == 0
                    or counts["img"] > 2):
                reason = ('too short content length %s without a single'
                    ' image') % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = ('<embed>s with too short content length, or too'
                    ' many <embed>s')
                to_remove = True

            # don't really understand what this is doing. Originally
            # the i/j were =+ which sets the value to 1. I think that
            # was supposed to be += which would increment. But then
            # it's compared to x which is hard set to 1. So you only
            # ever do one loop in each iteration and don't understand
            # it. Will have to investigate when we get to testing more
            # pages.
            i, j = 0, 0
            x = 1
            siblings = []
            for sib in el.itersiblings():
                #self.debug(sib.text_content())
                sib_content_length = text_length(sib)
                if sib_content_length:
                    i += 1
                    siblings.append(sib_content_length)
                    if i == x:
                        break
            for sib in el.itersiblings(preceding=True):
                #self.debug(sib.text_content())
                sib_content_length = text_length(sib)
                if sib_content_length:
                    j += 1
                    siblings.append(sib_content_length)
                    if j == x:
                        break
            #self.debug(str(siblings))
            # An element flanked by substantial sibling text is spared,
            # and its table/ul/div descendants are whitelisted too.
            if siblings and sum(siblings) > 1000:
                to_remove = False
                self.debug("Allowing %s" % describe(el))
                for desnode in self.tags(el, "table", "ul", "div"):
                    allowed[desnode] = True

            if to_remove:
                self.debug(
                    "Cleaned %6.3f %s with weight %s cause it has %s."
                    % (content_score, describe(el), weight, reason))
                #print tounicode(el)
                #self.debug("pname %s pweight %.3f" %(pname, pweight))
                el.drop_tree()

    # NOTE(review): this loop is currently a no-op (body is `pass` under
    # the commented-out attrib reset); kept pending the FIXME below.
    for el in ([node] + [n for n in node.iter()]):
        if not self.options.get('attributes', None):
            #el.attrib = {} #FIXME:Checkout the effects of disabling this
            pass
    return clean_attributes(tounicode(node))