def get_article(doc, options):
    try:
        ruthless = True
        while True:
            for i in tags(doc, 'script', 'style'):
                i.drop_tree()
            for i in tags(doc, 'body'):
                i.set('id', 'readabilityBody')
            if ruthless:
                remove_unlikely_candidates(doc)
            transform_double_breaks_into_paragraphs(doc)
            transform_misused_divs_into_paragraphs(doc)
            candidates = score_paragraphs(doc, options)

            best_candidate = select_best_candidate(candidates)
            if best_candidate:
                confidence = best_candidate['content_score']
                article = get_raw_article(candidates, best_candidate)
            else:
                if ruthless:
                    logging.debug("ruthless removal did not work")
                    ruthless = False
                    logging.debug(
                        "ended up stripping too much - "
                        "going for a safer parse")
                    # try again
                    continue
                else:
                    logging.debug(
                        "Ruthless and lenient parsing did not work. "
                        "Returning raw html")
                    return Summary(0, None)

            unicode_cleaned_article = sanitize(article, candidates, options)
            cleaned_doc = fragment_fromstring(unicode_cleaned_article)
            cleaned_article = tostring(cleaned_doc)

            of_acceptable_length = \
                len(cleaned_article or '') >= options['retry_length']
            if ruthless and not of_acceptable_length:
                ruthless = False
                continue  # try again
            else:
                return Summary(confidence, cleaned_article)
    except StandardError as e:
        #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
        logging.exception('error getting summary: ')
        raise Unparseable(str(e)), None, sys.exc_info()[2]
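# Illustrative usage sketch (comment only, not part of the module). It assumes
# the caller has already fetched the page markup and built an lxml tree, e.g.
# with lxml.html.document_fromstring, and that the options dict provides at
# least the 'min_text_len' and 'retry_length' keys used above. It also assumes
# Summary behaves like a two-field (confidence, html) tuple, as its positional
# construction above suggests.
#
#   doc = document_fromstring(raw_html)
#   confidence, html = get_article(doc, {'min_text_len': 25,
#                                        'retry_length': 250})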
def transform_misused_divs_into_paragraphs(doc):
    for elem in tags(doc, 'div'):
        # transform <div>s that do not contain other block elements into <p>s
        if not REGEXES['divToPElementsRe'].search(
                unicode(''.join(map(tostring, list(elem))))):
            logging.debug("Altering %s to p" % (describe(elem)))
            elem.tag = "p"
def find_next_page_url(parsed_urls, url, elem):
    links = tags(elem, 'a')
    base_url = find_base_url(url)
    # candidates is a mapping from URLs to NextPageCandidate objects that
    # represent information used to determine if a URL points to the next
    # page in the article.
    candidates = {}
    for link in links:
        logging.debug('link: %s' % tostring(link))
        eval_possible_next_page_link(
            parsed_urls, url, base_url, candidates, link)

    top_candidate = None
    for url, candidate in candidates.items():
        score = candidate.score
        logging.debug('next page score of %s: %s' % (url, candidate.score))
        if 50 <= score and (not top_candidate or top_candidate.score < score):
            top_candidate = candidate

    if top_candidate:
        logging.debug('next page link found: %s' % top_candidate.href)
        parsed_urls.add(top_candidate.href)
        return top_candidate.href
    else:
        return None
def transform_double_breaks_into_paragraphs(doc):
    '''
    Modifies doc so that double-breaks (<br><br>) in content delineate
    paragraphs.

    Some pages use double-breaks when they really should be using paragraphs:

        <div>
            Lorem ipsum dolor sit amet, consectetur adipiscing elit.
            Praesent in justo sapien, a consectetur est. Aliquam iaculis,
            augue eu euismod gravida, nisl nisl posuere odio, at euismod
            metus enim quis nibh.
            <br><br>
            Praesent posuere tortor at nunc iaculis eget suscipit tellus
            tempus. Nulla facilisi. Quisque rutrum, ante eu sollicitudin
            congue, dui sapien egestas arcu, in consequat nisl metus eu sem.
            <br><br>
            Nam mi sem, lobortis eget adipiscing vitae, ultricies sit amet
            justo. Nullam rutrum sodales magna vel vestibulum. Curabitur sit
            amet urna purus, ac aliquet sem.
        </div>

    This routine would transform this into:

        <div>
            <p>
                Lorem ipsum dolor sit amet, consectetur adipiscing elit.
                Praesent in justo sapien, a consectetur est. Aliquam iaculis,
                augue eu euismod gravida, nisl nisl posuere odio, at euismod
                metus enim quis nibh.
            </p>
            <p>
                Praesent posuere tortor at nunc iaculis eget suscipit tellus
                tempus. Nulla facilisi. Quisque rutrum, ante eu sollicitudin
                congue, dui sapien egestas arcu, in consequat nisl metus eu
                sem.
            </p>
            <p>
                Nam mi sem, lobortis eget adipiscing vitae, ultricies sit
                amet justo. Nullam rutrum sodales magna vel vestibulum.
                Curabitur sit amet urna purus, ac aliquet sem.
            </p>
        </div>
    '''
    for div in tags(doc, 'div'):
        transform_double_breaks_into_paragraphs_elem(div)
def score_paragraphs(doc, options):
    candidates = {}
    #logging.debug(str([describe(node) for node in tags(doc, "div")]))
    ordered = []
    for elem in tags(doc, "p", "pre", "td"):
        logging.debug('Scoring %s' % describe(elem))
        parent_node = elem.getparent()
        if parent_node is None:
            continue
        grand_parent_node = parent_node.getparent()

        inner_text = clean(elem.text_content() or "")
        inner_text_len = len(inner_text)

        # If this paragraph is shorter than the configured minimum text
        # length, don't even count it.
        if inner_text_len < options['min_text_len']:
            continue

        if parent_node not in candidates:
            candidates[parent_node] = score_node(parent_node)
            ordered.append(parent_node)
        if grand_parent_node is not None and \
                grand_parent_node not in candidates:
            candidates[grand_parent_node] = score_node(grand_parent_node)
            ordered.append(grand_parent_node)

        # Base point for the paragraph, plus one point per comma-separated
        # chunk, plus up to three points for length (one per 100 characters).
        content_score = 1
        content_score += len(inner_text.split(','))
        content_score += min((inner_text_len / 100), 3)
        #if elem not in candidates:
        #    candidates[elem] = score_node(elem)
        #WTF? candidates[elem]['content_score'] += content_score
        candidates[parent_node]['content_score'] += content_score
        if grand_parent_node is not None:
            candidates[grand_parent_node]['content_score'] += \
                content_score / 2.0

    # Scale the final candidates score based on link density. Good content
    # should have a relatively small link density (5% or less) and be mostly
    # unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = get_link_density(elem)
        score = candidate['content_score']
        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
            score, describe(elem), ld, score * (1 - ld)))
        candidate['content_score'] *= (1 - ld)

    return candidates
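# Worked example of the scoring above (illustrative): a 250-character
# paragraph containing four commas contributes 1 + 5 + min(250 / 100, 3) = 8
# points to its parent and 8 / 2.0 = 4.0 points to its grandparent; a parent
# whose links account for 20% of its text then keeps 80% of its accumulated
# score after the link-density pass.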
def sanitize(node, candidates, options):
    for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if class_weight(header) < 0 or get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
            #print '!', el, '-> %6.3f' % content_score
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            logging.debug("Cleaned %s with score %6.3f and weight %-3s" % (
                describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = get_link_density(el)
            parent_node = el.getparent()
            if parent_node is not None:
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0
            #if parent_node is not None:
            #    pweight = class_weight(parent_node) + content_score
            #    pname = describe(parent_node)
            #else:
            #    pweight = 0
            #    pname = "no parent"

            to_remove = False
            reason = ""

            #if el.tag == 'div' and counts["img"] >= 1:
            #    continue
            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] / 3):
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < options['min_text_length'] and \
                    (counts["img"] == 0 or counts["img"] > 2):
                reason = ("too short content length %s without a single "
                          "image" % content_length)
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or \
                    counts["embed"] > 1:
                reason = ("<embed>s with too short content length, or too "
                          "many <embed>s")
                to_remove = True

            # if el.tag == 'div' and counts['img'] >= 1 and to_remove:
            #     imgs = el.findall('.//img')
            #     valid_img = False
            #     logging.debug(tounicode(el))
            #     for img in imgs:
            #         height = img.get('height')
            #         text_length = img.get('text_length')
            #         logging.debug("height %s text_length %s" % (repr(height), repr(text_length)))
            #         if to_int(height) >= 100 or to_int(text_length) >= 100:
            #             valid_img = True
            #             logging.debug("valid image" + tounicode(img))
            #             break
            #     if valid_img:
            #         to_remove = False
            #         logging.debug("Allowing %s" % el.text_content())
            #         for desnode in tags(el, "table", "ul", "div"):
            #             allowed[desnode] = True

            # find x non-empty preceding and succeeding siblings
            i, j = 0, 0
            x = 1
            siblings = []
            for sib in el.itersiblings():
                #logging.debug(sib.text_content())
                sib_content_length = text_length(sib)
                if sib_content_length:
                    i += 1
                    siblings.append(sib_content_length)
                    if i == x:
                        break
            for sib in el.itersiblings(preceding=True):
                #logging.debug(sib.text_content())
                sib_content_length = text_length(sib)
                if sib_content_length:
                    j += 1
                    siblings.append(sib_content_length)
                    if j == x:
                        break
            #logging.debug(str(siblings))
            if siblings and sum(siblings) > 1000:
                to_remove = False
                logging.debug("Allowing %s" % describe(el))
                for desnode in tags(el, "table", "ul", "div"):
                    allowed[desnode] = True

            if to_remove:
                logging.debug("Cleaned %6.3f %s with weight %s cause it has %s." % (
                    content_score, describe(el), weight, reason))
                #print tounicode(el)
                #logging.debug("pname %s pweight %.3f" % (pname, pweight))
                el.drop_tree()

    # for el in ([node] + [n for n in node.iter()]):
    #     if not (self.options['attributes']):
    #         #el.attrib = {} #FIXME:Checkout the effects of disabling this
    #         pass

    return clean_attributes(tounicode(node))