def score_paragraphs(doc, options):
    """Score the parents and grandparents of text-bearing elements in ``doc``.

    Every ``p``, ``pre`` and ``td`` element returned by ``tags`` whose
    cleaned text is at least ``options['min_text_len']`` characters long
    awards points to its parent (full amount) and, when one exists, to its
    grandparent (half amount). The element itself is never registered as a
    candidate. After all elements have been visited, every candidate's
    ``'content_score'`` is multiplied by ``1 - link_density``.

    Args:
        doc: Document tree passed to ``tags`` to enumerate the elements.
        options: Mapping providing the ``'min_text_len'`` threshold.

    Returns:
        dict: Maps each candidate node to the record that ``score_node``
        produced for it, with its ``'content_score'`` entry updated.
    """
    candidates = {}
    # Candidates in the order they were first registered; the scaling pass
    # at the end iterates this list.
    ordered = []

    def _register(node):
        # Create the base score record the first time a node is seen.
        if node not in candidates:
            candidates[node] = score_node(node)
            ordered.append(node)

    for elem in tags(doc, "p", "pre", "td"):
        logging.debug('Scoring %s' % describe(elem))

        parent = elem.getparent()
        if parent is None:
            continue
        grand_parent = parent.getparent()

        text = clean(elem.text_content() or "")
        text_len = len(text)
        # Elements shorter than the configured minimum contribute nothing.
        if text_len < options['min_text_len']:
            continue

        _register(parent)
        if grand_parent is not None:
            _register(grand_parent)

        # One base point, plus the number of comma-separated segments, plus
        # text_len / 100 capped at 3.
        points = 1 + len(text.split(',')) + min(text_len / 100, 3)

        candidates[parent]['content_score'] += points
        if grand_parent is not None:
            candidates[grand_parent]['content_score'] += points / 2.0

    # Scale each candidate by its link density: a node with a small link
    # density keeps almost all of its score, a link-heavy one loses most.
    for node in ordered:
        record = candidates[node]
        density = get_link_density(node)
        raw_score = record['content_score']
        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (
            raw_score, describe(node), density, raw_score * (1 - density)))
        record['content_score'] *= (1 - density)

    return candidates
def score_paragraphs(doc, options):
    """Build a table of scored candidate containers for ``doc``.

    Walks the ``p``, ``pre`` and ``td`` elements yielded by ``tags``. An
    element whose cleaned text has fewer than ``options['min_text_len']``
    characters, or which has no parent, is skipped. Otherwise its parent
    receives the element's points in full and its grandparent (if any)
    receives half of them. The element itself is not added to the table.
    Finally each candidate's ``'content_score'`` is scaled by
    ``1 - get_link_density(node)``.

    Args:
        doc: Document tree to scan; forwarded to ``tags``.
        options: Mapping that supplies ``'min_text_len'``.

    Returns:
        dict: Candidate node -> record returned by ``score_node`` for that
        node, carrying the final ``'content_score'``.
    """
    candidates = {}
    ordered = []  # candidate nodes, in first-registration order

    for paragraph in tags(doc, "p", "pre", "td"):
        logging.debug('Scoring %s' % describe(paragraph))

        container = paragraph.getparent()
        if container is None:
            continue
        outer = container.getparent()

        body = clean(paragraph.text_content() or "")
        body_len = len(body)
        if body_len < options['min_text_len']:
            # Too little text to be worth counting.
            continue

        # Make sure both ancestors have a base record before adding points.
        ancestors = (container,) if outer is None else (container, outer)
        for ancestor in ancestors:
            if ancestor not in candidates:
                candidates[ancestor] = score_node(ancestor)
                ordered.append(ancestor)

        # Base point + comma-separated segment count + length bonus, where
        # the bonus is body_len / 100 limited to at most 3.
        gain = 1
        gain += len(body.split(','))
        gain += min(body_len / 100, 3)

        candidates[container]['content_score'] += gain
        if outer is not None:
            # The grandparent only gets half credit.
            candidates[outer]['content_score'] += gain / 2.0

    # Penalise candidates in proportion to their link density.
    for node in ordered:
        entry = candidates[node]
        density = get_link_density(node)
        before = entry['content_score']
        after = before * (1 - density)
        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (before, describe(node), density, after))
        entry['content_score'] = after

    return candidates
def text_length(i):
    """Return the length of the cleaned text content of element ``i``.

    A falsy ``text_content()`` result is treated as the empty string
    before it is passed to ``clean``.
    """
    raw_text = i.text_content() or ""
    cleaned = clean(raw_text)
    return len(cleaned)
def eval_link_text(link):
    """Return the cleaned text of ``link`` and a flag for whether it passes.

    The flag is ``False`` when the text matches ``REGEXES['extraneous']``
    or is longer than 25 characters, and ``True`` otherwise.

    Returns:
        tuple: ``(link_text, flag)``.
    """
    text = clean(link.text_content() or '')
    matches_extraneous = bool(REGEXES['extraneous'].search(text))
    # Passes only if it does not match the pattern and is short enough.
    passes = not matches_extraneous and len(text) <= 25
    return text, passes