def Process(root_tag, url, hit_counter=None): """Process an entire soup, without recursing into stripped nodes.""" # Make a single "class and id" attribute that everything else can test. root_tag['classid'] = '!!!'.join([ _SeparateWords(root_tag.get('class', '')).strip(), _SeparateWords(root_tag.get('id', '')).strip() ]).strip('!') top_run = False if hit_counter is None: hit_counter = {} top_run = True _Score(root_tag, url, hit_counter) if _Strip(root_tag): return for tag in root_tag.findAll(True, recursive=False): Process(tag, url, hit_counter) # Look for too-frequently-matched false-positive patterns. if top_run: for key, tags in hit_counter.iteritems(): if len(tags) >= FALSE_POSITIVE_THRESHOLD: points, attr, unused_pattern = key if points < 0: # Only reverse false _positives_. Negatives probably aren't false. continue logging.info( 'Undoing %d points for %d tags, with %s matching %s', points, len(tags), attr, unused_pattern) for tag in tags: util.ApplyScore(tag, -1 * points, name=attr)
def _ScoreImages(root_tag):
  """Score up images."""
  for img in root_tag.findAll('img'):
    # Every image earns a small baseline score.
    util.ApplyScore(img, 1.5, name='any_img')
    # Long alt text suggests deliberately authored content.
    if img.has_key('alt') and len(img['alt']) > 50:
      util.ApplyScore(img, 2, name='img_alt')
    size = _TagSize(img)
    if size is None:
      continue
    if size <= 625:
      util.ApplyScore(img, -1.5, name='tiny_img')
    # Size bonuses are cumulative: a very large image collects both.
    for floor, points, name in ((50000, 3, 'has_img'), (250000, 4, 'big_img')):
      if size >= floor:
        util.ApplyScore(img, points, name=name)
def _ScoreBlocks(root_tag):
  """Score up all leaf block nodes, based on the length of their text."""
  for block in _FindLeafBlocks(root_tag):
    # Length of stripped text, with all whitespace collapsed.
    length = _TextLenNonAnchors(block)
    if not length:
      # A textless block holding only a link (and no image, and not an
      # already-recognized outbound link) is penalized.
      link = block.find('a')
      if link and not link.has_key('score_out_link') and not block.find('img'):
        util.ApplyScore(block, -2, name='only_anchor')
      continue
    if length < 20:
      util.ApplyScore(block, -0.75, name='short_text')
    # Length bonuses are cumulative.
    for floor, points, name in ((50, 3, 'some_text'), (250, 4, 'more_text')):
      if length > floor:
        util.ApplyScore(block, points, name=name)
def _Score(tag, url, hit_counter):
  """Score one tag against ATTR_POINTS patterns and link heuristics.

  NOTE(review): a second, expanded `def _Score` appears later in this file;
  Python binds the name to the last definition, so this version is dead
  code — confirm and delete one of the two.
  """
  if tag.name == 'body':
    return
  for points, attr, pattern in ATTR_POINTS:
    if not tag.has_key(attr):  # Python 2 / old BeautifulSoup attribute test.
      continue
    if pattern.search(tag[attr]):
      util.ApplyScore(tag, points, name=attr)
      # Record which tags this (points, attr, pattern) key hit, so the
      # caller can later undo over-frequent (false positive) matches.
      key = (points, attr, pattern.pattern)
      hit_counter.setdefault(key, [])
      hit_counter[key].append(tag)
  if tag.name == 'a' and tag.has_key('href'):
    that_url = urlparse.urljoin(url, tag['href'])
    if url in that_url or url in urllib.unquote(tag['href']):
      # Special case: score down AND strip links to this page. (Including
      # "social media" links.)
      util.ApplyScore(tag, -1.5, name='self_link')
      tag.extract()
    # TODO: host name -> domain name
    elif urlparse.urlparse(url)[1] != urlparse.urlparse(that_url)[1]:
      # Score up links to _other_ domains.
      util.ApplyScore(tag, 1.0, name='out_link')
def _Score(tag, url, hit_counter):
  """Apply every scoring heuristic to a single tag.

  Covers, in order: attribute-pattern points, link scoring/stripping,
  leaf-block text length, images, and video embeds.  hit_counter records
  which tags each (points, attr, pattern-string) key matched so the caller
  can undo over-frequent matches.
  """
  if tag.name == 'body':
    return

  # Point patterns.
  for points, attr, pattern in ATTR_POINTS:
    if not tag.has_key(attr):
      continue
    if not pattern.search(tag[attr]):
      continue
    # Skip if the parent matched the same pattern on the same attribute,
    # so nested containers are not double counted.
    parent = tag.parent
    if parent and parent.has_key(attr) and pattern.search(parent[attr]):
      continue
    util.ApplyScore(tag, points, name=attr)
    hit_counter.setdefault((points, attr, pattern.pattern), []).append(tag)

  # Links.
  if tag.name == 'a' and tag.has_key('href') and not tag.has_key('score_href'):
    href = tag['href']
    absolute = urlparse.urljoin(url, href)
    if url == absolute or url == urllib.unquote(href):
      # Special case: score down AND strip links to this page. (Including
      # "social media" links.)
      util.ApplyScore(tag, -1.5, name='self_link')
      util.Strip(tag)
    # TODO: host name -> domain name
    elif urlparse.urlparse(url)[1] != urlparse.urlparse(absolute)[1]:
      # Score up links to _other_ domains.
      util.ApplyScore(tag, 1.0, name='out_link')

  # Blocks.
  if _IsLeafBlock(tag):
    # Length of stripped text, with all whitespace collapsed.
    text_len = _TextLen(tag)
    if text_len == 0:
      link = tag.find('a')
      if link and not link.has_key('score_out_link') and not tag.find('img'):
        util.ApplyScore(tag, -2, name='only_anchor')
    else:
      if text_len < 20:
        util.ApplyScore(tag, -0.75, name='short_text')
      if text_len > 50:
        util.ApplyScore(tag, 3, name='some_text')
      if text_len > 250:
        util.ApplyScore(tag, 4, name='more_text')

  # Images.
  if tag.name == 'img':
    util.ApplyScore(tag, 1.5, name='any_img')
    if tag.has_key('alt') and len(tag['alt']) > 50:
      util.ApplyScore(tag, 2, name='img_alt')
    size = _TagSize(tag)
    if size is not None:
      if size <= 625:
        util.ApplyScore(tag, -1.5, name='tiny_img')
      if size >= 50000:
        util.ApplyScore(tag, 3, name='has_img')
      if size >= 250000:
        util.ApplyScore(tag, 4, name='big_img')

  # Embeds.
  is_video_iframe = (
      tag.name == 'iframe' and tag.has_key('src') and (
          'youtube.com' in tag['src'] or
          'youtube-nocookie.com' in tag['src'] or
          'vimeo.com' in tag['src']))
  if tag.name in util.EMBED_NAMES or is_video_iframe:
    size = _TagSize(tag)
    # NOTE(review): relies on Py2's `None > 10000` being False when
    # _TagSize returns None.
    if size > 10000:
      util.ApplyScore(tag, 15, name='has_embed')
def _SiteSpecific(url, root_tag):
  """Hand-tuned adjustments for specific sites."""
  if 'www.cracked.com' not in url:
    return
  # Remove the 'Column2' container outright.
  column = root_tag.find(attrs={'class': 'Column2'})
  if column:
    column.extract()
  # Strongly favor the 'userStyled' container.
  styled = root_tag.find(attrs={'class': 'userStyled'})
  if styled:
    util.ApplyScore(styled, 20, name='special')
def _ScoreEmbeds(root_tag):
  """Score up objects/embeds.

  Any embed whose estimated size exceeds 10000 gets a large bonus.
  """
  for tag in util.FindEmbeds(root_tag):
    size = _TagSize(tag)
    # _TagSize can return None (the image-scoring code guards for it); the
    # old bare `size > 10000` only worked because Python 2 treats
    # `None > int` as False.  Guard explicitly — same behavior, no reliance
    # on Py2 mixed-type comparison.
    if size is not None and size > 10000:
      util.ApplyScore(tag, 15, name='has_embed')
return soup, u'' title = soup.find('title') title = title and title.text.lower() or '' _TransformBrsToParagraphs(soup) patterns.Process(soup, url) _ScoreBlocks(soup) _ScoreImages(soup) _ScoreEmbeds(soup) _SiteSpecific(url, soup) # If a header repeats the title, strip it and all preceding nodes. title_header = _FindTitleHeader(soup, title) if title_header: util.ApplyScore(title_header, 11, name='title_header') if 'flickr.com' not in url: _StripBefore(title_header) # Get the highest scored nodes. scored_nodes = sorted(soup.findAll(attrs={'score': True}), key=lambda x: x['score'])[-15:] if not scored_nodes: return soup, u'<p>Scoring error.</p>' best_node = scored_nodes[-1] _TransformDivsToPs(best_node) _StripLowScored(best_node) # For debugging ... if util.IS_DEV_APPSERVER: