Beispiel #1
0
def Process(root_tag, url, hit_counter=None):
  """Score and strip a tag and all of its not-stripped descendants.

  Args:
    root_tag: The tag to process.  A combined 'classid' attribute is written
        onto it before anything else happens.
    url: Passed through unchanged to _Score() for every visited tag.
    hit_counter: Dict filled in by _Score(), mapping
        (points, attr, pattern) keys to the list of tags recorded under that
        key.  Leave as None on the outermost call; recursive calls share the
        dict created there.
  """
  # Fold "class" and "id" into one attribute that everything else can test.
  class_words = _SeparateWords(root_tag.get('class', '')).strip()
  id_words = _SeparateWords(root_tag.get('id', '')).strip()
  root_tag['classid'] = '!!!'.join([class_words, id_words]).strip('!')

  # Only the outermost call (the one that creates the counter) runs the
  # false-positive pass at the bottom.
  is_outermost = hit_counter is None
  if is_outermost:
    hit_counter = {}

  _Score(root_tag, url, hit_counter)
  if _Strip(root_tag):
    # Stripped nodes are not descended into.
    return
  for child in root_tag.findAll(True, recursive=False):
    Process(child, url, hit_counter)

  if not is_outermost:
    return

  # Look for too-frequently-matched false-positive patterns.
  for key, matched in hit_counter.iteritems():
    if len(matched) < FALSE_POSITIVE_THRESHOLD:
      continue
    points, attr, pattern_text = key
    if points < 0:
      # Only reverse false _positives_.  Negatives probably aren't false.
      continue
    logging.info(
        'Undoing %d points for %d tags, with %s matching %s',
        points, len(matched), attr, pattern_text)
    for hit in matched:
      util.ApplyScore(hit, -points, name=attr)
Beispiel #2
0
def _ScoreImages(root_tag):
  """Adjust the score of every <img> found under root_tag.

  Each image gets a base bonus, a further bonus for an alt text longer than
  50 characters, and size-dependent adjustments when _TagSize() reports a
  size for it.
  """
  for img in root_tag.findAll('img'):
    util.ApplyScore(img, 1.5, name='any_img')

    has_long_alt = img.has_key('alt') and len(img['alt']) > 50
    if has_long_alt:
      util.ApplyScore(img, 2, name='img_alt')

    img_size = _TagSize(img)
    if img_size is not None:
      # The two upper thresholds are cumulative: a big image earns both.
      if img_size <= 625:
        util.ApplyScore(img, -1.5, name='tiny_img')
      if img_size >= 50000:
        util.ApplyScore(img, 3, name='has_img')
      if img_size >= 250000:
        util.ApplyScore(img, 4, name='big_img')
Beispiel #3
0
def _ScoreBlocks(root_tag):
  """Score each leaf block under root_tag according to its text length.

  Blocks with no text are penalized only when they hold an anchor that is not
  marked 'score_out_link' and no image.  Blocks with text get a small penalty
  when short and cumulative bonuses when long.
  """
  for block in _FindLeafBlocks(root_tag):
    # Text length as reported by _TextLenNonAnchors().
    length = _TextLenNonAnchors(block)

    if length == 0:
      link = block.find('a')
      picture = block.find('img')
      only_anchor = (
          link and not link.has_key('score_out_link') and not picture)
      if only_anchor:
        util.ApplyScore(block, -2, name='only_anchor')
    else:
      if length < 20:
        util.ApplyScore(block, -0.75, name='short_text')
      if length > 50:
        util.ApplyScore(block, 3, name='some_text')
      if length > 250:
        util.ApplyScore(block, 4, name='more_text')
Beispiel #4
0
def _Score(tag, url, hit_counter):
    if tag.name == 'body': return
    for points, attr, pattern in ATTR_POINTS:
        if not tag.has_key(attr): continue
        if pattern.search(tag[attr]):
            util.ApplyScore(tag, points, name=attr)

            key = (points, attr, pattern.pattern)
            hit_counter.setdefault(key, [])
            hit_counter[key].append(tag)

    if tag.name == 'a' and tag.has_key('href'):
        that_url = urlparse.urljoin(url, tag['href'])
        if url in that_url or url in urllib.unquote(tag['href']):
            # Special case: score down AND strip links to this page.  (Including
            # "social media" links.)
            util.ApplyScore(tag, -1.5, name='self_link')
            tag.extract()
        # TODO: host name -> domain name
        elif urlparse.urlparse(url)[1] != urlparse.urlparse(that_url)[1]:
            # Score up links to _other_ domains.
            util.ApplyScore(tag, 1.0, name='out_link')
Beispiel #5
0
def _Score(tag, url, hit_counter):
  if tag.name == 'body': return

  # Point patterns.
  for points, attr, pattern in ATTR_POINTS:
    if not tag.has_key(attr): continue
    if pattern.search(tag[attr]):
      parent_match = tag.parent and tag.parent.has_key(attr) and (
          pattern.search(tag.parent[attr]))
      if not parent_match:
        util.ApplyScore(tag, points, name=attr)

      key = (points, attr, pattern.pattern)
      hit_counter.setdefault(key, [])
      hit_counter[key].append(tag)

  # Links.
  if tag.name == 'a' and tag.has_key('href') and not tag.has_key('score_href'):
    that_url = urlparse.urljoin(url, tag['href'])
    if url == that_url or url == urllib.unquote(tag['href']):
      # Special case: score down AND strip links to this page.  (Including
      # "social media" links.)
      util.ApplyScore(tag, -1.5, name='self_link')
      util.Strip(tag)
    # TODO: host name -> domain name
    elif urlparse.urlparse(url)[1] != urlparse.urlparse(that_url)[1]:
      # Score up links to _other_ domains.
      util.ApplyScore(tag, 1.0, name='out_link')

  # Blocks.
  if _IsLeafBlock(tag):
    # Length of stripped text, with all whitespace collapsed.
    text_len = _TextLen(tag)

    if text_len == 0:
      anchor = tag.find('a')
      img = tag.find('img')
      if anchor and not anchor.has_key('score_out_link') and not img:
        util.ApplyScore(tag, -2, name='only_anchor')
    else:
      if text_len < 20:
        util.ApplyScore(tag, -0.75, name='short_text')
      if text_len > 50:
        util.ApplyScore(tag, 3, name='some_text')
      if text_len > 250:
        util.ApplyScore(tag, 4, name='more_text')

  # Images.
  if tag.name == 'img':
    util.ApplyScore(tag, 1.5, name='any_img')
    if tag.has_key('alt') and len(tag['alt']) > 50:
      util.ApplyScore(tag, 2, name='img_alt')

    size = _TagSize(tag)
    if size is not None:
      if size <= 625:
        util.ApplyScore(tag, -1.5, name='tiny_img')
      if size >= 50000:
        util.ApplyScore(tag, 3, name='has_img')
      if size >= 250000:
        util.ApplyScore(tag, 4, name='big_img')

  # Embeds.
  if tag.name in util.EMBED_NAMES or (
      tag.name == 'iframe' and tag.has_key('src') and (
          'youtube.com' in tag['src']
          or 'youtube-nocookie.com' in tag['src']
          or 'vimeo.com' in tag['src']
          )):
    size = _TagSize(tag)
    if size > 10000:
      util.ApplyScore(tag, 15, name='has_embed')
Beispiel #6
0
def _SiteSpecific(url, root_tag):
  if 'www.cracked.com' in url:
    tag = root_tag.find(attrs={'class': 'Column2'})
    if tag: tag.extract()
    tag = root_tag.find(attrs={'class': 'userStyled'})
    if tag: util.ApplyScore(tag, 20, name='special')
Beispiel #7
0
def _ScoreEmbeds(root_tag):
  """Score up objects/embeds found under root_tag.

  Every tag returned by util.FindEmbeds() whose reported size exceeds 10000
  receives the 'has_embed' bonus.  Embeds with no known size are left alone.
  """
  for tag in util.FindEmbeds(root_tag):
    size = _TagSize(tag)
    # Other callers of _TagSize() in this file guard against a None result.
    # Do the same here rather than relying on how None orders against ints.
    if size is not None and size > 10000:
      util.ApplyScore(tag, 15, name='has_embed')
Beispiel #8
0
    return soup, u''

  title = soup.find('title')
  title = title and title.text.lower() or ''

  _TransformBrsToParagraphs(soup)
  patterns.Process(soup, url)
  _ScoreBlocks(soup)
  _ScoreImages(soup)
  _ScoreEmbeds(soup)
  _SiteSpecific(url, soup)

  # If a header repeats the title, strip it and all preceding nodes.
  title_header = _FindTitleHeader(soup, title)
  if title_header:
    util.ApplyScore(title_header, 11, name='title_header')
    if 'flickr.com' not in url:
      _StripBefore(title_header)

  # Get the highest scored nodes.
  scored_nodes = sorted(soup.findAll(attrs={'score': True}),
                        key=lambda x: x['score'])[-15:]
  if not scored_nodes:
    return soup, u'<p>Scoring error.</p>'
  best_node = scored_nodes[-1]

  _TransformDivsToPs(best_node)
  _StripLowScored(best_node)

  # For debugging ...
  if util.IS_DEV_APPSERVER: