Example #1
0
def handle_feed(feed, source):
    """Processes one Superfeedr JSON notification for a blog source.

    Every feed item that carries a ``permalinkUrl`` (or, failing that, an
    ``id``) is turned into a :class:`models.BlogPost` and handed to
    ``get_or_save()``, which is where new items get stored and their
    propagate-blogpost tasks added.

    http://documentation.superfeedr.com/schema.html#json
    http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

    Args:
      feed: unicode string, Superfeedr JSON feed
      source: Blogger, Tumblr, or WordPress
    """
    logging.info('Source: %s %s', source.label(), source.key.string_id())
    logging.info('Raw feed: %s', feed)

    # Guard clauses: only enabled sources with the webmention feature proceed.
    if source.status != 'enabled':
        logging.info('Dropping because source is %s', source.status)
        return
    if 'webmention' not in source.features:
        logging.info("Dropping because source doesn't have webmention feature")
        return

    entries = json.loads(feed).get('items', [])
    for entry in entries:
        permalink = entry.get('permalinkUrl') or entry.get('id')
        if not permalink:
            logging.error('Dropping feed item without permalinkUrl or id!')
            continue

        # Collect the links found in the item body, skipping links back to the
        # source's own domains.
        #
        # get_webmention_target[s]() is deliberately avoided here: it follows
        # redirects and fetches link contents, while this handler needs to
        # stay small and fast so that superfeedr gets its response.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        body = entry.get('content') or entry.get('summary', '')
        targets = []
        for found in util.extract_links(body):
            if util.domain_from_link(found) in source.domains:
                continue
            targets.append(util.clean_url(util.unwrap_t_umblr_com(found)))

        logging.info('Found links: %s', targets)

        # An over-long URL can't be used as the entity id as is: truncate it
        # and record the links as failed instead of unsent.
        if len(permalink) > _MAX_KEYPART_BYTES:
            logging.warning(
                'Blog post URL is too long (over 500 chars)! Giving up.')
            entity_id, link_field = permalink[:_MAX_KEYPART_BYTES], 'failed'
        else:
            entity_id, link_field = permalink, 'unsent'

        post = models.BlogPost(id=entity_id,
                               source=source.key,
                               feed_item=entry,
                               **{link_field: targets})
        post.get_or_save()
Example #2
0
def handle_feed(feed, source):
  """Turns a Superfeedr JSON notification into BlogPost entities.

  For each feed item with a ``permalinkUrl`` or ``id``, builds a
  :class:`models.BlogPost` holding the item's outbound links and passes it to
  ``get_or_save()``, which is where new items get stored and their
  propagate-blogpost tasks added.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: unicode string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  # Guard clauses: only enabled sources with the webmention feature proceed.
  if source.status != 'enabled':
    logging.info('Dropping because source is %s', source.status)
    return
  if 'webmention' not in source.features:
    logging.info("Dropping because source doesn't have webmention feature")
    return

  for entry in json.loads(feed).get('items', []):
    post_url = entry.get('permalinkUrl') or entry.get('id')
    if not post_url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # Gather links from the item body, leaving out links back to the source's
    # own domains.
    #
    # get_webmention_target[s]() is deliberately avoided here: it follows
    # redirects and fetches link contents, while this handler needs to stay
    # small and fast so that superfeedr gets its response.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    html = entry.get('content') or entry.get('summary', '')
    candidates = []
    for found in util.extract_links(html):
      if util.domain_from_link(found) not in source.domains:
        candidates.append(util.clean_url(util.unwrap_t_umblr_com(found)))

    # Dedupe, then leave out any link longer than _MAX_STRING_LENGTH.
    targets = []
    for target in util.dedupe_urls(candidates):
      if len(target) > _MAX_STRING_LENGTH:
        logging.info('Giving up on link over %s chars! %s', _MAX_STRING_LENGTH, target)
        continue
      targets.append(target)

    logging.info('Found links: %s', targets)
    if len(post_url) <= _MAX_KEYPART_BYTES:
      post = models.BlogPost(id=post_url, source=source.key, feed_item=entry,
                             unsent=targets)
    else:
      # The URL can't be used as the entity id as is: truncate it and record
      # the links as failed instead of unsent.
      logging.warning('Blog post URL is too long (over 500 chars)! Giving up.')
      post = models.BlogPost(id=post_url[:_MAX_KEYPART_BYTES], source=source.key,
                             feed_item=entry, failed=targets)

    post.get_or_save()
Example #3
0
def handle_feed(feed, source):
  """Turns a Superfeedr JSON notification into BlogPost entities.

  For each feed item with a ``permalinkUrl`` or ``id``, builds a BlogPost
  keyed on that URL with the item's outbound links as ``unsent`` and passes it
  to ``get_or_save()``, which is where new items get stored and their
  propagate-blogpost tasks added.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: string, Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logging.info('Source: %s %s', source.label(), source.key.string_id())
  logging.info('Raw feed: %s', feed)

  # Guard clauses: only enabled sources with the webmention feature proceed.
  if source.status != 'enabled':
    logging.warning('Dropping because source is %s', source.status)
    return
  if 'webmention' not in source.features:
    logging.warning("Dropping because source doesn't have webmention feature")
    return

  entries = json.loads(feed).get('items', [])
  for entry in entries:
    post_url = entry.get('permalinkUrl') or entry.get('id')
    if not post_url:
      logging.error('Dropping feed item without permalinkUrl or id!')
      continue

    # Gather links from the item body, leaving out links back to the source's
    # own domains.
    #
    # get_webmention_target[s]() is deliberately avoided here: it follows
    # redirects and fetches link contents, while this handler needs to stay
    # small and fast so that superfeedr gets its response.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    text = entry.get('content') or entry.get('summary', '')
    outbound = []
    for found in util.extract_links(text):
      if util.domain_from_link(found) in source.domains:
        continue
      outbound.append(util.unwrap_t_umblr_com(found))

    logging.info('Found links: %s', outbound)
    post = models.BlogPost(id=post_url, source=source.key, feed_item=entry,
                           unsent=outbound)
    post.get_or_save()
Example #4
0
def handle_feed(feed, source):
    """Handles a Superfeedr JSON feed.

    Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
    for new items.

    http://documentation.superfeedr.com/schema.html#json
    http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

    Args:
      feed: dict, decoded Superfeedr JSON feed (its ``items`` list is read
        here). Falsy values (``None``, empty dict) are logged and ignored.
      source: Blogger, Tumblr, or WordPress
    """
    logger.info(f'Source: {source.label()} {source.key_id()}')
    logger.info(f'Raw feed: {feed}')

    if not feed:
        return

    if source.status != 'enabled':
        logger.info(f'Dropping because source is {source.status}')
        return
    elif 'webmention' not in source.features:
        logger.info("Dropping because source doesn't have webmention feature")
        return

    # `or []` also covers an explicit JSON null for "items".
    for item in feed.get('items') or []:
        url = item.get('permalinkUrl') or item.get('id')
        if not url:
            logger.error('Dropping feed item without permalinkUrl or id!')
            continue

        # extract links from content, discarding self links.
        #
        # i don't use get_webmention_target[s]() here because they follows redirects
        # and fetch link contents, and this handler should be small and fast and try
        # to return a response to superfeedr successfully.
        #
        # TODO: extract_links currently has a bug that makes it drop trailing
        # slashes. ugh. fix that.
        #
        # The trailing `or ''` keeps content a string even when both fields
        # are present but null.
        content = item.get('content') or item.get('summary') or ''
        # The loop variable is named `link`, not `url`, so it can't be
        # confused with the blog post's own url above.
        links = [
            util.clean_url(util.unwrap_t_umblr_com(link))
            for link in util.extract_links(content)
            if util.domain_from_link(link) not in source.domains
        ]

        # Dedupe, leave out over-long links, and cap the number kept at
        # MAX_BLOGPOST_LINKS.
        unique = []
        for link in util.dedupe_urls(links):
            if len(link) <= _MAX_STRING_LENGTH:
                unique.append(link)
            else:
                logger.info(
                    f'Giving up on link over {_MAX_STRING_LENGTH} chars! {link}'
                )
            if len(unique) >= MAX_BLOGPOST_LINKS:
                logger.info(
                    f'Stopping at {MAX_BLOGPOST_LINKS} links! Skipping the rest.'
                )
                break

        logger.info(f'Found links: {unique}')

        # The limit is expressed in bytes, so measure the UTF-8 encoding
        # rather than the number of characters: a URL with non-ASCII
        # characters can be under the limit in characters yet over it in
        # bytes. For pure-ASCII URLs this is identical to the character count.
        encoded_url = url.encode('utf-8')
        if len(encoded_url) > _MAX_KEYPART_BYTES:
            logger.warning(
                f'Blog post URL is too long (over {_MAX_KEYPART_BYTES} bytes)! '
                'Giving up.')
            # Truncate on the byte boundary; 'ignore' drops a multi-byte
            # character that the cut would otherwise split in half.
            truncated = encoded_url[:_MAX_KEYPART_BYTES].decode('utf-8', 'ignore')
            bp = models.BlogPost(id=truncated,
                                 source=source.key,
                                 feed_item=item,
                                 failed=unique)
        else:
            bp = models.BlogPost(id=url,
                                 source=source.key,
                                 feed_item=item,
                                 unsent=unique)

        bp.get_or_save()