import logging

import models
import util

logger = logging.getLogger(__name__)

MAX_BLOGPOST_LINKS = 10
# datastore-safe caps for key names and indexed string properties. 500 matches
# the log messages below; the same cap is assumed for both.
_MAX_KEYPART_BYTES = 500
_MAX_STRING_LENGTH = 500


def handle_feed(feed, source):
  """Handles a Superfeedr JSON feed.

  Creates :class:`models.BlogPost` entities and adds propagate-blogpost tasks
  for new items.

  http://documentation.superfeedr.com/schema.html#json
  http://documentation.superfeedr.com/subscribers.html#pubsubhubbubnotifications

  Args:
    feed: dict, decoded Superfeedr JSON feed
    source: Blogger, Tumblr, or WordPress
  """
  logger.info(f'Source: {source.label()} {source.key_id()}')
  logger.info(f'Raw feed: {feed}')

  if not feed:
    return

  if source.status != 'enabled':
    logger.info(f'Dropping because source is {source.status}')
    return
  elif 'webmention' not in source.features:
    logger.info("Dropping because source doesn't have webmention feature")
    return

  for item in feed.get('items', []):
    url = item.get('permalinkUrl') or item.get('id')
    if not url:
      logger.error('Dropping feed item without permalinkUrl or id!')
      continue

    # extract links from content, discarding self links.
    #
    # i don't use get_webmention_target[s]() here because they follow redirects
    # and fetch link contents, and this handler should be small and fast and
    # try to return a response to superfeedr successfully.
    #
    # TODO: extract_links currently has a bug that makes it drop trailing
    # slashes. ugh. fix that.
    content = item.get('content') or item.get('summary', '')
    links = [util.clean_url(util.unwrap_t_umblr_com(link))
             for link in util.extract_links(content)
             if util.domain_from_link(link) not in source.domains]

    # dedupe, drop links too long to store, and cap the total count
    unique = []
    for link in util.dedupe_urls(links):
      if len(link) <= _MAX_STRING_LENGTH:
        unique.append(link)
      else:
        logger.info(f'Giving up on link over {_MAX_STRING_LENGTH} chars! {link}')
      if len(unique) >= MAX_BLOGPOST_LINKS:
        logger.info(f'Stopping at {MAX_BLOGPOST_LINKS} links! Skipping the rest.')
        break

    logger.info(f'Found links: {unique}')

    # if the post URL is too long for a datastore key name, truncate it and
    # store the links as failed instead of queuing webmentions for them
    if len(url) > _MAX_KEYPART_BYTES:
      logger.warning(f'Blog post URL is too long (over {_MAX_KEYPART_BYTES} chars)! Giving up.')
      bp = models.BlogPost(id=url[:_MAX_KEYPART_BYTES], source=source.key,
                           feed_item=item, failed=unique)
    else:
      bp = models.BlogPost(id=url, source=source.key, feed_item=item,
                           unsent=unique)
    bp.get_or_save()
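
# A minimal sketch of exercising handle_feed() without a datastore:
# models.BlogPost is mocked out, and FakeSource plus every literal below is an
# illustrative assumption, not one of Bridgy's real fixtures. it assumes the
# webutil util module imported above is available.
from unittest import mock


class FakeSource:
  """hypothetical stand-in for a Blogger, Tumblr, or WordPress source."""
  status = 'enabled'
  features = ['webmention']
  domains = ['example.com']  # links back to these domains are discarded
  key = 'fake-source-key'

  def label(self):
    return 'FakeSource'

  def key_id(self):
    return 'example.com'


def _demo():
  feed = {'items': [{
    'permalinkUrl': 'http://example.com/post/123',
    'content': 'reply to <a href="http://their.site/original">their post</a>',
  }]}
  with mock.patch.object(models, 'BlogPost') as bp:
    handle_feed(feed, FakeSource())
    # the self link filter only drops example.com links, so their.site survives
    print(bp.call_args.kwargs['unsent'])  # ['http://their.site/original']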
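
# A minimal sketch of the Superfeedr callback that would drive handle_feed(),
# assuming a Flask app. the route shape and lookup_source() are hypothetical,
# not Bridgy's real routes, and the Superfeedr auth handshake is omitted.
from flask import Flask, request

app = Flask(__name__)


def lookup_source(site, key_id):
  """hypothetical lookup; a real app would fetch the source entity here."""
  return None


@app.post('/superfeedr/notify/<site>/<key_id>')
def notify(site, key_id):
  source = lookup_source(site, key_id)
  if not source:
    return '', 404
  # superfeedr POSTs the JSON schema linked in the docstring above
  handle_feed(request.get_json(force=True), source)
  return '', 200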