def test_insert_replaces_blanks(self):
    """Check that insert() swaps blank placeholder rows for the real pair.

    Before the insert there must be one row with only the syndication URL
    and one row with only the original URL. After inserting the two URLs
    together, exactly one complete row must exist and both blanks must be
    gone.
    """
    silo_url = 'http://silo/no-original'
    original_url = 'http://original/newly-discovered'

    # ``== None`` is intentional in both helpers: the comparison is
    # overloaded to build a query filter, which ``is None`` cannot do.
    def blank_for_syndication():
        return SyndicatedPost.query(
            SyndicatedPost.syndication == silo_url,
            SyndicatedPost.original == None,  # noqa: E711
            ancestor=self.source.key).get()

    def blank_for_original():
        return SyndicatedPost.query(
            SyndicatedPost.original == original_url,
            SyndicatedPost.syndication == None,  # noqa: E711
            ancestor=self.source.key).get()

    # add a blank for the original too
    SyndicatedPost.insert_original_blank(self.source, original_url)
    self.assertTrue(blank_for_syndication())
    self.assertTrue(blank_for_original())

    inserted = SyndicatedPost.insert(self.source, silo_url, original_url)
    self.assertIsNotNone(inserted)
    self.assertEqual(original_url, inserted.original)

    # make sure it's in NDB
    stored = SyndicatedPost.query(
        SyndicatedPost.syndication == silo_url,
        ancestor=self.source.key).fetch()
    self.assertEqual(1, len(stored))
    self.assertEqual(original_url, stored[0].original)
    self.assertEqual(silo_url, stored[0].syndication)

    # and the blanks have been removed
    self.assertFalse(blank_for_syndication())
    self.assertFalse(blank_for_original())
def test_insert_replaces_blanks(self):
    """Make sure we replace original=None with original=something when it is
    discovered.

    The test first checks that a syndication-only blank and an original-only
    blank both exist, inserts the complete (syndication, original) pair, and
    then checks that a single complete row is stored and that neither blank
    remains.
    """
    # add a blank for the original too
    SyndicatedPost.insert_original_blank(
        self.source, 'http://original/newly-discovered')

    # ``== None`` (not ``is None``) is required below: the comparison is
    # overloaded to produce a query filter.
    self.assertTrue(
        SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/no-original',
            SyndicatedPost.original == None,  # noqa: E711
            ancestor=self.source.key).get())

    self.assertTrue(
        SyndicatedPost.query(
            SyndicatedPost.original == 'http://original/newly-discovered',
            SyndicatedPost.syndication == None,  # noqa: E711
            ancestor=self.source.key).get())

    r = SyndicatedPost.insert(
        self.source, 'http://silo/no-original',
        'http://original/newly-discovered')
    self.assertIsNotNone(r)
    # assertEqual rather than the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual('http://original/newly-discovered', r.original)

    # make sure it's in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()
    self.assertEqual(1, len(rs))
    self.assertEqual('http://original/newly-discovered', rs[0].original)
    self.assertEqual('http://silo/no-original', rs[0].syndication)

    # and the blanks have been removed
    self.assertFalse(
        SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/no-original',
            SyndicatedPost.original == None,  # noqa: E711
            ancestor=self.source.key).get())

    self.assertFalse(
        SyndicatedPost.query(
            SyndicatedPost.original == 'http://original/newly-discovered',
            SyndicatedPost.syndication == None,  # noqa: E711
            ancestor=self.source.key).get())
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
    """Fetch and process an h-entry, saving a new SyndicatedPost if successful.

    Syndication links are looked for in the h-feed copy of the entry first;
    the permalink page is only fetched and parsed when the feed copy yields
    no results.

    Args:
      source: the source being processed; handed on to
        _process_syndication_urls and SyndicatedPost.insert_original_blank
      permalink: url of the unprocessed post
      feed_entry: the h-feed version of the h-entry dict, often contains
        a partial version of the h-entry at the permalink
      refetch: boolean, whether to refetch and process entries we've seen
        before
      preexisting: a list of previously discovered models.SyndicatedPosts
        for this permalink. Mutated in place: entries whose relationship
        has disappeared are deleted and removed from this list.
      store_blanks: boolean, whether we should store blank SyndicatedPosts
        when we don't find a relationship

    Returns:
      a dict from syndicated url to a list of new models.SyndicatedPosts
    """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logging.debug(
                'previously found relationship(s) for original %s: %s',
                permalink, synds)

    # first try with the h-entry from the h-feed. if we find the syndication
    # url we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    if usynd:
        logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)

    results = _process_syndication_urls(
        source, permalink,
        set(url for url in usynd if isinstance(url, basestring)),
        preexisting)
    success = True

    # fetch the full permalink page, which often has more detailed information
    if not results:
        parsed = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            if type_ok:
                resp = util.requests_get(permalink)
                resp.raise_for_status()
                parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logging.warning('Could not fetch permalink %s', permalink,
                            exc_info=True)
            # a failed fetch tells us nothing about which links still exist,
            # so it must not trigger the deletion pass below
            success = False

        if parsed:
            syndication_urls = set()
            relsynd = parsed.get('rels').get('syndication', [])
            if relsynd:
                logging.debug('rel-syndication links: %s', relsynd)
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, basestring))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in parsed['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logging.debug('u-syndication links: %s', usynd)
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, basestring))
            results = _process_syndication_urls(
                source, permalink, syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        # This has to be a real list. A bare itertools.chain iterator is
        # consumed by the first ``in`` test, so every later preexisting
        # entry would look like it had disappeared and be deleted.
        result_syndposts = list(
            itertools.chain.from_iterable(results.values()))
        # iterate over a copy because entries are removed inside the loop
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logging.info('deleting relationship that disappeared: %s',
                             syndpost)
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logging.debug('saving empty relationship so that %s will not be '
                          'searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.iteritems():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logging.debug('discovered relationships %s', new_results)
    return new_results
def process_entry(source, permalink, feed_entry, refetch, preexisting,
                  store_blanks=True):
    r"""Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

    Syndication links are looked for in the h-feed copy of the entry first.
    The permalink page is only fetched when that yields nothing and either
    the source has no ``last_feed_syndication_url`` or ``feed_entry`` is
    empty.

    Args:
      source: the source being processed; its ``updates`` dict gets
        ``last_feed_syndication_url`` set when the feed entry yields results
      permalink: url of the unprocessed post
      feed_entry: the h-feed version of the h-entry dict, often contains
        a partial version of the h-entry at the permalink
      refetch: boolean, whether to refetch and process entries we've seen
        before
      preexisting: list of previously discovered
        :class:`models.SyndicatedPost`\ s for this permalink. Mutated in
        place: entries whose relationship has disappeared are deleted and
        removed from this list.
      store_blanks: boolean, whether we should store blank
        :class:`models.SyndicatedPost`\ s when we don't find a relationship

    Returns:
      a dict from syndicated url to a list of new
      :class:`models.SyndicatedPost`\ s
    """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logger.debug(
                f'previously found relationship(s) for original {permalink}: {synds}'
            )

    # first try with the h-entry from the h-feed. if we find the syndication
    # url we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    usynd_urls = {url for url in usynd if isinstance(url, str)}
    if usynd_urls:
        logger.debug(
            f'u-syndication links on the h-feed h-entry: {usynd_urls}')

    results = _process_syndication_urls(source, permalink, usynd_urls,
                                        preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url or not feed_entry:
        # fetch the full permalink page if we think it might have more details
        mf2 = None
        try:
            if type_ok:
                logger.debug(f'fetching post permalink {permalink}')
                mf2 = util.fetch_mf2(permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logger.info(f'Could not fetch permalink {permalink}',
                        exc_info=True)
            # a failed fetch tells us nothing about which links still exist,
            # so it must not trigger the deletion pass below
            success = False

        if mf2:
            syndication_urls = set()
            relsynd = mf2['rels'].get('syndication', [])
            if relsynd:
                logger.debug(f'rel-syndication links: {relsynd}')
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, str))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in mf2['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logger.debug(f'u-syndication links: {usynd}')
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, str))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = list(itertools.chain(*results.values()))
        # Iterate over a copy: removing from the list being iterated would
        # skip the element that follows each removed one.
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logger.info(
                    f'deleting relationship that disappeared: {syndpost}')
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logger.debug(
            f'no syndication links from {permalink} to current source {source.label()}.'
        )
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logger.debug(
                f'saving empty relationship so that {permalink} will not be searched again'
            )
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.items():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logger.debug(f'discovered relationships {new_results}')
    return new_results
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
    """Fetch and process an h-entry, saving a new SyndicatedPost if successful.

    Syndication links are looked for in the h-feed copy of the entry first.
    The permalink page is only fetched when that yields nothing and the
    source has no ``last_feed_syndication_url``.

    Args:
      source: the source being processed; its ``updates`` dict gets
        ``last_feed_syndication_url`` set when the feed entry yields results
      permalink: url of the unprocessed post
      feed_entry: the h-feed version of the h-entry dict, often contains
        a partial version of the h-entry at the permalink
      refetch: boolean, whether to refetch and process entries we've seen
        before
      preexisting: a list of previously discovered models.SyndicatedPosts
        for this permalink. Mutated in place: entries whose relationship
        has disappeared are deleted and removed from this list.
      store_blanks: boolean, whether we should store blank SyndicatedPosts
        when we don't find a relationship

    Returns:
      a dict from syndicated url to a list of new models.SyndicatedPosts
    """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logging.debug(
                'previously found relationship(s) for original %s: %s',
                permalink, synds)

    # first try with the h-entry from the h-feed. if we find the syndication
    # url we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    if usynd:
        logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)

    results = _process_syndication_urls(
        source, permalink,
        set(url for url in usynd if isinstance(url, basestring)),
        preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url:
        # fetch the full permalink page if we think it might have more details
        parsed = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            if type_ok:
                resp = util.requests_get(permalink)
                resp.raise_for_status()
                parsed = util.mf2py_parse(resp.text, permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logging.warning('Could not fetch permalink %s', permalink,
                            exc_info=True)
            # a failed fetch tells us nothing about which links still exist,
            # so it must not trigger the deletion pass below
            success = False

        if parsed:
            syndication_urls = set()
            relsynd = parsed.get('rels').get('syndication', [])
            if relsynd:
                logging.debug('rel-syndication links: %s', relsynd)
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, basestring))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in parsed['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logging.debug('u-syndication links: %s', usynd)
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, basestring))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        # This has to be a real list. A bare itertools.chain iterator is
        # consumed by the first ``in`` test, so every later preexisting
        # entry would look like it had disappeared and be deleted.
        result_syndposts = list(
            itertools.chain.from_iterable(results.values()))
        # iterate over a copy because entries are removed inside the loop
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logging.info('deleting relationship that disappeared: %s',
                             syndpost)
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logging.debug(
                'saving empty relationship so that %s will not be '
                'searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.iteritems():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logging.debug('discovered relationships %s', new_results)
    return new_results
def _process_entry(source, permalink, feed_entry, refetch_blanks, preexisting):
    """Fetch and process an h-entry, saving a new SyndicatedPost if successful.

    The h-feed copy of the entry is checked for syndication links first; the
    permalink page is fetched and parsed only when that finds nothing.

    Args:
      source: the source being processed; handed on to
        _process_syndication_urls and SyndicatedPost.insert_original_blank
      permalink: url of the unprocessed post
      feed_entry: the h-feed version of the h-entry dict, often contains
        a partial version of the h-entry at the permalink
      refetch_blanks: boolean whether we should ignore blank preexisting
        SyndicatedPosts
      preexisting: a list of previously discovered models.SyndicatedPosts
        for this permalink

    Returns:
      a dict from syndicated url to a list of new models.SyndicatedPosts
    """
    # An already-processed post contributes nothing, since only *newly*
    # discovered relationships are returned. The one exception is a post
    # whose preexisting entries are all blank while blanks are being
    # refetched. A blank should be the only entry, but every entry is
    # checked to be safe.
    if preexisting:
        if not refetch_blanks or any(post.syndication for post in preexisting):
            return {}
        logging.debug('ignoring blank relationship for original %s', permalink)

    # Try the h-entry from the h-feed first; if it already carries the
    # syndication url we're after, the permalink need not be fetched.
    feed_links = feed_entry.get('properties', {}).get('syndication', [])
    logging.debug('u-syndication links on the h-feed h-entry: %s', feed_links)
    feed_urls = {url for url in feed_links if isinstance(url, basestring)}
    results = _process_syndication_urls(source, permalink, feed_urls)

    # fall back to the full permalink page, which often has more detail
    if not results:
        page = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            permalink, _, type_ok = util.get_webmention_target(permalink)
            if type_ok:
                response = requests.get(permalink, timeout=HTTP_TIMEOUT)
                response.raise_for_status()
                page = mf2py.Parser(url=permalink, doc=response.text).to_dict()
        except BaseException:
            # TODO limit the number of allowed failures
            logging.warning('Could not fetch permalink %s', permalink,
                            exc_info=True)

        if page:
            rel_links = page.get('rels').get('syndication', [])
            logging.debug('rel-syndication links: %s', rel_links)
            found_urls = set(url for url in rel_links
                             if isinstance(url, basestring))

            # a permalink page should hold a single h-entry, but look at
            # every one of them just in case
            for item in page['items']:
                if 'h-entry' not in item['type']:
                    continue
                entry_links = item.get('properties', {}).get('syndication', [])
                logging.debug('u-syndication links: %s', entry_links)
                found_urls.update(url for url in entry_links
                                  if isinstance(url, basestring))

            results = _process_syndication_urls(source, permalink, found_urls)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        if not preexisting:
            # record that this post has no syndication links for this
            # particular source
            logging.debug('saving empty relationship so that it %s will not '
                          'be searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    logging.debug('discovered relationships %s', results)
    return results