def test_refetch_hfeed(self):
    """refetch should grab resources again, even if they were previously
    marked with a blank SyndicatedPost
    """
    # refetch 1 and 3 to see if they've been updated, 2 has already
    # been resolved for this source
    SyndicatedPost(parent=self.source.key,
                   original="http://author/permalink1",
                   syndication=None).put()
    SyndicatedPost(
        parent=self.source.key,
        original="http://author/permalink2",
        syndication="https://fa.ke/post/url2"
    ).put()
    SyndicatedPost(parent=self.source.key,
                   original="http://author/permalink3",
                   syndication=None).put()

    self.expect_requests_get(
        "http://author",
        """
    <html class="h-feed">
      <a class="h-entry" href="/permalink1"></a>
      <a class="h-entry" href="/permalink2"></a>
      <a class="h-entry" href="/permalink3"></a>
    </html>""",
    )

    # yay, permalink1 has an updated syndication url
    self.expect_requests_get(
        "http://author/permalink1",
        """
    <html class="h-entry">
      <a class="u-url" href="/permalink1"></a>
      <a class="u-syndication" href="https://fa.ke/post/url1"></a>
    </html>""",
    ).InAnyOrder()

    # permalink2 hasn't changed since we first checked it
    self.expect_requests_get(
        "http://author/permalink2",
        """
    <html class="h-entry">
      <a class="u-url" href="/permalink2"></a>
      <a class="u-syndication" href="https://fa.ke/post/url2"></a>
    </html>""",
    ).InAnyOrder()

    # permalink3 hasn't changed since we first checked it
    self.expect_requests_get(
        "http://author/permalink3",
        """
    <html class="h-entry">
      <a class="u-url" href="/permalink3"></a>
    </html>""",
    ).InAnyOrder()

    self.mox.ReplayAll()
    refetch(self.source)
    self.assert_syndicated_posts(
        ("http://author/permalink1", "https://fa.ke/post/url1"),
        ("http://author/permalink2", "https://fa.ke/post/url2"),
        ("http://author/permalink3", None),
    )
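# The assert_syndicated_posts() helper used above isn't shown in this section.
# Judging from the explicit queries in the older tests below, which compare
# (original, syndication) pairs fetched from SyndicatedPost, it presumably
# looks roughly like the following sketch. The exact name, signature, and
# ordering semantics here are assumptions for illustration, not the actual
# helper's implementation.
def assert_syndicated_posts(self, *expected):
    """Sketch: the source's SyndicatedPosts should match the given (original, syndication) pairs."""
    actual = [(sp.original, sp.syndication)
              for sp in SyndicatedPost.query(ancestor=self.source.key)]
    self.assertItemsEqual(expected, actual)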
def test_refetch_multiple_responses_same_activity(self):
    """Ensure that refetching a post that has several replies does not
    generate duplicate original -> None blank entries in the database.
    See https://github.com/snarfed/bridgy/issues/259 for details
    """
    for activity in self.activities:
        activity['object']['content'] = 'post content without backlinks'
        activity['object']['url'] = 'https://fa.ke/post/url'

    author_feed = """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>"""

    author_entry = """
    <html class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </html>"""

    # original
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    # refetch
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    self.mox.ReplayAll()

    for activity in self.activities:
        discover(self.source, activity)
    refetch(self.source)
    self.assert_syndicated_posts(('http://author/post/permalink', None),
                                 (None, 'https://fa.ke/post/url'))
def test_multiple_refetches(self):
    """Ensure that multiple refetches of the same post (with and without
    u-syndication) do not generate duplicate blank entries in the database.
    See https://github.com/snarfed/bridgy/issues/259 for details
    """
    self.activities[0]['object'].update({
        'content': 'post content without backlinks',
        'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""

    unsyndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    </html>"""

    syndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>"""

    # first attempt, no syndication url yet
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # refetch, still no syndication url
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # second refetch, has a syndication url this time
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', syndicated)

    self.mox.ReplayAll()
    original_post_discovery.discover(self.source, self.activities[0])
    original_post_discovery.refetch(self.source)

    relations = list(
        SyndicatedPost.query(
            SyndicatedPost.original == 'http://author/permalink',
            ancestor=self.source.key).fetch())
    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertIsNone(relations[0].syndication)

    original_post_discovery.refetch(self.source)
    relations = list(
        SyndicatedPost.query(
            SyndicatedPost.original == 'http://author/permalink',
            ancestor=self.source.key).fetch())
    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertEquals('https://fa.ke/post/url', relations[0].syndication)
def test_refetch_multiple_responses_same_activity(self):
    """Ensure that refetching a post that has several replies does not
    generate duplicate original -> None blank entries in the database.
    See https://github.com/snarfed/bridgy/issues/259 for details
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    for activity in self.activities:
        activity['object']['content'] = 'post content without backlinks'
        activity['object']['url'] = 'https://fa.ke/post/url'

    author_feed = """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>"""

    author_entry = """
    <html class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </html>"""

    # original
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    # refetch
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    self.mox.ReplayAll()

    for activity in self.activities:
        original_post_discovery.discover(source, activity)
    original_post_discovery.refetch(source)

    rels_by_original = list(
        SyndicatedPost.query(
            SyndicatedPost.original == 'http://author/post/permalink',
            ancestor=source.key).fetch())
    self.assertEquals(1, len(rels_by_original))
    self.assertIsNone(rels_by_original[0].syndication)

    rels_by_syndication = list(
        SyndicatedPost.query(
            SyndicatedPost.syndication == 'https://fa.ke/post/url',
            ancestor=source.key).fetch())
    self.assertEquals(1, len(rels_by_syndication))
    self.assertIsNone(rels_by_syndication[0].original)
def test_refetch_deleted_syndication(self):
    """Delete syndication links that have disappeared since our last fetch."""
    SyndicatedPost(
        parent=self.source.key,
        original="http://author/permalink",
        syndication="https://fa.ke/post/url"
    ).put()

    self.expect_requests_get(
        "http://author",
        """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
      </div>
    </html>""",
    )
    self.expect_requests_get(
        "http://author/permalink",
        """
    <html class="h-entry">
      <a class="u-url" href="/permalink"></a>
    </html>""",
    )

    self.mox.ReplayAll()
    self.assert_equals({}, refetch(self.source))
    self.assert_syndicated_posts(("http://author/permalink", None))
def test_refetch_multiple_domain_urls(self):
    """We should refetch all of a source's URLs."""
    self._expect_multiple_domain_url_fetches()
    result = refetch(self.source)
    self.assert_equals(['https://fa.ke/A', 'https://fa.ke/B'], result.keys())
    self.assert_syndicated_posts(('http://author1/A', 'https://fa.ke/A'),
                                 ('http://author3/B', 'https://fa.ke/B'))
def test_refetch_unchanged_syndication(self):
    """We should preserve unchanged SyndicatedPosts during refetches."""
    synd = SyndicatedPost(parent=self.source.key,
                          original='http://author/permalink',
                          syndication='https://fa.ke/post/url')
    synd.put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
        <a class="u-syndication" href="https://fa.ke/post/url"></a>
      </div>
    </html>""")

    self.mox.ReplayAll()
    refetch(self.source)
    self.assert_entities_equal([synd], list(SyndicatedPost.query()))
def refetch_hfeed(self, source):
    """Refetch and reprocess the author's URL, looking for new or updated
    syndication URLs that we may have missed the first time we looked for
    them.
    """
    logging.debug('refetching h-feed for source %s', source.label())
    relationships = original_post_discovery.refetch(source)
    if not relationships:
        return

    logging.debug('refetch h-feed found %d new rel=syndication relationships',
                  len(relationships))

    # grab the Responses and see if any of them have a syndication
    # url matching one of the newly discovered relationships. We'll
    # check each response until we've seen all of them or until
    # the 60s timer runs out.
    # TODO maybe add a (canonicalized) url field to Response so we can
    # query by it instead of iterating over all of them
    for response in (Response.query(Response.source == source.key)
                     .order(-Response.created)):
        if response.activity_json:  # handle old entities
            response.activities_json.append(response.activity_json)
            response.activity_json = None

        new_orig_urls = set()
        for activity_json in response.activities_json:
            activity = json.loads(activity_json)
            activity_url = activity.get('url') or activity.get('object', {}).get('url')
            if not activity_url:
                logging.warning('activity has no url %s', activity_json)
                continue

            activity_url = source.canonicalize_syndication_url(activity_url)
            # look for activity url in the newly discovered list of relationships
            for relationship in relationships.get(activity_url, []):
                # won't re-propagate if the discovered link is already among
                # these well-known upstream duplicates
                if relationship.original in response.sent:
                    logging.info(
                        '%s found a new rel=syndication link %s -> %s, but the '
                        'relationship had already been discovered by another method',
                        response.label(), relationship.original,
                        relationship.syndication)
                else:
                    logging.info(
                        '%s found a new rel=syndication link %s -> %s, and '
                        'will be repropagated with a new target!',
                        response.label(), relationship.original,
                        relationship.syndication)
                    new_orig_urls.add(relationship.original)

        if new_orig_urls:
            # re-open a previously 'complete' propagate task
            response.status = 'new'
            response.unsent.extend(list(new_orig_urls))
            response.put()
            response.add_task()
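# Sketch of the TODO above: if Response stored a canonicalized activity URL,
# refetch_hfeed() could query for the affected responses directly instead of
# scanning every Response for the source. The repeated property
# canonical_activity_urls and this helper are hypothetical, shown only to
# illustrate the idea; they are not part of the current models.
def repropagate_by_canonical_url(self, source, relationships):
    """Sketch: repropagate responses matched by a hypothetical canonicalized URL field."""
    for activity_url, rels in relationships.items():
        # assumes a repeated StringProperty populated when the Response is saved
        query = Response.query(Response.source == source.key,
                               Response.canonical_activity_urls == activity_url)
        for response in query:
            new_orig_urls = [rel.original for rel in rels
                             if rel.original not in response.sent]
            if new_orig_urls:
                # re-open a previously 'complete' propagate task, as above
                response.status = 'new'
                response.unsent.extend(new_orig_urls)
                response.put()
                response.add_task()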
def test_multiple_refetches(self):
    """Ensure that multiple refetches of the same post (with and without
    u-syndication) do not generate duplicate blank entries in the database.
    See https://github.com/snarfed/bridgy/issues/259 for details
    """
    self.activities[0]['object'].update({
        'content': 'post content without backlinks',
        'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""

    unsyndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    </html>"""

    syndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>"""

    # first attempt, no syndication url yet
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # refetch, still no syndication url
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # second refetch, has a syndication url this time
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', syndicated)

    self.mox.ReplayAll()
    discover(self.source, self.activities[0])
    refetch(self.source)
    self.assert_syndicated_posts(('http://author/permalink', None),
                                 (None, u'https://fa.ke/post/url'))

    refetch(self.source)
    self.assert_syndicated_posts(
        ('http://author/permalink', 'https://fa.ke/post/url'))
def test_refetch_two_permalinks_same_syndication(self):
    """Two permalinks that share the same syndication URL.

    This causes a problem if refetch assumes that the syndication URL is
    unique under a given source.
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.activities[0]['object'].update({
        'content': 'post content without backlinks',
        'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/post1"></a>
    <a class="h-entry" href="/post2"></a>
    </html>"""

    self.expect_requests_get('http://author', hfeed)
    for i in range(2):
        self.expect_requests_get(
            'http://author/post%d' % (i + 1),
            """<html class="h-entry">
            <a class="u-url" href="/post%d"></a>
            <a class="u-syndication" href="https://fa.ke/post/url"></a>
            </html>""" % (i + 1))

    # refetch should only grab the feed
    self.expect_requests_get('http://author', hfeed)
    self.mox.ReplayAll()

    activity = original_post_discovery.discover(source, self.activities[0])
    self.assertItemsEqual(['http://author/post1', 'http://author/post2'],
                          activity['object'].get('upstreamDuplicates'))

    relations = SyndicatedPost.query(ancestor=source.key).fetch()
    self.assertItemsEqual([('http://author/post1', 'https://fa.ke/post/url'),
                           ('http://author/post2', 'https://fa.ke/post/url')],
                          [(relation.original, relation.syndication)
                           for relation in relations])

    # discover should have already handled all relationships, refetch should
    # not find anything
    refetch_result = original_post_discovery.refetch(source)
    self.assertFalse(refetch_result)
def test_refetch_two_permalinks_same_syndication(self):
    """Two permalinks that share the same syndication URL.

    This causes a problem if refetch assumes that the syndication URL is
    unique under a given source.
    """
    self.activities[0]["object"].update(
        {"content": "post content without backlinks", "url": "https://fa.ke/post/url"}
    )

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/post1"></a>
    <a class="h-entry" href="/post2"></a>
    </html>"""

    hentries = [
        (
            "http://author/post%d" % (i + 1),
            """<html class="h-entry">
            <a class="u-url" href="/post%d"></a>
            <a class="u-syndication" href="https://fa.ke/post/url"></a>
            </html>""" % (i + 1),
        )
        for i in range(2)
    ]

    self.expect_requests_get("http://author", hfeed)
    for permalink, content in hentries:
        self.expect_requests_get(permalink, content)

    # refetch
    self.expect_requests_get("http://author", hfeed)
    for permalink, content in hentries:
        self.expect_requests_get(permalink, content)

    self.mox.ReplayAll()
    self.assert_discover(["http://author/post1", "http://author/post2"])
    self.assert_syndicated_posts(
        ("http://author/post1", "https://fa.ke/post/url"),
        ("http://author/post2", "https://fa.ke/post/url")
    )

    # discover should have already handled all relationships, refetch should
    # not find anything
    self.assertFalse(refetch(self.source))
def test_refetch_permalink_with_two_syndications(self):
    """Test one permalink with two syndicated posts. Make sure that
    refetch doesn't have a problem with two entries for the same
    original URL.
    """
    for idx, activity in enumerate(self.activities):
        activity['object'].update({
            'content': 'post content without backlinks',
            'url': 'https://fa.ke/post/url%d' % (idx + 1),
        })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""
    hentry = """<html class="h-entry">
    <a class="u-url" href="/permalink"/>
    <a class="u-syndication" href="https://fa.ke/post/url1"/>
    <a class="u-syndication" href="https://fa.ke/post/url3"/>
    <a class="u-syndication" href="https://fa.ke/post/url5"/>
    </html>"""

    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', hentry)

    # refetch
    self.expect_requests_get('http://author', hfeed)
    # refetch grabs posts that it's seen before in case there have
    # been updates
    self.expect_requests_get('http://author/permalink', hentry)
    self.mox.ReplayAll()

    original_post_discovery.discover(self.source, self.activities[0])

    relations = SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink',
        ancestor=self.source.key).fetch()
    self.assertItemsEqual(
        [('http://author/permalink', 'https://fa.ke/post/url1'),
         ('http://author/permalink', 'https://fa.ke/post/url3'),
         ('http://author/permalink', 'https://fa.ke/post/url5')],
        [(r.original, r.syndication) for r in relations])

    results = original_post_discovery.refetch(self.source)
    self.assertFalse(results)
def test_refetch_changed_syndication(self):
    """Update syndication links that have changed since our last fetch."""
    SyndicatedPost(parent=self.source.key,
                   original='http://author/permalink',
                   syndication='https://fa.ke/post/url').put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
        <a class="u-syndication" href="http://fa.ke/changed/url"></a>
      </div>
    </html>""")

    self.mox.ReplayAll()
    results = refetch(self.source)
    self.assert_syndicated_posts(
        ('http://author/permalink', 'https://fa.ke/changed/url'))
    self.assert_equals({'https://fa.ke/changed/url': list(SyndicatedPost.query())},
                       results)
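# Note that the h-feed above links to http://fa.ke/changed/url but the stored
# SyndicatedPost ends up with https://fa.ke/changed/url, because the source
# canonicalizes syndication urls before storing them (see
# canonicalize_syndication_url in refetch_hfeed above). A minimal sketch of
# what such a canonicalizer could do, assuming the fake silo only needs https
# forced and trailing slashes stripped; the real implementation is per-silo
# and lives on the Source subclass:
def canonicalize_syndication_url(self, url):
    """Sketch only: force https and drop a trailing slash."""
    if url.startswith('http://'):
        url = 'https://' + url[len('http://'):]
    return url.rstrip('/')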
def test_refetch_blank_syndication(self):
    """We should preserve blank SyndicatedPosts during refetches."""
    blank = SyndicatedPost(parent=self.source.key,
                           original='http://author/permalink',
                           syndication=None)
    blank.put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
      </div>
    </html>""")
    self.expect_requests_get('http://author/permalink', """
    <html class="h-entry">
      <a class="u-url" href="/permalink"></a>
    </html>""")

    self.mox.ReplayAll()
    self.assert_equals({}, refetch(self.source))
    self.assert_syndicated_posts(('http://author/permalink', None))
def test_refetch_permalink_with_two_syndications(self):
    """Test one permalink with two syndicated posts. Make sure that
    refetch doesn't have a problem with two entries for the same
    original URL.
    """
    for idx, activity in enumerate(self.activities):
        activity['object'].update({
            'content': 'post content without backlinks',
            'url': 'https://fa.ke/post/url%d' % (idx + 1),
        })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""
    hentry = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    <a class="u-syndication" href="https://fa.ke/post/url1"></a>
    <a class="u-syndication" href="https://fa.ke/post/url3"></a>
    <a class="u-syndication" href="https://fa.ke/post/url5"></a>
    </html>"""

    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', hentry)

    # refetch
    self.expect_requests_get('http://author', hfeed)
    # refetch grabs posts that it's seen before in case there have been updates
    self.expect_requests_get('http://author/permalink', hentry)

    self.mox.ReplayAll()
    discover(self.source, self.activities[0])
    self.assert_syndicated_posts(
        ('http://author/permalink', 'https://fa.ke/post/url1'),
        ('http://author/permalink', 'https://fa.ke/post/url3'),
        ('http://author/permalink', 'https://fa.ke/post/url5'))
    self.assertFalse(refetch(self.source))
def test_refetch_hfeed(self):
    """refetch should grab resources again, even if they were previously
    marked with a blank SyndicatedPost
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    # refetch 1 and 3 to see if they've been updated, 2 has already
    # been resolved for this source
    SyndicatedPost(parent=source.key,
                   original='http://author/permalink1',
                   syndication=None).put()
    SyndicatedPost(parent=source.key,
                   original='http://author/permalink2',
                   syndication='https://fa.ke/post/url2').put()
    SyndicatedPost(parent=source.key,
                   original='http://author/permalink3',
                   syndication=None).put()

    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <a class="h-entry" href="/permalink1"></a>
      <a class="h-entry" href="/permalink2"></a>
      <a class="h-entry" href="/permalink3"></a>
    </html>""")

    # yay, permalink1 has an updated syndication url
    self.expect_requests_get('http://author/permalink1', """
    <html class="h-entry">
      <a class="u-url" href="/permalink1"></a>
      <a class="u-syndication" href="https://fa.ke/post/url1"></a>
    </html>""").InAnyOrder()

    # permalink3 hasn't changed since we first checked it
    self.expect_requests_get('http://author/permalink3', """
    <html class="h-entry">
      <a class="u-url" href="/permalink3"></a>
    </html>""").InAnyOrder()

    self.mox.ReplayAll()
    original_post_discovery.refetch(source)

    relationships1 = SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink1',
        ancestor=source.key).fetch()
    self.assertTrue(relationships1)
    self.assertEquals('https://fa.ke/post/url1', relationships1[0].syndication)

    relationships2 = SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink2',
        ancestor=source.key).fetch()
    # this shouldn't have changed
    self.assertTrue(relationships2)
    self.assertEquals('https://fa.ke/post/url2', relationships2[0].syndication)

    relationships3 = SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink3',
        ancestor=source.key).fetch()
    self.assertTrue(relationships3)
    self.assertIsNone(relationships3[0].syndication)
def test_refetch_with_updated_permalink(self):
    """Permalinks can change (e.g., if a stub is added or modified).

    This causes a problem if refetch assumes that the syndication URL is
    unique under a given source.
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.activities[0]['object'].update({
        'content': 'post content without backlinks',
        'url': 'https://fa.ke/post/url',
    })

    # first attempt, no stub yet
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <a class="h-entry" href="/2014/08/09"></a>
    </html>""")
    self.expect_requests_get('http://author/2014/08/09', """
    <html class="h-entry">
      <a class="u-url" href="/2014/08/09"></a>
      <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""")

    # refetch, permalink has a stub now
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <a class="h-entry" href="/2014/08/09/this-is-a-stub"></a>
    </html>""")
    self.expect_requests_get('http://author/2014/08/09/this-is-a-stub', """
    <html class="h-entry">
      <a class="u-url" href="/2014/08/09/this-is-a-stub"></a>
      <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""")

    # refetch again (feed-only this time)
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <a class="h-entry" href="/2014/08/09/this-is-a-stub"></a>
    </html>""")

    self.mox.ReplayAll()
    activity = original_post_discovery.discover(source, self.activities[0])

    # modified activity should have /2014/08/09 as an upstreamDuplicate now
    self.assertEquals(['http://author/2014/08/09'],
                      activity['object']['upstreamDuplicates'])

    # refetch should find the updated original url -> syndication url.
    # it should *not* find the previously discovered relationship.
    first_results = original_post_discovery.refetch(source)
    self.assertEquals(1, len(first_results))
    new_relations = first_results.get('https://fa.ke/post/url')
    self.assertEquals(1, len(new_relations))
    self.assertEquals('https://fa.ke/post/url', new_relations[0].syndication)
    self.assertEquals('http://author/2014/08/09/this-is-a-stub',
                      new_relations[0].original)

    # second refetch should find nothing because nothing has changed
    # since the previous refetch.
    second_results = original_post_discovery.refetch(source)
    self.assertFalse(second_results)
class Poll(webapp2.RequestHandler):
    """Task handler that fetches and processes new responses from a single source.

    Request parameters:
      source_key: string key of source entity
      last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

    Inserts a propagate task for each response that hasn't been seen before.
    """

    def post(self, *path_args):
        logging.debug('Params: %s', self.request.params)

        key = self.request.params['source_key']
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                     source.bridgy_url(self))

        last_polled = self.request.params['last_polled']
        if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
            logging.warning('duplicate poll task! deferring to the other task.')
            return

        logging.info('Last poll: %s/log?start_time=%s&key=%s',
                     self.request.host_url,
                     calendar.timegm(source.last_poll_attempt.utctimetuple()),
                     source.key.urlsafe())

        # mark this source as polling
        source.updates = {
            'poll_status': 'polling',
            'last_poll_attempt': util.now_fn(),
        }
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except models.DisableSource:
            # the user deauthorized the bridgy app, so disable this source.
            # let the task complete successfully so that it's not retried.
            source.updates['status'] = 'disabled'
            logging.warning('Disabling source!')
        except:
            source.updates['poll_status'] = 'error'
            raise
        finally:
            source = models.Source.put_updates(source)

        # add new poll task. randomize task ETA to within +/- 20% to try to spread
        # out tasks and prevent thundering herds.
        task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2)
        util.add_poll_task(source, countdown=task_countdown)

        # feeble attempt to avoid hitting the instance memory limit
        source = None
        gc.collect()

    def poll(self, source):
        """Actually runs the poll.

        Stores property names and values to update in source.updates.
        """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        try:
            # search for links first so that the user's activities and responses
            # override them if they overlap
            links = source.search_for_links()

            # this user's own activities (and user mentions)
            resp = source.get_activities_response(
                fetch_replies=True, fetch_likes=True, fetch_shares=True,
                fetch_mentions=True, count=50,
                etag=source.last_activities_etag, min_id=source.last_activity_id,
                cache=cache)
            etag = resp.get('etag')  # used later
            user_activities = resp.get('items', [])

            # these map ids to AS objects
            responses = {a['id']: a for a in links}
            activities = {a['id']: a for a in links + user_activities}

        except Exception, e:
            code, body = util.interpret_http_exception(e)
            if code == '401':
                msg = 'Unauthorized error: %s' % e
                logging.warning(msg, exc_info=True)
                source.updates['poll_status'] = 'ok'
                raise models.DisableSource(msg)
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning('Rate limited. Marking as error and finishing. %s', e)
                source.updates.update({'poll_status': 'error', 'rate_limited': True})
                return
            elif (code and int(code) / 100 == 5) or util.is_connection_failure(e):
                logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                              code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json.dumps({
            k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else private)[id] = activity
        logging.info('Found %d public activities: %s', len(public), public.keys())
        logging.info('Found %d private activities: %s', len(private), private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                    util.as_utc(util.parse_iso8601(max_published))

        source.updates['recent_private_posts'] = \
            len([a for a in private.values()
                 if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get('author', {}).get('id') != user_id:
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
                        activity['originals'], activity['mentions'] = \
                            original_post_discovery.discover(
                                source, activity, fetch_hfeed=True,
                                include_redirect_sources=False,
                                already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(u.get('value') for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article') and
                        att.get('author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                            original_post_discovery.discover(
                                source, activity, fetch_hfeed=True,
                                include_redirect_sources=False,
                                already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json.dumps(resp, indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp, existing, log=True):
                        logging.warning('Got two different versions of same response!\n%s\n%s',
                                        existing, resp)
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                        original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp, originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets), ' '.join(targets))
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning('Giving up on target URL over %s chars! %s',
                                        _MAX_STRING_LENGTH, t)
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(
                id=id,
                source=source.key,
                activities_json=[json.dumps(util.prune_activity(a, source))
                                 for a in activities],
                response_json=json.dumps(pruned_response),
                type=resp_type,
                unsent=list(urls_to_activity.keys()),
                failed=list(too_long),
                original_posts=resp.get('originals', []))
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json.dumps(
                pruned_responses + unchanged_responses)

        source.updates.update({'last_polled': source.last_poll_attempt,
                               'poll_status': 'ok'})
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException, e:
                    if (isinstance(e, (datastore_errors.BadRequestError,
                                       datastore_errors.Timeout)) or
                            util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     exc_info=True)
                    else:
                        raise
def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
        logging.debug('Using ETag %s, last activity id %s',
                      source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
        cache.update(json.loads(source.last_activities_cache_json))

    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        fetch_mentions=True, count=50, etag=source.last_activities_etag,
        min_id=source.last_activity_id, cache=cache)
    etag = resp.get('etag')  # used later
    user_activities = resp.get('items', [])

    # these map ids to AS objects
    responses = {a['id']: a for a in links}
    activities = {a['id']: a for a in links + user_activities}

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
        # maybe replace stored last activity id
        parsed = util.parse_tag_uri(id)
        if parsed:
            id = parsed[1]
        silo_activity_ids.add(id)
        try:
            # try numeric comparison first
            greater = int(id) > int(last_activity_id)
        except (TypeError, ValueError):
            greater = id > last_activity_id
        if greater:
            last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
        source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps(
        {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

    self.backfeed(source, responses, activities=activities)

    source.updates.update({'last_polled': source.last_poll_attempt,
                           'poll_status': 'ok'})
    if etag and etag != source.last_activities_etag:
        source.updates['last_activities_etag'] = etag

    #
    # Possibly refetch updated syndication urls.
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
        logging.info('refetching h-feed for source %s', source.label())
        relationships = original_post_discovery.refetch(source)

        now = util.now_fn()
        source.updates['last_hfeed_refetch'] = now

        if relationships:
            logging.info(
                'refetch h-feed found new rel=syndication relationships: %s',
                relationships)
            try:
                self.repropagate_old_responses(source, relationships)
            except BaseException, e:
                if (isinstance(e, (datastore_errors.BadRequestError,
                                   datastore_errors.Timeout)) or
                        util.is_connection_failure(e)):
                    logging.info('Timeout while repropagating responses.',
                                 exc_info=True)
                else:
                    raise
def test_refetch_with_updated_permalink(self):
    """Permalinks can change (e.g., if a stub is added or modified).

    This causes a problem if refetch assumes that the syndication URL is
    unique under a given source.
    """
    self.activities[0]["object"].update(
        {"content": "post content without backlinks", "url": "https://fa.ke/post/url"}
    )

    # first attempt, no stub yet
    self.expect_requests_get(
        "http://author",
        """
    <html class="h-feed">
      <a class="h-entry" href="/2014/08/09"></a>
    </html>""",
    )
    self.expect_requests_get(
        "http://author/2014/08/09",
        """
    <html class="h-entry">
      <a class="u-url" href="/2014/08/09"></a>
      <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""",
    )

    # refetch, permalink has a stub now
    self.expect_requests_get(
        "http://author",
        """
    <html class="h-feed">
      <a class="h-entry" href="/2014/08/09/this-is-a-stub"></a>
    </html>""",
    )
    self.expect_requests_get(
        "http://author/2014/08/09/this-is-a-stub",
        """
    <html class="h-entry">
      <a class="u-url" href="/2014/08/09/this-is-a-stub"></a>
      <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""",
    )

    # refetch again
    self.expect_requests_get(
        "http://author",
        """
    <html class="h-feed">
      <a class="h-entry" href="/2014/08/09/this-is-a-stub"></a>
    </html>""",
    )
    # permalink hasn't changed
    self.expect_requests_get(
        "http://author/2014/08/09/this-is-a-stub",
        """
    <html class="h-entry">
      <a class="u-url" href="/2014/08/09/this-is-a-stub"></a>
      <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""",
    )

    self.mox.ReplayAll()
    # modified activity should have /2014/08/09 as an upstreamDuplicate now
    self.assert_discover(["http://author/2014/08/09"])

    # refetch should find the updated original url -> syndication url.
    # it should *not* find the previously discovered relationship.
    first_results = refetch(self.source)
    self.assertEquals(1, len(first_results))
    new_relations = first_results.get("https://fa.ke/post/url")
    self.assertEquals(1, len(new_relations))
    self.assertEquals("https://fa.ke/post/url", new_relations[0].syndication)
    self.assertEquals("http://author/2014/08/09/this-is-a-stub",
                      new_relations[0].original)

    # second refetch should find nothing because nothing has changed
    # since the previous refetch.
    self.assertFalse(refetch(self.source))
def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
        logging.debug("Using ETag %s, last activity id %s",
                      source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
        cache.update(json.loads(source.last_activities_cache_json))

    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
        fetch_replies=True,
        fetch_likes=True,
        fetch_shares=True,
        fetch_mentions=True,
        count=50,
        etag=source.last_activities_etag,
        min_id=source.last_activity_id,
        cache=cache,
    )
    etag = resp.get("etag")  # used later
    user_activities = resp.get("items", [])

    # these map ids to AS objects
    responses = {a["id"]: a for a in links}
    activities = {a["id"]: a for a in links + user_activities}

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
        # maybe replace stored last activity id
        parsed = util.parse_tag_uri(id)
        if parsed:
            id = parsed[1]
        silo_activity_ids.add(id)
        try:
            # try numeric comparison first
            greater = int(id) > int(last_activity_id)
        except (TypeError, ValueError):
            greater = id > last_activity_id
        if greater:
            last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
        source.updates["last_activity_id"] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates["last_activities_cache_json"] = json.dumps(
        {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}
    )

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
        (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info("Found %d public activities: %s", len(public), public.keys())
    logging.info("Found %d private activities: %s", len(private), private.keys())

    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls([a.get("published") for a in public.values()])
    if public_published:
        max_published = max(public_published)
        if max_published > last_public_post:
            last_public_post = max_published
            source.updates["last_public_post"] = util.as_utc(
                util.parse_iso8601(max_published))

    source.updates["recent_private_posts"] = len(
        [a for a in private.values()
         if a.get("published", util.EPOCH_ISO) > last_public_post]
    )

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
        obj = activity.get("object") or activity

        # handle user mentions
        user_id = source.user_tag_id()
        if obj.get("author", {}).get("id") != user_id:
            for tag in obj.get("tags", []):
                urls = tag.get("urls")
                if tag.get("objectType") == "person" and tag.get("id") == user_id and urls:
                    activity["originals"], activity["mentions"] = original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )
                    activity["mentions"].update(u.get("value") for u in urls)
                    responses[id] = activity
                    break

        # handle quote mentions
        for att in obj.get("attachments", []):
            if (
                att.get("objectType") in ("note", "article")
                and att.get("author", {}).get("id") == source.user_tag_id()
            ):
                # now that we've confirmed that one exists, OPD will dig
                # into the actual attachments
                if "originals" not in activity or "mentions" not in activity:
                    activity["originals"], activity["mentions"] = original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )
                responses[id] = activity
                break

        # extract replies, likes, reactions, reposts, and rsvps
        replies = obj.get("replies", {}).get("items", [])
        tags = obj.get("tags", [])
        likes = [t for t in tags if Response.get_type(t) == "like"]
        reactions = [t for t in tags if Response.get_type(t) == "react"]
        reposts = [t for t in tags if Response.get_type(t) == "repost"]
        rsvps = Source.get_rsvps_from_event(obj)

        # coalesce responses. drop any without ids
        for resp in replies + likes + reactions + reposts + rsvps:
            id = resp.get("id")
            if not id:
                logging.error("Skipping response without id: %s",
                              json.dumps(resp, indent=2))
                continue

            resp.setdefault("activities", []).append(activity)

            # when we find two responses with the same id, the earlier one may have
            # come from a link post or user mention, and this one is probably better
            # since it probably came from the user's activity, so prefer this one.
            # background: https://github.com/snarfed/bridgy/issues/533
            existing = responses.get(id)
            if existing:
                if source.gr_source.activity_changed(resp, existing, log=True):
                    logging.warning("Got two different versions of same response!\n%s\n%s",
                                    existing, resp)
                resp["activities"].extend(existing.get("activities", []))

            responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
        for seen in json.loads(source.seen_responses_cache_json):
            id = seen["id"]
            resp = responses.get(id)
            if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                unchanged_responses.append(seen)
                del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for id, resp in responses.items():
        resp_type = Response.get_type(resp)
        activities = resp.pop("activities", [])
        if not activities and resp_type == "post":
            activities = [resp]
        too_long = set()
        urls_to_activity = {}
        for i, activity in enumerate(activities):
            # we'll usually have multiple responses for the same activity, and the
            # objects in resp['activities'] are shared, so cache each activity's
            # discovered webmention targets inside its object.
            if "originals" not in activity or "mentions" not in activity:
                activity["originals"], activity["mentions"] = original_post_discovery.discover(
                    source,
                    activity,
                    fetch_hfeed=True,
                    include_redirect_sources=False,
                    already_fetched_hfeeds=fetched_hfeeds,
                )

            targets = original_post_discovery.targets_for_response(
                resp, originals=activity["originals"], mentions=activity["mentions"]
            )
            if targets:
                logging.info(
                    "%s has %d webmention target(s): %s",
                    activity.get("url"), len(targets), " ".join(targets)
                )
            for t in targets:
                if len(t) <= _MAX_STRING_LENGTH:
                    urls_to_activity[t] = i
                else:
                    logging.warning("Giving up on target URL over %s chars! %s",
                                    _MAX_STRING_LENGTH, t)
                    too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...")

        # store/update response entity. the prune_*() calls are important to
        # remove circular references in link responses, which are their own
        # activities. details in the step 2 comment above.
        pruned_response = util.prune_response(resp)
        pruned_responses.append(pruned_response)
        resp_entity = Response(
            id=id,
            source=source.key,
            activities_json=[json.dumps(util.prune_activity(a, source))
                             for a in activities],
            response_json=json.dumps(pruned_response),
            type=resp_type,
            unsent=list(urls_to_activity.keys()),
            failed=list(too_long),
            original_posts=resp.get("originals", []),
        )
        if urls_to_activity and len(activities) > 1:
            resp_entity.urls_to_activity = json.dumps(urls_to_activity)
        resp_entity.get_or_save(source)

    # update cache
    if pruned_responses:
        source.updates["seen_responses_cache_json"] = json.dumps(
            pruned_responses + unchanged_responses)

    source.updates.update({"last_polled": source.last_poll_attempt,
                           "poll_status": "ok"})
    if etag and etag != source.last_activities_etag:
        source.updates["last_activities_etag"] = etag

    #
    # Step 5. possibly refetch updated syndication urls
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
        logging.info("refetching h-feed for source %s", source.label())
        relationships = original_post_discovery.refetch(source)

        now = util.now_fn()
        source.updates["last_hfeed_refetch"] = now

        if relationships:
            logging.info("refetch h-feed found new rel=syndication relationships: %s",
                         relationships)
            try:
                self.repropagate_old_responses(source, relationships)
            except BaseException, e:
                if isinstance(
                    e, (datastore_errors.BadRequestError, datastore_errors.Timeout)
                ) or util.is_connection_failure(e):
                    logging.info("Timeout while repropagating responses.",
                                 exc_info=True)
                else:
                    raise
def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
        logging.debug('Using ETag %s, last activity id %s',
                      source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
        cache.update(json_loads(source.last_activities_cache_json))

    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(fetch_replies=True, fetch_likes=True,
                                          fetch_shares=True, fetch_mentions=True,
                                          count=50,
                                          etag=source.last_activities_etag,
                                          min_id=source.last_activity_id,
                                          cache=cache)
    etag = resp.get('etag')  # used later
    user_activities = resp.get('items', [])

    # these map ids to AS objects
    responses = {a['id']: a for a in links}
    activities = {a['id']: a for a in links + user_activities}

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
        # maybe replace stored last activity id
        parsed = util.parse_tag_uri(id)
        if parsed:
            id = parsed[1]
        silo_activity_ids.add(id)
        try:
            # try numeric comparison first
            greater = int(id) > int(last_activity_id)
        except (TypeError, ValueError):
            greater = str(id) > str(last_activity_id)
        if greater:
            last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
        source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json_dumps({
        k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids
    })

    self.backfeed(source, responses, activities=activities)

    source.updates.update({
        'last_polled': source.last_poll_attempt,
        'poll_status': 'ok'
    })
    if etag and etag != source.last_activities_etag:
        source.updates['last_activities_etag'] = etag

    #
    # Possibly refetch updated syndication urls.
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
        logging.info('refetching h-feed for source %s', source.label())
        relationships = original_post_discovery.refetch(source)

        now = util.now_fn()
        source.updates['last_hfeed_refetch'] = now

        if relationships:
            logging.info(
                'refetch h-feed found new rel=syndication relationships: %s',
                relationships)
            try:
                self.repropagate_old_responses(source, relationships)
            except BaseException as e:
                if ('BadRequestError' in str(e.__class__) or
                        'Timeout' in str(e.__class__) or
                        util.is_connection_failure(e)):
                    logging.info('Timeout while repropagating responses.',
                                 stack_info=True)
                else:
                    raise
    else:
        logging.info(
            'skipping refetch h-feed. last-syndication-url %s, last-refetch %s',
            source.last_syndication_url, source.last_hfeed_refetch)
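# should_refetch() lives on the Source model and isn't shown in this section.
# Based on how poll() uses it above (it gates the h-feed refetch on whether the
# author has *ever* published a rel=syndication url, and on last_hfeed_refetch),
# a rough sketch might look like the following. The refetch period constant and
# the exact comparison are assumptions for illustration, not the real settings;
# assumes datetime is imported.
def should_refetch(self):
    """Sketch: refetch only if we've ever seen a syndication url and enough time has passed."""
    if not self.last_syndication_url:
        return False
    REFETCH_PERIOD = datetime.timedelta(hours=2)  # assumed period, not the actual value
    last = self.last_hfeed_refetch or util.EPOCH
    return util.now_fn() >= last + REFETCH_PERIOD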
def receive(self, email):
    addr = self.request.path.split('/')[-1]
    message_id = email.original.get('message-id').strip('<>')
    sender = getattr(email, 'sender', None)
    to = getattr(email, 'to', None)
    cc = getattr(email, 'cc', None)
    subject = getattr(email, 'subject', None)
    logging.info('Received %s from %s to %s (%s) cc %s: %s',
                 message_id, sender, to, addr, cc, subject)

    user = addr.split('@')[0]
    source = FacebookEmailAccount.query(
        FacebookEmailAccount.email_user == user).get()
    logging.info('Source for %s is %s', user, source)

    util.email_me(subject='New email from %s: %s' % (sender, subject),
                  body='Source: %s' % (source.bridgy_url(self) if source else None))

    htmls = list(body.decode() for _, body in email.bodies('text/html'))
    fbe = FacebookEmail.get_or_insert(
        message_id, source=source.key if source else None, htmls=htmls)
    logging.info('FacebookEmail created %s: %s', fbe.created, fbe.key.urlsafe())

    if not source:
        self.response.status_code = 404
        self.response.write('No Facebook email user found with address %s' % addr)
        return

    for html in htmls:
        obj = gr_facebook.Facebook.email_to_object(html)
        if obj:
            break
    else:
        self.response.status_code = 400
        self.response.write('No HTML body could be parsed')
        return

    logging.info('Converted to AS1: %s', json.dumps(obj, indent=2))

    base_obj = source.gr_source.base_object(obj)
    # note that this ignores the id query param (the post's user id) and uses
    # the source object's user id instead.
    base_obj['url'] = source.canonicalize_url(base_obj['url'])
    # also note that base_obj['id'] is not a tag URI, it's the raw Facebook post
    # id, eg '104790764108207'. we don't use it from activities_json much,
    # though, just in PropagateResponse.source_url(), which handles this fine.

    original_post_discovery.refetch(source)
    targets, mentions = original_post_discovery.discover(source, base_obj,
                                                         fetch_hfeed=False)
    logging.info('Got targets %s mentions %s', targets, mentions)

    resp = Response(id=obj['id'],
                    source=source.key,
                    type=Response.get_type(obj),
                    response_json=json.dumps(obj),
                    activities_json=[json.dumps(base_obj)],
                    unsent=targets)
    resp.get_or_save(source, restart=True)

    fbe.response = resp.key
    fbe.put()