def test_get_webmention_target_blacklisted_urls(self):
  for resolve in True, False:
    self.assertTrue(util.get_webmention_target(
      'http://good.com/a', resolve=resolve)[2])

    for bad in ('http://facebook.com/x',
                'https://www.facebook.com/y',
                'http://sub.dom.ain.facebook.com/z'):
      self.assertFalse(util.get_webmention_target(bad, resolve=resolve)[2], bad)
def test_get_webmention_target_blocklisted_urls(self):
  for resolve in True, False:
    self.assertTrue(util.get_webmention_target(
      'http://good.com/a', resolve=resolve)[2])

    for bad in ('http://facebook.com/x',
                'https://www.facebook.com/y',
                'http://sub.dom.ain.facebook.com/z'):
      self.assertFalse(util.get_webmention_target(bad, resolve=resolve)[2], bad)
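# A minimal sketch of the domain check these two tests exercise: the third
# element of get_webmention_target's (url, domain, send) tuple must be False
# for facebook.com and any of its subdomains, for both resolve=True and
# resolve=False. The names below (BLOCKLIST, in_blocklist) are illustrative
# assumptions, not Bridgy's actual implementation.
BLOCKLIST = {'facebook.com'}

def in_blocklist(domain):
  """Returns True if domain is blocklisted or is a subdomain of one."""
  domain = domain.lower()
  return any(domain == bad or domain.endswith('.' + bad) for bad in BLOCKLIST)

assert in_blocklist('facebook.com')
assert in_blocklist('www.facebook.com')
assert in_blocklist('sub.dom.ain.facebook.com')
assert not in_blocklist('good.com')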
def get_webmention_targets(source, activity):
  """Returns a set of string target URLs to attempt to send webmentions to.

  Side effect: runs the original post discovery algorithm on the activity and
  adds the resulting URLs to the activity as tags, in place.

  Args:
    source: models.Source subclass
    activity: activity dict
  """
  original_post_discovery.discover(source, activity)

  targets = set()
  obj = activity.get('object') or activity

  for tag in obj.get('tags', []):
    url = tag.get('url')
    if url and tag.get('objectType') == 'article':
      url, domain, send = util.get_webmention_target(url)
      tag['url'] = url
      if send:
        targets.add(url)

  for url in obj.get('upstreamDuplicates', []):
    url, domain, send = util.get_webmention_target(url)
    if send:
      targets.add(url)

  return targets
def test_get_webmention_target_blacklisted_urls(self):
  gwt = util.get_webmention_target
  for bad in ('http://facebook.com/x',
              'https://www.facebook.com/y',
              'http://sub.dom.ain.facebook.com/z'):
    self.assertFalse(gwt(bad)[2], bad)

  self.assertTrue(gwt('http://good.com/a')[2])
def test_get_webmention_cleans_redirected_urls(self):
  self.expect_requests_head('http://foo/bar',
                            redirected_url='http://final?utm_source=x')
  self.mox.ReplayAll()

  self.assert_equals(('http://final', 'final', True),
                     util.get_webmention_target('http://foo/bar', resolve=True))
  self.assert_equals(('http://foo/bar', 'foo', True),
                     util.get_webmention_target('http://foo/bar', resolve=False))
def test_get_webmention_second_redirect_not_text_html(self):
  self.expect_requests_head('http://orig',
                            redirected_url=['http://middle', 'https://end'],
                            content_type='application/pdf')
  self.mox.ReplayAll()
  self.assert_equals(('https://end', 'end', False),
                     util.get_webmention_target('http://orig', resolve=True))
def test_get_webmention_target_too_big(self):
  self.expect_requests_head('http://orig', response_headers={
    'Content-Length': str(util.MAX_HTTP_RESPONSE_SIZE + 1),
  })
  self.mox.ReplayAll()
  self.assert_equals(('http://orig', 'orig', False),
                     util.get_webmention_target('http://orig'))
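# Taken together, the tests above pin down how resolving affects the send
# flag: with resolve=True, a HEAD request follows redirects, tracking params
# like utm_source are stripped from the final URL, and a non-text content
# type or a Content-Length over MAX_HTTP_RESPONSE_SIZE clears send. Below is
# a rough, self-contained sketch of that flow; the helper logic and constant
# value are assumptions, only the (url, domain, send) contract comes from
# the tests.
import urllib.parse

import requests

MAX_HTTP_RESPONSE_SIZE = 2 * 1024 * 1024  # illustrative value

def check_target(url, resolve=True):
  """Returns an (url, domain, send) tuple like util.get_webmention_target."""
  parsed = urllib.parse.urlparse(url)
  if parsed.scheme not in ('http', 'https'):  # e.g. chrome://flags
    return url, parsed.netloc, False

  send = True
  if resolve:
    resp = requests.head(url, allow_redirects=True, timeout=15)
    # strip tracking params: http://final?utm_source=x -> http://final
    final = urllib.parse.urlparse(resp.url)
    query = [(k, v) for k, v in urllib.parse.parse_qsl(final.query)
             if not k.startswith('utm_')]
    parsed = final._replace(query=urllib.parse.urlencode(query))
    url = urllib.parse.urlunparse(parsed)

    content_type = resp.headers.get('Content-Type', '')
    length = int(resp.headers.get('Content-Length') or 0)
    if not content_type.startswith('text/') or length > MAX_HTTP_RESPONSE_SIZE:
      send = False

  return url, parsed.netloc, send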
def resolve_profile_url(url, resolve=True):
  """Resolves a profile URL to be added to a source.

  Args:
    url: string
    resolve: boolean, whether to make HTTP requests to follow redirects, etc.

  Returns:
    string, resolved URL, or None
  """
  final, _, ok = util.get_webmention_target(url, resolve=resolve)
  if not ok:
    return None

  final = final.lower()
  if util.schemeless(final).startswith(util.schemeless(url.lower())):
    # redirected to a deeper path. use the original higher level URL. #652
    final = url

  # If final has a path segment check if root has a matching rel=me.
  match = re.match(r'^(https?://[^/]+)/.+', final)
  if match and resolve:
    root = match.group(1)
    try:
      resp = util.requests_get(root)
      resp.raise_for_status()
      data = util.mf2py_parse(resp.text, root)
      me_urls = data.get('rels', {}).get('me', [])
      if final in me_urls:
        final = root
    except requests.RequestException:
      logging.warning("Couldn't fetch %s, preserving path in %s",
                      root, final, exc_info=True)

  return final
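# Hypothetical walkthrough of resolve_profile_url, following the logic above.
# The example URLs and page contents are made up for illustration.
#
# 1. Blocklisted or otherwise unsendable URL: get_webmention_target returns
#    ok=False, so the function returns None.
#      resolve_profile_url('http://facebook.com/someone')  # -> None
#
# 2. Redirect to a deeper path under the same prefix (issue #652): keep the
#    original, higher-level URL.
#      'https://example.com' redirects to 'https://example.com/home'
#      resolve_profile_url('https://example.com')  # -> 'https://example.com'
#
# 3. Profile URL with a path whose site root links back via rel=me: collapse
#    to the root.
#      'https://example.com/' contains <a rel="me" href="https://example.com/user">
#      resolve_profile_url('https://example.com/user')  # -> 'https://example.com'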
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(
    util.uniquify([user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def _url_and_domain(self, auth_entity):
  """Returns this source's URL and domain.

  Uses the auth entity user_json 'url' field by default. May be overridden
  by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth

  Returns:
    (string url, string domain, boolean ok) tuple
  """
  user_json = json.loads(auth_entity.user_json)
  actor = self.as_source.user_to_actor(user_json)
  urls = util.trim_nulls([actor.get('url')] +
                         # also look at G+'s urls field
                         [u.get('value') for u in user_json.get('urls', [])])

  first_url = first_domain = None
  for url in urls:
    # TODO: fully support multiple urls
    for url in url.split():
      url, domain, ok = util.get_webmention_target(url)
      if ok:
        domain = domain.lower()
        return url, domain, True
      elif not first_url:
        first_url = url
        first_domain = domain

  return first_url, first_domain, False
def test_get_webmention_middle_redirect_blacklisted(self):
  """We should allow blacklisted domains in the middle of a redirect chain.

  ...e.g. Google's redirector https://www.google.com/url?...
  """
  self.expect_requests_head(
    "http://orig",
    redirected_url=["https://www.google.com/url?xyz", "https://end"])
  self.mox.ReplayAll()
  self.assert_equals(("https://end", "end", True),
                     util.get_webmention_target("http://orig", resolve=True))
def resolve(urls):
  resolved = set()
  for url in urls:
    final, _, send = util.get_webmention_target(url)
    if send:
      resolved.add(final)
      if include_redirect_sources:
        resolved.add(url)
  return resolved
def resolve(urls):
  resolved = set()
  for url in urls:
    final, domain, send = util.get_webmention_target(url)
    if send and domain != source.gr_source.DOMAIN:
      resolved.add(final)
      if include_redirect_sources:
        resolved.add(url)
  return resolved
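# The second resolve() variant above adds one filter over the first: targets
# whose resolved domain equals the source silo's own domain
# (source.gr_source.DOMAIN) are dropped, so we never send webmentions back
# into the silo we're polling. A hypothetical trace, assuming a source whose
# gr_source.DOMAIN is 'twitter.com' and a shortener that redirects to
# example.com:
#
#   resolve(['http://short.example/abc', 'https://twitter.com/foo/status/1'])
#
#   - 'http://short.example/abc' resolves to 'https://example.com/post':
#     kept, plus the original shortened URL when include_redirect_sources
#     is true.
#   - 'https://twitter.com/foo/status/1' resolves to domain 'twitter.com',
#     matching the source's own domain: skipped.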
def test_get_webmention_middle_redirect_blocklisted(self):
  """We should allow blocklisted domains in the middle of a redirect chain.

  ...e.g. Google's redirector https://www.google.com/url?...
  """
  self.expect_requests_head(
    'http://orig',
    redirected_url=['https://www.google.com/url?xyz', 'https://end'])
  self.mox.ReplayAll()
  self.assert_equals(('https://end', 'end', True),
                     util.get_webmention_target('http://orig', resolve=True))
def add_original_post_urls(self, post_id, obj, prop):
  """Extracts original post URLs and adds them to an object, in place.

  If the post object has upstreamDuplicates, *only* they are considered
  original post URLs and added as tags with objectType 'article', and the
  post's own links and 'article' tags are added with objectType 'mention'.

  Args:
    post_id: string post id
    obj: ActivityStreams post object
    prop: string property name in obj to add the original post URLs to
  """
  post = None
  try:
    post = self.source.get_post(post_id)
  except:
    logging.warning('Error fetching source post %s', post_id, exc_info=True)
    return
  if not post:
    logging.warning('Source post %s not found', post_id)
    return

  original_post_discovery.discover(self.source, post, fetch_hfeed=False)
  tags = [tag for tag in post['object'].get('tags', [])
          if 'url' in tag and tag['objectType'] == 'article']
  upstreams = post['object'].get('upstreamDuplicates', [])

  if not isinstance(obj.setdefault(prop, []), list):
    obj[prop] = [obj[prop]]
  if upstreams:
    obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
    obj.setdefault('tags', []).extend(
      [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
  else:
    obj[prop] += tags

  # check for redirects, and if there are any follow them and add final urls
  # in addition to the initial urls.
  seen = set()
  for url_list in obj[prop], obj.get('tags', []):
    for url_obj in url_list:
      url = util.clean_webmention_url(url_obj.get('url', ''))
      if not url or url in seen:
        continue
      seen.add(url)
      # when debugging locally, replace my (snarfed.org) URLs with localhost
      url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
      resolved, _, send = util.get_webmention_target(url)
      if send and resolved != url and resolved not in seen:
        seen.add(resolved)
        url_list.append({'url': resolved,
                         'objectType': url_obj.get('objectType')})

  logging.info('After original post discovery, urls are: %s', seen)
def post(self):
  logging.debug('Params: %s', self.request.params)
  if self.lease(ndb.Key(urlsafe=self.request.params['key'])):
    source_domains = self.entity.source.get().domains
    to_send = set()
    for url in self.entity.unsent:
      url, domain, ok = util.get_webmention_target(url)
      # skip "self" links to this blog's domain
      if ok and domain not in source_domains:
        to_send.add(url)

    self.entity.unsent = list(to_send)
    self.send_webmentions()
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(
    util.uniquify([user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(
      url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def dispatch_request(self):
  logger.debug(f'Params: {list(request.values.items())}')
  if not self.lease(ndb.Key(urlsafe=request.values['key'])):
    return ('', ERROR_HTTP_RETURN_CODE) if getattr(g, 'failed', None) else 'OK'

  to_send = set()
  for url in self.entity.unsent:
    url, domain, ok = util.get_webmention_target(url)
    # skip "self" links to this blog's domain
    if ok and domain not in g.source.domains:
      to_send.add(url)

  self.entity.unsent = list(to_send)
  self.send_webmentions()
  return ('', ERROR_HTTP_RETURN_CODE) if getattr(g, 'failed', None) else 'OK'
def do_send_webmentions(self):
  urls = self.entity.unsent + self.entity.error + self.entity.failed
  unsent = set()
  self.entity.error = []
  self.entity.failed = []

  for orig_url in urls:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(orig_url)
    if ok:
      if len(url) <= _MAX_STRING_LENGTH:
        unsent.add(url)
      else:
        logging.warning("Giving up on target URL over %s chars! %s",
                        _MAX_STRING_LENGTH, url)
        self.entity.failed.append(orig_url)
  self.entity.unsent = sorted(unsent)

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info("Webmention from %s to %s", source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, a
    # WebmentionSend error dict if it failed (semi-)permanently, or None.
    cache_key = util.webmention_endpoint_cache_key(target)
    cached = memcache.get(cache_key)
    if cached:
      logging.info("Using cached webmention endpoint %r: %s", cache_key, cached)

    # send! and handle response or error
    error = None
    if isinstance(cached, dict):
      error = cached
    else:
      mention = send.WebmentionSend(source_url, target, endpoint=cached)
      logging.info("Sending...")
      try:
        if not mention.send(timeout=999, headers=util.REQUEST_HEADERS):
          error = mention.error
      except BaseException as e:
        logging.warning("", exc_info=True)
        error = getattr(mention, "error")
        if not error:
          error = ({"code": "BAD_TARGET_URL", "http_status": 499}
                   if "DNS lookup failed for URL:" in str(e)
                   else {"code": "EXCEPTION"})

    error_code = error["code"] if error else None
    if error_code != "BAD_TARGET_URL" and not cached:
      val = error if error_code == "NO_ENDPOINT" else mention.receiver_endpoint
      memcache.set(cache_key, val, time=WEBMENTION_DISCOVERY_CACHE_TIME)

    if error is None:
      logging.info("Sent! %s", mention.response)
      self.record_source_webmention(mention)
      self.entity.sent.append(target)
    else:
      status = error.get("http_status", 0)
      if (error_code == "NO_ENDPOINT" or
          (error_code == "BAD_TARGET_URL" and status == 204)):  # No Content
        logging.info("Giving up this target. %s", error)
        self.entity.skipped.append(target)
      elif status // 100 == 4:
        # Give up on 4XX errors; we don't expect later retries to succeed.
        logging.info("Giving up this target. %s", error)
        self.entity.failed.append(target)
      else:
        self.fail("Error sending to endpoint: %s" % error)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)
def expand_target_urls(self, activity):
  """Expand the inReplyTo or object fields of an ActivityStreams object
  by fetching the original and looking for rel=syndication URLs.

  This method modifies the dict in place.

  Args:
    activity: an ActivityStreams dict of the activity being published
  """
  for field in ('inReplyTo', 'object'):
    # microformats2.json_to_object de-dupes, no need to do it here
    objs = activity.get(field)
    if not objs:
      continue

    if isinstance(objs, dict):
      objs = [objs]

    augmented = list(objs)
    for obj in objs:
      url = obj.get('url')
      if not url:
        continue

      parsed = urllib.parse.urlparse(url)
      # ignore home pages. https://github.com/snarfed/bridgy/issues/760
      if parsed.path in ('', '/'):
        continue

      # get_webmention_target weeds out silos and non-HTML targets
      # that we wouldn't want to download and parse
      url, _, ok = util.get_webmention_target(url)
      if not ok:
        continue

      logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
      try:
        mf2 = util.fetch_mf2(url)
      except AssertionError:
        raise  # for unit tests
      except BaseException:
        # it's not a big deal if we can't fetch an in-reply-to url
        logging.info('expand_target_urls could not fetch field=%s, url=%s',
                     field, url, stack_info=True)
        continue

      synd_urls = mf2['rels'].get('syndication', [])

      # look for syndication urls in the first h-entry
      queue = collections.deque(mf2.get('items', []))
      while queue:
        item = queue.popleft()
        item_types = set(item.get('type', []))
        if 'h-feed' in item_types and 'h-entry' not in item_types:
          queue.extend(item.get('children', []))
          continue

        # these can be urls or h-cites
        synd_urls += microformats2.get_string_urls(
          item.get('properties', {}).get('syndication', []))

      logging.debug('expand_target_urls found rel=syndication for url=%s: %r',
                    url, synd_urls)
      augmented += [{'url': u} for u in synd_urls]

    activity[field] = augmented
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', list(self.request.params.items()))
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urllib.parse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in util.DOMAINS or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{flickr,github,mastodon,twitter}')
  elif source_cls == Instagram:
    return self.error('Sorry, %s is not supported.' % source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  self.source = self._find_source(source_cls, url, domain)
  if not self.source:
    return  # _find_source rendered the error

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urllib.parse.urlparse(domain_url)
    for source_url in url, self.source_url():
      parts = urllib.parse.urlparse(source_url)
      if (parts.netloc == domain_url_parts.netloc and
          parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  self.entity = self.get_or_add_publish_entity(url)

  try:
    resp = self.fetch_mf2(url, raise_errors=True)
  except BaseException as e:
    status, body = util.interpret_http_exception(e)
    if status == '410':
      return self.delete(url)
    return self.error('Could not fetch source URL %s' % url)

  if not resp:
    return
  self.fetched, mf2 = resp

  # create the Publish entity so we can store the result.
  if (self.entity.status == 'complete' and self.entity.type != 'preview' and
      not self.PREVIEW and not appengine_info.LOCAL):
    return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't support updating existing posts. Details: https://github.com/snarfed/bridgy/issues/84",
                      extra_json={'original': self.entity.published})

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  shortlinks = mf2['rels'].get('shortlink')
  if shortlinks:
    self.shortlink = urllib.parse.urljoin(url, shortlinks[0])

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(mf2.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      types = types.union(item_types)
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info('Object type(s) %s not supported; error=%s; trying next.',
                   item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:
      code, body = util.interpret_http_exception(e)
      if code in self.source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
        # the user deauthorized the bridgy app, or the token expired, so
        # disable this source.
        logging.warning('Disabling source due to: %s' % e, stack_info=True)
        self.source.status = 'disabled'
        self.source.put()
        # util.email_me(subject='Bridgy Publish: disabled %s' % self.source.label(),
        #               body=body)
      if isinstance(e, (NotImplementedError, ValueError, urllib.error.URLError)):
        code = '400'
      elif not code:
        raise
      msg = 'Error: %s %s' % (body or '', e)
      return self.error(msg, status=code,
                        report=code not in ('400', '404', '502', '503', '504'))

  if not self.entity.published:  # tried all the items
    types.discard('h-entry')
    types.discard('h-note')
    if types:
      msg = ("%s doesn't support type(s) %s, or no content was found." %
             (source_cls.GR_CLASS.NAME, ' + '.join(types)))
    else:
      msg = 'Could not find content in <a href="http://microformats.org/wiki/h-entry">h-entry</a> or any other element!'
    return self.error(msg, data=mf2)

  # write results to datastore, but don't overwrite a previous publish with a
  # preview.
  if not (self.PREVIEW and self.entity.type != 'preview'):
    self.entity.status = 'complete'
    self.entity.put()

  return result
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the DB
  if successful.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logging.debug('previously found relationship(s) for original %s: %s',
                    permalink, synds)

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  if usynd:
    logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)), preexisting)
  success = True

  # fetch the full permalink page, which often has more detailed information
  if not results:
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      if type_ok:
        resp = util.requests_get(permalink)
        resp.raise_for_status()
        parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)
      success = False

    if parsed:
      syndication_urls = set()
      relsynd = parsed.get('rels').get('syndication', [])
      if relsynd:
        logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(
        source, permalink, syndication_urls, preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    result_syndposts = itertools.chain(*results.values())
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logging.info('deleting relationship that disappeared: %s', syndpost)
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.iteritems():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logging.debug('discovered relationships %s', new_results)
  return new_results
def test_get_webmention_cleans_redirected_urls(self):
  self.expect_requests_head("http://foo/bar",
                            redirected_url="http://final?utm_source=x")
  self.mox.ReplayAll()

  self.assert_equals(("http://final", "final", True),
                     util.get_webmention_target("http://foo/bar", resolve=True))
  self.assert_equals(("http://foo/bar", "foo", True),
                     util.get_webmention_target("http://foo/bar", resolve=False))
def expand_target_urls(self, activity):
  """Expand the inReplyTo or object fields of an ActivityStreams object
  by fetching the original and looking for rel=syndication URLs.

  This method modifies the dict in place.

  Args:
    activity: an ActivityStreams dict of the activity being published
  """
  for field in ('inReplyTo', 'object'):
    # microformats2.json_to_object de-dupes, no need to do it here
    objs = activity.get(field)
    if not objs:
      continue

    if isinstance(objs, dict):
      objs = [objs]

    augmented = list(objs)
    for obj in objs:
      url = obj.get('url')
      if not url:
        continue

      # get_webmention_target weeds out silos and non-HTML targets
      # that we wouldn't want to download and parse
      url, _, ok = util.get_webmention_target(url)
      if not ok:
        continue

      # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
      # easier to just grab this ourselves than add a bunch of
      # special-cases to that method
      logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
      try:
        resp = util.requests_get(url)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, url)
      except AssertionError:
        raise  # for unit tests
      except BaseException:
        # it's not a big deal if we can't fetch an in-reply-to url
        logging.warning('expand_target_urls could not fetch field=%s, url=%s',
                        field, url, exc_info=True)
        continue

      synd_urls = data.get('rels', {}).get('syndication', [])

      # look for syndication urls in the first h-entry
      queue = collections.deque(data.get('items', []))
      while queue:
        item = queue.popleft()
        item_types = set(item.get('type', []))
        if 'h-feed' in item_types and 'h-entry' not in item_types:
          queue.extend(item.get('children', []))
          continue

        # these can be urls or h-cites
        synd_urls += microformats2.get_string_urls(
          item.get('properties', {}).get('syndication', []))

      logging.debug('expand_target_urls found rel=syndication for url=%s: %r',
                    url, synd_urls)
      augmented += [{'url': u} for u in synd_urls]

    activity[field] = augmented
def test_get_webmention_target_not_http_https(self):
  self.assert_equals(('chrome://flags', 'flags', False),
                     util.get_webmention_target('chrome://flags'))
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  logger.debug(f'fetching author url {author_url}')
  try:
    author_mf2 = util.fetch_mf2(author_url)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logger.info(f'Could not fetch author url {author_url}', exc_info=True)
    return {}

  feeditems = _find_feed_items(author_mf2)

  # try rel=feeds and rel=alternates
  feed_urls = set()
  candidates = (author_mf2['rels'].get('feed', []) +
                [a.get('url') for a in author_mf2.get('alternates', [])
                 if a.get('type') == MF2_HTML_MIME_TYPE])
  for feed_url in candidates:
    # check that it's html, not too big, etc
    feed_url, _, feed_ok = util.get_webmention_target(feed_url)
    if feed_url == author_url:
      logger.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logger.debug("skipping feed since it's not HTML or otherwise bad")
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logger.debug(f"fetching author's rel-feed {feed_url}")
      feed_mf2 = util.fetch_mf2(feed_url)
      feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_mf2))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logger.info(f'rel-feed found new domain {domain}! adding to source')
          domains.append(domain)
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logger.info(f'Could not fetch h-feed url {feed_url}.', exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published') or ''

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logger.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, str):
          permalink_to_entry[permalink] = child
        else:
          logger.warning(f'unexpected non-string "url" property: {permalink}')

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logger.info(f'Hit cap of {max} permalinks. Stopping.')
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in range(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.items():
    logger.debug(f'processing permalink: {permalink}')
    new_results = process_entry(source, permalink, entry, refetch,
                                preexisting.get(permalink, []),
                                store_blanks=store_blanks)
    for key, value in new_results.items():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
def _process_author(source, author_url, refetch_blanks=False):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch_blanks: boolean, if true, refetch SyndicatedPosts that have
      previously been marked as not having a rel=syndication link

  Return:
    a dict of syndicated_url to models.SyndicatedPost
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author domain %s', author_url)
    author_resp = requests.get(author_url, timeout=HTTP_TIMEOUT)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  author_dom = BeautifulSoup(author_resp.text)
  author_parser = mf2py.Parser(url=author_url, doc=author_dom)
  author_parsed = author_parser.to_dict()

  # look for canonical feed url (if it isn't this one) using
  # rel='feed', type='text/html'
  for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                        author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if not feed_type:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
    else:
      feed_type_ok = feed_type == 'text/html'

    if feed_url == author_url:
      logging.debug('author url is the feed url, proceeding')
      break
    elif not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
      continue

    try:
      logging.debug("fetching author's h-feed %s", feed_url)
      feed_resp = requests.get(feed_url, timeout=HTTP_TIMEOUT)
      feed_resp.raise_for_status()
      logging.debug("author's h-feed fetched successfully %s", feed_url)
      author_parsed = mf2py.Parser(url=feed_url, doc=feed_resp.text).to_dict()
      break
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  feeditems = author_parsed['items']
  hfeed = next((item for item in feeditems if 'h-feed' in item['type']), None)
  if hfeed:
    feeditems = hfeed.get('children', [])
  else:
    logging.info('No h-feed found, fallback to top-level h-entrys.')

  permalinks = set()
  for child in feeditems:
    if 'h-entry' in child['type']:
      # TODO if this h-entry in the h-feed has u-syndication links, we
      # can just use it without fetching its permalink page
      # TODO maybe limit to first ~30 entries? (do that here rather than
      # below, because we want the *first* n entries)
      for permalink in child['properties'].get('url', []):
        permalinks.add(permalink)

  # query all preexisting permalinks at once, instead of once per link
  preexisting = {r.original: r for r in
                 SyndicatedPost.query_by_originals(source, permalinks)}

  results = {}
  for permalink in permalinks:
    logging.debug('processing permalink: %s', permalink)
    results.update(_process_entry(source, permalink, refetch_blanks,
                                  preexisting))

  if results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    now = now_fn()
    logging.debug('updating source.last_syndication_url %s', now)
    source.last_syndication_url = now

  return results
def _process_entry(source, permalink, refetch_blanks, preexisting):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the DB
  if successful.

  Args:
    source: a subclass of models.Source
    permalink: url of the unprocessed post
    refetch_blanks: boolean, whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: dict of original url to SyndicatedPost

  Return:
    a dict from syndicated url to new models.SyndicatedPosts
  """
  results = {}
  preexisting_relationship = preexisting.get(permalink)

  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting_relationship:
    # if we're refetching blanks and this one is blank, do not return
    if refetch_blanks and not preexisting_relationship.syndication:
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  syndication_urls = set()
  parsed = None
  try:
    logging.debug('fetching post permalink %s', permalink)
    permalink, _, type_ok = util.get_webmention_target(permalink)
    if type_ok:
      resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
      resp.raise_for_status()
      parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
  except BaseException:
    # TODO limit the number of allowed failures
    logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

  if parsed:
    relsynd = parsed.get('rels').get('syndication', [])
    logging.debug('rel-syndication links: %s', relsynd)
    syndication_urls.update(relsynd)

    # there should only be one h-entry on a permalink page, but
    # we'll check all of them just in case.
    for hentry in (item for item in parsed['items']
                   if 'h-entry' in item['type']):
      usynd = hentry.get('properties', {}).get('syndication', [])
      logging.debug('u-syndication links: %s', usynd)
      syndication_urls.update(usynd)

  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    parsed = urlparse.urlparse(syndication_url)
    if util.domain_from_link(parsed.netloc) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.get_or_insert_by_syndication_url(
        source, syndication=syndication_url, original=permalink)
      results[syndication_url] = relationship

  if not results:
    logging.debug('no syndication links from %s to current source %s. '
                  'saving empty relationship so that it will not be '
                  'searched again', permalink, source.label())
    # remember that this post doesn't have syndication links for this
    # particular source
    SyndicatedPost(parent=source.key, original=permalink,
                   syndication=None).put()

  logging.debug('discovered relationships %s', results)
  return results
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                        author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logging.info('Hit cap of %d permalinks. Stopping.', max)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(source, permalink, entry, refetch,
                                preexisting.get(permalink, []),
                                store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
def expand_target_urls(self, activity):
  """Expand the inReplyTo or object fields of an ActivityStreams object
  by fetching the original and looking for rel=syndication URLs.

  This method modifies the dict in place.

  Args:
    activity: an ActivityStreams dict of the activity being published
  """
  for field in ('inReplyTo', 'object'):
    # microformats2.json_to_object de-dupes, no need to do it here
    objs = activity.get(field)
    if not objs:
      continue

    if isinstance(objs, dict):
      objs = [objs]

    augmented = list(objs)
    for obj in objs:
      url = obj.get('url')
      if not url:
        continue

      # get_webmention_target weeds out silos and non-HTML targets
      # that we wouldn't want to download and parse
      url, _, ok = util.get_webmention_target(url)
      if not ok:
        continue

      # fetch_mf2 raises a fuss if it can't fetch a mf2 document;
      # easier to just grab this ourselves than add a bunch of
      # special-cases to that method
      logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
      try:
        resp = util.requests_get(url)
        resp.raise_for_status()
        data = mf2py.Parser(url=url, doc=resp.text).to_dict()
      except AssertionError:
        raise  # for unit tests
      except BaseException:
        # it's not a big deal if we can't fetch an in-reply-to url
        logging.warning('expand_target_urls could not fetch field=%s, url=%s',
                        field, url, exc_info=True)
        continue

      synd_urls = data.get('rels', {}).get('syndication', [])

      # look for syndication urls in the first h-entry
      queue = collections.deque(data.get('items', []))
      while queue:
        item = queue.popleft()
        item_types = set(item.get('type', []))
        if 'h-feed' in item_types and 'h-entry' not in item_types:
          queue.extend(item.get('children', []))
          continue

        # these can be urls or h-cites
        synd_urls += microformats2.get_string_urls(
          item.get('properties', {}).get('syndication', []))

      logging.debug('expand_target_urls found rel=syndication for url=%s: %r',
                    url, synd_urls)
      augmented += [{'url': u} for u in synd_urls]

    activity[field] = augmented
def test_get_webmention_text_mf2_html(self):
  self.expect_requests_head('http://orig', content_type='text/mf2+html')
  self.mox.ReplayAll()
  self.assert_equals(('http://orig', 'orig', True),
                     util.get_webmention_target('http://orig'))
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the DB
  if successful.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logging.debug('previously found relationship(s) for original %s: %s',
                    permalink, synds)

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  if usynd:
    logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(
    source, permalink,
    set(url for url in usynd if isinstance(url, basestring)), preexisting)
  success = True

  if results:
    source.updates['last_feed_syndication_url'] = util.now_fn()
  elif not source.last_feed_syndication_url:
    # fetch the full permalink page if we think it might have more details
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      if type_ok:
        resp = util.requests_get(permalink)
        resp.raise_for_status()
        parsed = util.mf2py_parse(resp.text, permalink)
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)
      success = False

    if parsed:
      syndication_urls = set()
      relsynd = parsed.get('rels').get('syndication', [])
      if relsynd:
        logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(source, permalink,
                                          syndication_urls, preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    result_syndposts = itertools.chain(*results.values())
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logging.info('deleting relationship that disappeared: %s', syndpost)
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.iteritems():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logging.debug('discovered relationships %s', new_results)
  return new_results
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,github,twitter}')
  elif source_cls == Instagram:
    return self.error('Sorry, %s is not supported.' % source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  self.source = self._find_source(source_cls, url, domain)
  if not self.source:
    return  # _find_source rendered the error

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    for source_url in url, self.source_url():
      parts = urlparse.urlparse(source_url)
      if (parts.netloc == domain_url_parts.netloc and
          parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  self.entity = self.get_or_add_publish_entity(url)

  try:
    resp = self.fetch_mf2(url, raise_errors=True)
  except BaseException as e:
    status, body = util.interpret_http_exception(e)
    if status == '410':
      return self.delete(url)
    return self.error('Could not fetch source URL %s' % url)

  if not resp:
    return
  self.fetched, data = resp

  # create the Publish entity so we can store the result.
  if (self.entity.status == 'complete' and self.entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Details: https://github.com/snarfed/bridgy/issues/84")

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      types = types.union(item_types)
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info('Object type(s) %s not supported; error=%s; trying next.',
                   item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:
      code, body = util.interpret_http_exception(e)
      if code in self.source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
        # the user deauthorized the bridgy app, or the token expired, so
        # disable this source.
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        self.source.status = 'disabled'
        self.source.put()
        # TODO: eventually drop this to just if source.is_beta_user(). leaving
        # for everyone right now for initial monitoring.
        util.email_me(subject='Bridgy Publish: disabled %s' % self.source.label(),
                      body=body)
      if isinstance(e, (NotImplementedError, ValueError, urllib2.URLError)):
        code = '400'
      elif not code:
        raise
      msg = 'Error: %s %s' % (body or '', e)
      return self.error(msg, status=code,
                        mail=code not in ('400', '404', '502', '503', '504'))
def do_send_webmentions(self):
  urls = self.entity.unsent + self.entity.error + self.entity.failed
  unsent = set()
  self.entity.error = []
  self.entity.failed = []

  for orig_url in urls:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(orig_url)
    if ok:
      if len(url) <= _MAX_STRING_LENGTH:
        unsent.add(url)
      else:
        logging.info('Giving up on target URL over %s chars! %s',
                     _MAX_STRING_LENGTH, url)
        self.entity.failed.append(orig_url)
  self.entity.unsent = sorted(unsent)

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info('Webmention from %s to %s', source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, a
    # WebmentionSend error dict if it failed (semi-)permanently, or None.
    cache_key = util.webmention_endpoint_cache_key(target)
    cached = util.webmention_endpoint_cache.get(cache_key)
    if cached:
      logging.info('Using cached webmention endpoint %r: %s', cache_key, cached)

    # send! and handle response or error
    error = None
    if isinstance(cached, dict):
      error = cached
    else:
      mention = send.WebmentionSend(source_url, target, endpoint=cached)
      headers = util.request_headers(source=self.source)
      logging.info('Sending...')
      try:
        if not mention.send(timeout=999, headers=headers):
          error = mention.error
      except BaseException as e:
        logging.info('', exc_info=True)
        error = getattr(mention, 'error', None)
        if not error:
          error = ({'code': 'BAD_TARGET_URL', 'http_status': 499}
                   if 'DNS lookup failed for URL:' in str(e)
                   else {'code': 'EXCEPTION'})

    error_code = error['code'] if error else None
    if error_code != 'BAD_TARGET_URL' and not cached:
      val = error if error_code == 'NO_ENDPOINT' else mention.receiver_endpoint
      with util.webmention_endpoint_cache_lock:
        util.webmention_endpoint_cache[cache_key] = val

    if error is None:
      logging.info('Sent! %s', mention.response)
      self.record_source_webmention(mention)
      self.entity.sent.append(target)
    else:
      status = error.get('http_status', 0)
      if (error_code == 'NO_ENDPOINT' or
          (error_code == 'BAD_TARGET_URL' and status == 204)):  # No Content
        logging.info('Giving up this target. %s', error)
        self.entity.skipped.append(target)
      elif status // 100 == 4:
        # Give up on 4XX errors; we don't expect later retries to succeed.
        logging.info('Giving up this target. %s', error)
        self.entity.failed.append(target)
      else:
        self.fail('Error sending to endpoint: %s' % error, level=logging.INFO)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)

  if self.entity.error:
    logging.info('Propagate task failed')
    self.release('error')
  else:
    self.complete()
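# The caching contract these do_send_webmentions variants share, in isolation:
# a per-domain key maps to either a discovered endpoint URL (a string) or an
# error dict recording a (semi-)permanent failure; transient errors are not
# cached. A sketch with a plain dict and threading.Lock standing in for
# Bridgy's util cache; discover and send_fn are assumed callables, not real APIs.
import threading
import urllib.parse

_endpoint_cache = {}
_endpoint_cache_lock = threading.Lock()

def send_with_cached_discovery(source_url, target, discover, send_fn):
  key = 'W ' + urllib.parse.urlsplit(target).netloc  # per-domain, like the 'W ' keys above
  cached = _endpoint_cache.get(key)
  if isinstance(cached, dict):
    return cached  # known (semi-)permanent failure: skip discovery and sending
  endpoint = cached or discover(target)  # discover returns None if no endpoint
  if not endpoint:
    error = {'code': 'NO_ENDPOINT'}
    with _endpoint_cache_lock:
      _endpoint_cache[key] = error  # permanent failures are cached too
    return error
  if not cached:
    with _endpoint_cache_lock:
      _endpoint_cache[key] = endpoint  # cache successful discovery
  return None if send_fn(endpoint, source_url, target) else {'code': 'EXCEPTION'}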
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,github,twitter}')
  elif source_cls in (GooglePlusPage, Instagram):
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  self.source = self._find_source(source_cls, url, domain)
  if not self.source:
    return  # _find_source rendered the error

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    for source_url in url, self.source_url():
      parts = urlparse.urlparse(source_url)
      if (parts.netloc == domain_url_parts.netloc and
          parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error("Sorry, you've already published that page, and Bridgy "
                      "Publish doesn't yet support updating or deleting "
                      "existing posts. Details: "
                      "https://github.com/snarfed/bridgy/issues/84")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      types = types.union(item_types)
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info(
        'Object type(s) %s not supported; error=%s; trying next.',
        item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:
      code, body = util.interpret_http_exception(e)
      if not code:
        raise
      msg = 'Error from %s API or your site: %s %s' % (
        self.source.GR_CLASS.NAME, body or '', e)
      return self.error(msg, status=code, mail=code not in ('502', '503', '504'))
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
  elif source_cls == GooglePlusPage:
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    return self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that "
      "your %(type)s profile has %(domain)s in its <em>web site</em> or "
      "<em>link</em> field, then try signing up again." %
      {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

  current_url = ''
  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided.
      # look through each source to find the one with the closest match.
      for domain_url in source.domain_urls:
        if (url.lower().startswith(domain_url.lower().strip('/')) and
            len(domain_url) > len(current_url)):
          self.source = source
          current_url = domain_url

  if not self.source:
    return self.error(
      'Publish is not enabled for your account. Please visit '
      'https://brid.gy and sign up!')

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    source_url_parts = urlparse.urlparse(self.source_url())
    if (source_url_parts.netloc == domain_url_parts.netloc and
        source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
        not source_url_parts.query):
      return self.error(
        "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error("Sorry, you've already published that page, and Bridgy "
                      "Publish doesn't yet support updating or deleting "
                      "existing posts. Ping Ryan if you want that feature!")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info(
        'Object type(s) %s not supported; error=%s; trying next.',
        item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:
      code, body = util.interpret_http_exception(e)
      return self.error('Error: %s %s' % (body or '', e), status=code or 500,
                        mail=True)
def do_send_webmentions(self):
  unsent = set()
  for url in self.entity.unsent + self.entity.error:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(url)
    if ok:
      # When debugging locally, redirect our own webmentions to localhost
      if appengine_config.DEBUG and domain in util.LOCALHOST_TEST_DOMAINS:
        url = url.replace(domain, 'localhost')
      unsent.add(url)
  self.entity.unsent = sorted(unsent)
  self.entity.error = []

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info('Webmention from %s to %s', source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, a
    # WebmentionSend error dict if it failed (semi-)permanently, or None.
    domain = util.domain_from_link(target)
    cache_key = 'W ' + domain
    cached = memcache.get(cache_key)
    if cached:
      logging.info('Using cached webmention endpoint for %s: %s', domain, cached)

    # send! and handle response or error
    error = None
    if isinstance(cached, dict):
      error = cached
    else:
      mention = send.WebmentionSend(source_url, target, endpoint=cached)
      logging.info('Sending...')
      try:
        if not mention.send(timeout=999):
          error = mention.error
      except BaseException:
        logging.warning('', exc_info=True)
        error = getattr(mention, 'error', None)
        if not error:
          error = {'code': 'EXCEPTION'}

    if error is None:
      logging.info('Sent! %s', mention.response)
      if not self.entity.sent:
        self.set_last_webmention_sent()
      self.entity.sent.append(target)
      memcache.set(cache_key, mention.receiver_endpoint,
                   time=WEBMENTION_DISCOVERY_CACHE_TIME)
    else:
      if error['code'] == 'NO_ENDPOINT':
        logging.info('Giving up this target. %s', error)
        self.entity.skipped.append(target)
        memcache.set(cache_key, error, time=WEBMENTION_DISCOVERY_CACHE_TIME)
      elif (error['code'] == 'BAD_TARGET_URL' and
            error['http_status'] // 100 == 4):
        # Give up on 4XX errors; we don't expect later retries to succeed.
        logging.info('Giving up this target. %s', error)
        self.entity.failed.append(target)
      else:
        self.fail('Error sending to endpoint: %s' % error)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)

  if self.entity.error:
    logging.warning('Propagate task failed')
    self.release('error')
  else:
    self.complete()
def do_send_webmentions(self):
  urls = self.entity.unsent + self.entity.error + self.entity.failed
  unsent = set()
  self.entity.error = []
  self.entity.failed = []

  for orig_url in urls:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(orig_url)
    if ok:
      if len(url) <= _MAX_STRING_LENGTH:
        unsent.add(url)
      else:
        logging.warning('Giving up on target URL over %s chars! %s',
                        _MAX_STRING_LENGTH, url)
        self.entity.failed.append(orig_url)
  self.entity.unsent = sorted(unsent)

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info('Webmention from %s to %s', source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, a
    # WebmentionSend error dict if it failed (semi-)permanently, or None.
    cache_key = util.webmention_endpoint_cache_key(target)
    cached = memcache.get(cache_key)
    if cached:
      logging.info('Using cached webmention endpoint %r: %s', cache_key, cached)

    # send! and handle response or error
    error = None
    if isinstance(cached, dict):
      error = cached
    else:
      mention = send.WebmentionSend(source_url, target, endpoint=cached)
      logging.info('Sending...')
      try:
        if not mention.send(timeout=999, headers=util.USER_AGENT_HEADER):
          error = mention.error
      except BaseException as e:
        logging.warning('', exc_info=True)
        error = getattr(mention, 'error', None)
        if not error:
          error = ({'code': 'BAD_TARGET_URL', 'http_status': 499}
                   if 'DNS lookup failed for URL:' in str(e)
                   else {'code': 'EXCEPTION'})

    if not cached:
      val = (error if error and error['code'] in ('NO_ENDPOINT', 'BAD_TARGET_URL')
             else mention.receiver_endpoint)
      memcache.set(cache_key, val, time=WEBMENTION_DISCOVERY_CACHE_TIME)

    if error is None:
      logging.info('Sent! %s', mention.response)
      self.record_source_webmention(mention)
      self.entity.sent.append(target)
    else:
      code = error['code']
      status = error.get('http_status', 0)
      if (code == 'NO_ENDPOINT' or
          (code == 'BAD_TARGET_URL' and status == 204)):  # 204 is No Content
        logging.info('Giving up this target. %s', error)
        self.entity.skipped.append(target)
      elif status // 100 == 4:
        # Give up on 4XX errors; we don't expect later retries to succeed.
        logging.info('Giving up this target. %s', error)
        self.entity.failed.append(target)
      else:
        self.fail('Error sending to endpoint: %s' % error)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)
def add_original_post_urls(self, post, obj, prop):
  """Extracts original post URLs and adds them to an object, in place.

  If the post object has upstreamDuplicates, *only* they are considered
  original post URLs and added as tags with objectType 'article', and the
  post's own links and 'article' tags are added with objectType 'mention'.

  Args:
    post: ActivityStreams post object to get original post URLs from
    obj: ActivityStreams post object to add original post URLs to
    prop: string property name in obj to add the original post URLs to
  """
  original_post_discovery.discover(self.source, post, fetch_hfeed=False)
  tags = [tag for tag in post['object'].get('tags', [])
          if 'url' in tag and tag['objectType'] == 'article']
  upstreams = post['object'].get('upstreamDuplicates', [])

  if not isinstance(obj.setdefault(prop, []), list):
    obj[prop] = [obj[prop]]
  if upstreams:
    obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
    obj.setdefault('tags', []).extend(
      [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
  else:
    obj[prop] += tags

  # check for redirects, and if there are any follow them and add final urls
  # in addition to the initial urls.
  seen = set()
  tags = obj.get('tags', [])
  for url_list in obj[prop], tags:
    for url_obj in url_list:
      url = util.clean_webmention_url(url_obj.get('url', ''))
      if not url or url in seen:
        continue
      seen.add(url)
      # when debugging locally, replace my (snarfed.org) URLs with localhost
      url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
      resolved, _, send = util.get_webmention_target(url)
      if send and resolved != url and resolved not in seen:
        seen.add(resolved)
        url_list.append({'url': resolved,
                         'objectType': url_obj.get('objectType')})

  # if the http version of a link is in upstreams but the https one is just a
  # mention, or vice versa, promote them both to upstream.
  # https://github.com/snarfed/bridgy/issues/290
  #
  # TODO: for links that came from resolving redirects above, this doesn't
  # also catch the initial pre-redirect link. ah well.
  prop_schemeful = set(tag['url'] for tag in obj[prop] if tag.get('url'))
  prop_schemeless = set(util.schemeless(url) for url in prop_schemeful)

  for url_obj in copy.copy(tags):
    url = url_obj.get('url', '')
    schemeless = util.schemeless(url)
    if schemeless in prop_schemeless and url not in prop_schemeful:
      obj[prop].append(url_obj)
      tags.remove(url_obj)
      prop_schemeful.add(url)

  logging.info('After original post discovery, urls are: %s', seen)
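# The http/https promotion at the end of add_original_post_urls reduces to a
# scheme-stripped comparison. The rule in isolation; schemeless here is a
# local stand-in for util.schemeless:
def schemeless(url):
  return url.split('://', 1)[-1]

def promote_scheme_variants(originals, mentions):
  """Moves any mention that differs from an original only by scheme into
  originals, so both scheme variants are treated as upstream."""
  originals_schemeless = {schemeless(u) for u in originals}
  for url in list(mentions):
    if schemeless(url) in originals_schemeless and url not in originals:
      mentions.remove(url)
      originals.add(url)
  return originals, mentions

# promote_scheme_variants({'http://a/post'}, {'https://a/post', 'http://b'})
# -> ({'http://a/post', 'https://a/post'}, {'http://b'})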
def test_get_webmention_second_redirect_not_text_html(self):
  self.expect_requests_head('http://orig',
                            redirected_url=['http://middle', 'https://end'],
                            content_type='application/pdf')
  self.mox.ReplayAll()
  self.assert_equals(('https://end', 'end', False),
                     util.get_webmention_target('http://orig', resolve=True))
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                        author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warning('unexpected non-string "url" property: %s', permalink)

    max_permalinks = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
                      else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max_permalinks:
      logging.info('Hit cap of %d permalinks. Stopping.', max_permalinks)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(source, permalink, entry, refetch,
                                preexisting.get(permalink, []),
                                store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
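# The batched datastore lookup in _process_author is a generic chunking
# pattern: fan a long key list into IN queries of at most
# MAX_ALLOWABLE_QUERIES values each and chain the results lazily. The pattern
# in isolation; run_query is an assumed stand-in for SyndicatedPost.query.
import itertools

MAX_ALLOWABLE_QUERIES = 30  # matches the 'currently 30' comment above

def query_in_chunks(keys, run_query):
  return itertools.chain.from_iterable(
    run_query(keys[i:i + MAX_ALLOWABLE_QUERIES])
    for i in range(0, len(keys), MAX_ALLOWABLE_QUERIES))

# then group results by original URL, as _process_author does:
#   preexisting = {}
#   for r in query_in_chunks(permalinks, run_query):
#     preexisting.setdefault(r.original, []).append(r)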
def process_entry(source, permalink, feed_entry, refetch, preexisting,
                  store_blanks=True):
  """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

  Args:
    source: a subclass of :class:`models.Source`
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: list of previously discovered
      :class:`models.SyndicatedPost`\ s for this permalink
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logger.debug(
        f'previously found relationship(s) for original {permalink}: {synds}')

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  usynd_urls = {url for url in usynd if isinstance(url, str)}
  if usynd_urls:
    logger.debug(f'u-syndication links on the h-feed h-entry: {usynd_urls}')
  results = _process_syndication_urls(source, permalink, usynd_urls,
                                      preexisting)
  success = True

  if results:
    source.updates['last_feed_syndication_url'] = util.now_fn()
  elif not source.last_feed_syndication_url or not feed_entry:
    # fetch the full permalink page if we think it might have more details
    mf2 = None
    try:
      if type_ok:
        logger.debug(f'fetching post permalink {permalink}')
        mf2 = util.fetch_mf2(permalink)
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logger.info(f'Could not fetch permalink {permalink}', exc_info=True)
      success = False

    if mf2:
      syndication_urls = set()
      relsynd = mf2['rels'].get('syndication', [])
      if relsynd:
        logger.debug(f'rel-syndication links: {relsynd}')
      syndication_urls.update(url for url in relsynd if isinstance(url, str))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in mf2['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logger.debug(f'u-syndication links: {usynd}')
        syndication_urls.update(url for url in usynd if isinstance(url, str))
      results = _process_syndication_urls(source, permalink, syndication_urls,
                                          preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    result_syndposts = list(itertools.chain(*results.values()))
    for syndpost in preexisting:
      if syndpost.syndication and syndpost not in result_syndposts:
        logger.info(f'deleting relationship that disappeared: {syndpost}')
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logger.debug(f'no syndication links from {permalink} to current source '
                 f'{source.label()}.')
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logger.debug(
        f'saving empty relationship so that {permalink} will not be searched again')
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.items():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logger.debug(f'discovered relationships {new_results}')
  return new_results
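# process_entry gathers candidate syndication links from two places in the
# parsed mf2: top-level rel=syndication and each h-entry's u-syndication
# property. That extraction is separable; a sketch against an mf2py-style
# dict (the dict shape follows the mf2py convention, the function name does not):
def syndication_urls(mf2):
  urls = {u for u in mf2.get('rels', {}).get('syndication', [])
          if isinstance(u, str)}
  for item in mf2.get('items', []):
    if 'h-entry' in item.get('type', []):
      urls.update(u for u in item.get('properties', {}).get('syndication', [])
                  if isinstance(u, str))
  return urls

# syndication_urls({'rels': {'syndication': ['https://twitter.com/u/status/1']},
#                   'items': []}) -> {'https://twitter.com/u/status/1'}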
def post(self):
  logging.info('Params: %s', self.request.params.items())
  self.source_url = util.get_required_param(self, 'source')
  self.target_url = util.get_required_param(self, 'target')
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url)
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url)

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error('Target must be brid.gy/publish/{facebook,twitter}')
  elif source_cls in (Instagram, GooglePlusPage):
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.AS_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(self.source_url)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].AS_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # When debugging locally, use snarfed.org for localhost webmentions
  if appengine_config.DEBUG and domain == 'localhost':
    domain = 'snarfed.org'

  # look up source by domain
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    return self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that "
      "your %(type)s profile has %(domain)s in its <em>web site</em> or "
      "<em>link</em> field, then try signing up again." %
      {'type': source_cls.AS_CLASS.NAME, 'domain': domain})

  for source in sources:
    logging.info('Source: %s , features %s, status %s' %
                 (source.bridgy_url(self), source.features, source.status))
    if source.status == 'enabled' and 'publish' in source.features:
      self.source = source
      break
  else:
    return self.error(
      'Publish is not enabled for your account(s). Please visit %s and sign up!' %
      ' or '.join(s.bridgy_url(self) for s in sources))

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    source_url_parts = urlparse.urlparse(self.source_url)
    if (source_url_parts.netloc == domain_url_parts.netloc and
        source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
        not source_url_parts.query):
      return self.error(
        "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error("Sorry, you've already published that page, and Bridgy "
                      "Publish doesn't yet support updating or deleting "
                      "existing posts. Ping Ryan if you want that feature!")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  resp = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue

    try:
      resp = self.attempt_single_item(item)
      if self.entity.published:
        break
      if resp.abort:
        return self.error(resp.error_plain, html=resp.error_html, data=item)
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.error(
        'Object type(s) %s not supported; error=%s; trying next.',
        item_types, resp.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:
      code, body = handlers.interpret_http_exception(e)
      return self.error('Error: %s %s' % (body or '', e), status=code or 500,
                        mail=True)
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict of syndicated_url to a list of new models.SyndicatedPost
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = BeautifulSoup(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                        author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if not feed_type:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
    else:
      feed_type_ok = feed_type == 'text/html'

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  permalink_to_entry = {}
  for child in feeditems:
    if 'h-entry' in child['type']:
      # TODO maybe limit to first ~30 entries? (do that here rather than
      # below because we want the *first* n entries)
      for permalink in child['properties'].get('url', []):
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warning('unexpected non-string "url" property: %s', permalink)

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = _process_entry(source, permalink, entry, refetch,
                                 preexisting.get(permalink, []),
                                 store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    now = util.now_fn()
    logging.debug('updating source last_syndication_url %s', now)
    source.updates['last_syndication_url'] = now

  return results
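# Both _process_author variants discover extra h-feeds via rel=feed links on
# <link> and <a> elements, resolving relative hrefs against the author page
# and skipping non-HTML feed types. The scanning piece in isolation, using
# bs4; the function name is illustrative:
import urllib.parse
from bs4 import BeautifulSoup

def rel_feed_urls(page_url, html):
  soup = BeautifulSoup(html, 'html.parser')
  urls = set()
  for node in soup.find_all(['link', 'a'], rel='feed'):
    href = node.get('href')
    if not href:
      continue
    feed_type = node.get('type')
    if feed_type and feed_type != 'text/html':
      continue  # Atom/RSS etc.; only HTML feeds can carry mf2 markup
    urls.add(urllib.parse.urljoin(page_url, href))
  return urls

# rel_feed_urls('http://a/', '<link rel="feed" href="/updates">')
# -> {'http://a/updates'}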
def do_send_webmentions(self):
  urls = self.entity.unsent + self.entity.error + self.entity.failed
  unsent = set()
  self.entity.error = []
  self.entity.failed = []

  for orig_url in urls:
    # recheck the url here since the checks may have failed during the poll
    # or streaming add.
    url, domain, ok = util.get_webmention_target(orig_url)
    if ok:
      if len(url) <= _MAX_STRING_LENGTH:
        unsent.add(url)
      else:
        logging.info('Giving up on target URL over %s chars! %s',
                     _MAX_STRING_LENGTH, url)
        self.entity.failed.append(orig_url)
  self.entity.unsent = sorted(unsent)

  while self.entity.unsent:
    target = self.entity.unsent.pop(0)
    source_url = self.source_url(target)
    logging.info('Webmention from %s to %s', source_url, target)

    # see if we've cached webmention discovery for this domain. the cache
    # value is a string URL endpoint if discovery succeeded, or NO_ENDPOINT
    # if no endpoint was found.
    cache_key = util.webmention_endpoint_cache_key(target)
    endpoint = util.webmention_endpoint_cache.get(cache_key)
    if endpoint:
      logging.info('Using cached webmention endpoint %r: %s', cache_key, endpoint)

    # send! and handle response or error
    try:
      resp = None
      headers = util.request_headers(source=self.source)
      if not endpoint:
        endpoint, resp = webmention.discover(target, headers=headers)
        with util.webmention_endpoint_cache_lock:
          util.webmention_endpoint_cache[cache_key] = endpoint or NO_ENDPOINT

      if endpoint and endpoint != NO_ENDPOINT:
        logging.info('Sending...')
        resp = webmention.send(endpoint, source_url, target, timeout=999,
                               headers=headers)
        logging.info('Sent! %s', resp)
        self.record_source_webmention(endpoint, target)
        self.entity.sent.append(target)
      else:
        logging.info('Giving up this target.')
        self.entity.skipped.append(target)
    except ValueError:
      logging.info('Bad URL; giving up this target.')
      self.entity.skipped.append(target)
    except BaseException as e:
      logging.info('', exc_info=True)
      # Give up on 4XX and DNS errors; we don't expect retries to succeed.
      code, _ = util.interpret_http_exception(e)
      if (code and code.startswith('4')) or 'DNS lookup failed' in str(e):
        logging.info('Giving up this target.')
        self.entity.failed.append(target)
      else:
        self.fail(f'Error sending to endpoint: {resp}', level=logging.INFO)
        self.entity.error.append(target)

    if target in self.entity.unsent:
      self.entity.unsent.remove(target)

  if self.entity.error:
    logging.info('Propagate task failed')
    self.release('error')
  else:
    self.complete()
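# The retry policy in the newest variant, condensed: bad URLs are skipped,
# 4XX and DNS failures are permanent, everything else stays in the error
# queue for the task to retry. A sketch; interpret_status stands in for
# util.interpret_http_exception and returns a status code string or None.
def classify_send_failure(exc, interpret_status):
  if isinstance(exc, ValueError):
    return 'skip'  # bad target URL; never worth retrying
  code = interpret_status(exc)
  if (code and code.startswith('4')) or 'DNS lookup failed' in str(exc):
    return 'fail'  # 4XX and DNS errors won't heal on retry
  return 'retry'   # transient; leave in the error queue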
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
  elif source_cls == GooglePlusPage:
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    return self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that "
      "your %(type)s profile has %(domain)s in its <em>web site</em> or "
      "<em>link</em> field, then try signing up again." %
      {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      self.source = source
      break
  else:
    return self.error(
      'Publish is not enabled for your account(s). Please visit %s and sign up!' %
      ' or '.join(s.bridgy_url(self) for s in sources))

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    source_url_parts = urlparse.urlparse(self.source_url())
    if (source_url_parts.netloc == domain_url_parts.netloc and
        source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
        not source_url_parts.query):
      return self.error(
        "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error("Sorry, you've already published that page, and Bridgy "
                      "Publish doesn't yet support updating or deleting "
                      "existing posts. Ping Ryan if you want that feature!")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = BeautifulSoup(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info(
        'Object type(s) %s not supported; error=%s; trying next.',
        item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:
      code, body = util.interpret_http_exception(e)
      return self.error('Error: %s %s' % (body or '', e), status=code or 500,
                        mail=True)
def _process_entry(source, permalink, feed_entry, refetch_blanks, preexisting):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the DB
  if successful.

  Args:
    source: a subclass of models.Source
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch_blanks: boolean whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
  results = {}

  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching blanks and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if refetch_blanks and all(not p.syndication for p in preexisting):
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)))

  # fetch the full permalink page, which often has more detailed information
  if not results:
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      permalink, _, type_ok = util.get_webmention_target(permalink)
      if type_ok:
        resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
        resp.raise_for_status()
        parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
    except BaseException:
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

    if parsed:
      syndication_urls = set()
      relsynd = parsed.get('rels', {}).get('syndication', [])
      logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(source, permalink, syndication_urls)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    if not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  logging.debug('discovered relationships %s', results)
  return results
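# The blank SyndicatedPost rows that _process_entry stores act as negative
# cache entries: an original with no syndication links gets one blank row so
# later polls skip it, unless refetch of blanks is requested. The bookkeeping
# in miniature, with a plain dict standing in for the datastore:
def lookup_or_discover(store, original, discover, refetch_blanks=False):
  """store maps original URL -> list of syndication URLs; [] means 'known blank'."""
  cached = store.get(original)
  if cached is not None and not (refetch_blanks and not cached):
    return []  # already processed; nothing *newly* discovered
  found = discover(original)  # returns a list of syndication URLs
  store[original] = found  # storing [] records the blank, suppressing refetches
  return found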