def add_original_post_urls(self, post_id, obj, prop):
  """Extracts original post URLs and adds them to an object, in place.

  If the post object has upstreamDuplicates, *only* they are considered
  original post URLs and added as tags with objectType 'article', and the
  post's own links and 'article' tags are added with objectType 'mention'.

  Args:
    post_id: string post id
    obj: ActivityStreams post object
    prop: string property name in obj to add the original post URLs to
  """
  post = None
  try:
    post = self.source.get_post(post_id)
  # FIX: was a bare `except:`, which also swallows SystemExit and
  # KeyboardInterrupt. A fetch failure is non-fatal here, so log and bail.
  except Exception:
    logging.warning('Error fetching source post %s', post_id, exc_info=True)
    return
  if not post:
    logging.warning('Source post %s not found', post_id)
    return

  original_post_discovery.discover(self.source, post, fetch_hfeed=False)
  # only 'article' tags with a URL are candidate original post links
  tags = [tag for tag in post['object'].get('tags', [])
          if 'url' in tag and tag['objectType'] == 'article']
  upstreams = post['object'].get('upstreamDuplicates', [])

  # normalize obj[prop] to a list so we can append to it below
  if not isinstance(obj.setdefault(prop, []), list):
    obj[prop] = [obj[prop]]
  if upstreams:
    # upstreamDuplicates take precedence: they become the original post URLs
    # and the post's own article tags are demoted to mentions.
    obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
    obj.setdefault('tags', []).extend(
      [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
  else:
    obj[prop] += tags

  # check for redirects, and if there are any follow them and add final urls
  # in addition to the initial urls.
  seen = set()
  for url_list in obj[prop], obj.get('tags', []):
    for url_obj in url_list:
      url = util.clean_webmention_url(url_obj.get('url', ''))
      if not url or url in seen:
        continue
      seen.add(url)
      # when debugging locally, replace my (snarfed.org) URLs with localhost
      url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
      resolved, _, send = util.get_webmention_target(url)
      if send and resolved != url and resolved not in seen:
        seen.add(resolved)
        # appending while iterating is deliberate: the loop also visits
        # the appended entry, but it's already in `seen`.
        url_list.append({'url': resolved,
                         'objectType': url_obj.get('objectType')})

  logging.info('After original post discovery, urls are: %s', seen)
def post(self, source_short_name):
  """Handles an incoming webmention to a blog source.

  Looks up the registered source by the target URL's domain, fetches and
  parses the source page, finds the mention of the target, and publishes
  it as a comment via the source's API.

  Args:
    source_short_name: string, short name key into models.sources for the
      blog platform (e.g. 'wordpress')
  """
  # FIX: format string was 'Params: %self' — only the '%s' is substituted,
  # so the log line ended with a stray literal 'elf'.
  logging.info('Params: %s', self.request.params.items())

  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_webmention_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?'
      % (source_cls.GR_CLASS.NAME, domain))

  # only webmentions to individual posts are supported, not the home page
  if urlparse.urlparse(self.target_url).path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.')

  # create BlogWebmention entity
  # (renamed from `id` to avoid shadowing the builtin)
  entity_id = u'%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    entity_id, source=self.source.key,
    redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug('BlogWebmention entity: %s', self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data)
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  # modernized from `except Exception, e` (the `as` form works on py2.6+)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if code or body:
      return self.error('Error: %s %s; %s' % (code, e, body),
                        status=code, mail=True)
    else:
      # not an HTTP-style error we can report back; let it propagate
      raise
def add_original_post_urls(self, post, obj, prop):
  """Extracts original post URLs and adds them to an object, in place.

  If the post object has upstreamDuplicates, *only* they are considered
  original post URLs and added as tags with objectType 'article', and the
  post's own links and 'article' tags are added with objectType 'mention'.

  Args:
    post: ActivityStreams post object to get original post URLs from
    obj: ActivityStreams post object to add original post URLs to
    prop: string property name in obj to add the original post URLs to
  """
  original_post_discovery.discover(self.source, post, fetch_hfeed=False)
  # only 'article' tags with a URL are candidate original post links
  tags = [tag for tag in post['object'].get('tags', [])
          if 'url' in tag and tag['objectType'] == 'article']
  upstreams = post['object'].get('upstreamDuplicates', [])

  # normalize obj[prop] to a list so we can append to it below
  if not isinstance(obj.setdefault(prop, []), list):
    obj[prop] = [obj[prop]]
  if upstreams:
    # upstreamDuplicates take precedence: they become the original post URLs
    # and the post's own article tags are demoted to mentions.
    obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
    obj.setdefault('tags', []).extend(
      [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
  else:
    obj[prop] += tags

  # check for redirects, and if there are any follow them and add final urls
  # in addition to the initial urls.
  seen = set()
  tags = obj.get('tags', [])
  for url_list in obj[prop], tags:
    for url_obj in url_list:
      url = util.clean_webmention_url(url_obj.get('url', ''))
      if not url or url in seen:
        continue
      seen.add(url)
      # when debugging locally, replace my (snarfed.org) URLs with localhost
      url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
      resolved, _, send = util.get_webmention_target(url)
      if send and resolved != url and resolved not in seen:
        seen.add(resolved)
        # appending while iterating is deliberate: the loop also visits
        # the appended entry, but it's already in `seen`.
        url_list.append({'url': resolved,
                         'objectType': url_obj.get('objectType')})

  # if the http version of a link is in upstreams but the https one is just a
  # mention, or vice versa, promote them both to upstream.
  # https://github.com/snarfed/bridgy/issues/290
  #
  # TODO: for links that came from resolving redirects above, this doesn't
  # also catch the initial pre-redirect link. ah well.
  prop_schemeful = set(tag['url'] for tag in obj[prop] if tag.get('url'))
  prop_schemeless = set(util.schemeless(url) for url in prop_schemeful)

  # iterate over a copy because we remove matching entries from tags
  for url_obj in copy.copy(tags):
    url = url_obj.get('url', '')
    schemeless = util.schemeless(url)
    if schemeless in prop_schemeless and url not in prop_schemeful:
      obj[prop].append(url_obj)
      tags.remove(url_obj)
      prop_schemeful.add(url)

  logging.info('After original post discovery, urls are: %s', seen)