def test_follow_redirects(self):
  # a single HEAD expectation: the second follow_redirects() call must be
  # served from memcache without refetching
  expected = 'http://final/url'
  self.expect_requests_head('http://will/redirect', redirected_url=expected)
  self.mox.ReplayAll()

  self.assert_equals(expected,
                     util.follow_redirects('http://will/redirect').url)
  # the result should now be in memcache, so we shouldn't fetch the URL again
  self.assert_equals(expected,
                     util.follow_redirects('http://will/redirect').url)
def canonicalize_syndication_url(self, url, **kwargs):
  """Follow redirects to find and use profile nicknames instead of ids.

  ...e.g. +RyanBarrett in https://plus.google.com/+RyanBarrett/posts/JPpA8mApAv2.
  """
  # resolve the redirect chain first, then hand the final URL to the
  # superclass for the generic canonicalization steps
  resolved = util.follow_redirects(url).url
  return super(GooglePlusPage, self).canonicalize_syndication_url(resolved)
def _process_syndication_urls(source, permalink, syndication_urls):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new SyndicatedPost in the db.

  Args:
    source: a models.Source subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfitered list of
      syndication_urls
  """
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  results = {}
  for url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    url = util.follow_redirects(url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    url = source.canonicalize_syndication_url(url)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    if util.domain_from_link(url) != source.AS_CLASS.DOMAIN:
      continue
    logging.debug('saving discovered relationship %s -> %s', url, permalink)
    relationship = SyndicatedPost.insert(
        source, syndication=url, original=permalink)
    results.setdefault(url, []).append(relationship)

  return results
def test_follow_redirects_with_refresh_header(self):
  # the first HEAD returns a refresh header, which follow_redirects should
  # chase with a second HEAD request
  self.expect_requests_head(
      'http://will/redirect',
      response_headers={'refresh': '0; url=http://refresh'})
  self.expect_requests_head('http://refresh', redirected_url='http://final')
  self.mox.ReplayAll()

  resolved = util.follow_redirects('http://will/redirect')
  self.assert_equals('http://final', resolved.url)
def test_follow_redirects_with_refresh_header(self):
  # custom request headers must be forwarded to both the original HEAD and
  # the follow-up triggered by the refresh header
  headers = {'x': 'y'}
  self.expect_requests_head(
      'http://will/redirect', headers=headers,
      response_headers={'refresh': '0; url=http://refresh'})
  self.expect_requests_head('http://refresh', headers=headers,
                            redirected_url='http://final')
  self.mox.ReplayAll()

  cache = util.CacheDict()
  resolved = util.follow_redirects('http://will/redirect', cache=cache,
                                   headers=headers)
  self.assert_equals('http://final', resolved.url)
def discover(source, activity, fetch_hfeed=True):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  SyndicatedPosts but will not do posse-post-discovery to find new ones.

  Args:
    source: models.Source subclass. (Immutable! At least mostly. Changes to
      property values will *not* automatically be stored back in the
      datastore. last_syndication_url is special-cased in tasks.Poll.)
    activity: activity dict
    fetch_hfeed: boolean

  Return:
    the activity, updated with original post urls if any are found
  """
  as_source.Source.original_post_discovery(activity)

  # TODO possible optimization: if we've discovered a backlink to a
  # post on the author's domain (i.e., it included a link or
  # citation), then skip the rest of this.

  # Use source.domain_urls for now; it seems more reliable than the
  # activity.actor.url (which depends on getting the right data back from
  # various APIs). Consider using the actor's url, with domain_urls as the
  # fallback in the future to support content from non-Bridgy users.
  #
  # author_url = activity.get('actor', {}).get('url')
  obj = activity.get('object') or activity
  author_url = source.get_author_url()
  syndication_url = obj.get('url')

  # bail early if we don't have enough information to do the reverse lookup
  if not author_url:
    logging.debug('no author url, cannot find h-feed %s', author_url)
    return activity
  if not syndication_url:
    logging.debug('no syndication url, cannot process h-entries %s',
                  syndication_url)
    return activity

  # use the canonical syndication url on both sides, so that we have
  # the best chance of finding a match. Some silos allow several
  # different permalink formats to point to the same place (e.g.,
  # facebook user id instead of user name)
  resolved = util.follow_redirects(syndication_url).url
  syndication_url = source.canonicalize_syndication_url(resolved)

  return _posse_post_discovery(source, activity, author_url, syndication_url,
                               fetch_hfeed)
def test_follow_redirects(self):
  # exactly two HEAD requests are expected: the first cache-miss call and
  # the deliberately uncached call in the middle. (the loop in the original
  # is unrolled here; the mox expectations are identical.)
  self.expect_requests_head('http://will/redirect',
                            redirected_url='http://final/url')
  self.expect_requests_head('http://will/redirect',
                            redirected_url='http://final/url')
  self.mox.ReplayAll()

  cache = util.CacheDict()
  self.assert_equals(
      'http://final/url',
      util.follow_redirects('http://will/redirect', cache=cache).url)
  self.assertEquals('http://final/url', cache['R http://will/redirect'].url)

  # another call without cache should refetch
  self.assert_equals(
      'http://final/url',
      util.follow_redirects('http://will/redirect').url)

  # another call with cache shouldn't refetch
  self.assert_equals(
      'http://final/url',
      util.follow_redirects('http://will/redirect', cache=cache).url)
def discover(source, activity, fetch_hfeed=True):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  SyndicatedPosts but will not do posse-post-discovery to find new ones.

  Args:
    source: models.Source subclass. (Immutable! At least mostly. Changes to
      property values will *not* automatically be stored back in the
      datastore. last_syndication_url is special-cased in tasks.Poll.)
    activity: activity dict
    fetch_hfeed: boolean

  Return:
    the activity, updated with original post urls if any are found
  """
  gr_source.Source.original_post_discovery(activity)

  # TODO possible optimization: if we've discovered a backlink to a
  # post on the author's domain (i.e., it included a link or
  # citation), then skip the rest of this.
  obj = activity.get('object') or activity
  syndication_url = obj.get('url')

  # bail early unless we have both author url(s) and a syndication url
  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return activity
  if not syndication_url:
    logging.debug('no syndication url, cannot process h-entries %s',
                  syndication_url)
    return activity

  # use the canonical syndication url on both sides, so that we have
  # the best chance of finding a match. Some silos allow several
  # different permalink formats to point to the same place (e.g.,
  # facebook user id instead of user name)
  resolved = util.follow_redirects(syndication_url).url
  syndication_url = source.canonicalize_syndication_url(resolved)

  return _posse_post_discovery(source, activity, syndication_url, fetch_hfeed)
def discover(source, activity, fetch_hfeed=True, include_redirect_sources=True):
  """Augments the standard original_post_discovery algorithm with a
  reverse lookup that supports posts without a backlink or citation.

  If fetch_hfeed is False, then we will check the db for previously found
  SyndicatedPosts but will not do posse-post-discovery to find new ones.

  Args:
    source: models.Source subclass. Changes to property values (e.g. domains,
      domain_urls, last_syndication_url) are stored in source.updates; they
      should be updated transactionally later.
    activity: activity dict
    fetch_hfeed: boolean
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs

  Returns:
    (set(string original post URLs), set(string mention URLs)) tuple
  """
  if not source.updates:
    source.updates = {}

  originals, mentions = gr_source.Source.original_post_discovery(
      activity, domains=source.domains, cache=memcache,
      include_redirect_sources=include_redirect_sources,
      headers=util.USER_AGENT_HEADER)

  obj = activity.get('object', {})
  # prefer the inner object's author, falling back to the activity's
  author_id = obj.get('author', {}).get('id') or activity.get('author', {}).get('id')
  if author_id and author_id != source.user_tag_id():
    logging.info(
        "Demoting original post links because user %s doesn't match author %s",
        source.user_tag_id(), author_id)
    # this is someone else's post, so all links must be mentions
    mentions.update(originals)
    originals = set()

  # look for original URL of attachments (e.g. quote tweets)
  for att in obj.get('attachments', []):
    if (att.get('objectType') in ('note', 'article')
        and att.get('author', {}).get('id') == source.user_tag_id()):
      logging.debug('running original post discovery on attachment: %s',
                    att.get('id'))
      # recurse on the attachment; its originals count as mentions here
      att_origs, _ = discover(
          source, att, include_redirect_sources=include_redirect_sources)
      logging.debug('original post discovery found originals for attachment, %s',
                    att_origs)
      mentions.update(att_origs)

  def resolve(urls):
    # map each URL to its webmention target; when include_redirect_sources is
    # set, keep the pre-redirect URL alongside the final one
    resolved = set()
    for url in urls:
      final, _, send = util.get_webmention_target(url)
      if send:
        resolved.add(final)
        if include_redirect_sources:
          resolved.add(url)
    return resolved

  originals = resolve(originals)
  mentions = resolve(mentions)

  if not source.get_author_urls():
    logging.debug('no author url(s), cannot find h-feed')
    return originals, mentions

  # TODO possible optimization: if we've discovered a backlink to a post on the
  # author's domain (i.e., it included a link or citation), then skip the rest
  # of this.
  syndication_url = obj.get('url') or activity.get('url')
  if syndication_url:
    # use the canonical syndication url on both sides, so that we have
    # the best chance of finding a match. Some silos allow several
    # different permalink formats to point to the same place (e.g.,
    # facebook user id instead of user name)
    syndication_url = source.canonicalize_syndication_url(
        util.follow_redirects(syndication_url).url)
    originals.update(_posse_post_discovery(
        source, activity, syndication_url, fetch_hfeed))
    originals = set(util.dedupe_urls(originals))
  else:
    logging.debug('no syndication url, cannot process h-entries')

  return originals, mentions
def post(self, source_short_name):
  """Handles an incoming webmention targeting a post on a registered blog.

  Looks up the source account by the target URL's domain, records a
  BlogWebmention entity, extracts the comment text and author h-card from the
  source page's microformats2, and publishes it via source.create_comment().

  Args:
    source_short_name: string, key into models.sources, e.g. 'wordpress'
  """
  # fixed: format string was 'Params: %self', a typo that rendered a stray
  # 'elf' after the values; '%s' is the intended specifier
  logging.info('Params: %s', self.request.params.items())

  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  if urlparse.urlparse(self.target_url).path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.')

  # create BlogWebmention entity
  id = u'%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug('BlogWebmention entity: %s', self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data)
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:  # was deprecated `except Exception, e`; same semantics
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      # auth failure: disable the source so we stop retrying
      logging.warning('Disabling source!')
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=False)
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise
def post(self, source_short_name):
  """Handles an incoming webmention targeting a post on a registered blog.

  Looks up the source account by the target URL's domain (falling back to
  rel-canonical links for multi-domain blogs), records a BlogWebmention
  entity, extracts the comment text and author h-card from the source page's
  microformats2, and publishes it via source.create_comment().

  Args:
    source_short_name: string, key into models.sources, e.g. 'wordpress'
  """
  logging.info('Params: %s', list(self.request.params.items()))

  # strip fragments from source and target url
  self.source_url = urllib.parse.urldefrag(
    util.get_required_param(self, 'source'))[0]
  self.target_url = urllib.parse.urldefrag(
    util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query().filter(
    source_cls.domains == domain).filter(
      source_cls.features == 'webmention').filter(
        source_cls.status == 'enabled').get())

  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1]['rels'].get('canonical', []))
    if domains:
      # retry the lookup against any of the canonical domains
      self.source = (source_cls.query().filter(
        source_cls.domains.IN(domains)).filter(
          source_cls.features == 'webmention').filter(
            source_cls.status == 'enabled').get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urllib.parse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    # 202 so the sender doesn't keep retrying
    return self.error(
      'Home page webmentions are not currently supported.', status=202)
  for pattern in self.source.PATH_BLOCKLIST:
    if pattern.match(target_path):
      return self.error(
        '%s webmentions are not supported for URL path: %s' %
        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe().decode())

  # fetch source page
  fetched = self.fetch_mf2(self.source_url)
  if not fetched:
    return
  resp, mf2 = fetched

  item = self.find_mention_item(mf2.get('items', []))
  if not item:
    return self.error(
      'Could not find target URL %s in source page %s' %
      (self.target_url, resp.url),
      data=mf2, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, str):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][
    0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      # auth failure: disable the source so we stop retrying
      logging.warning('Disabling source due to: %s' % e, stack_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, report=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, report=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      # transient failure; surface a retryable status
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, report=False)
    elif code or body:
      return self.error(msg, status=code, report=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json_dumps(self.entity.published))
def _process_entry(source, permalink, refetch_blanks, preexisting):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source: a models.Source subclass
    permalink: url of the unprocessed post
    refetch_blanks: boolean whether we should ignore blank preexisting
      SyndicatedPosts
    preexisting: dict of original url to SyndicatedPost

  Return:
    a dict from syndicated url to new models.SyndicatedPosts
  """
  results = {}
  preexisting_relationship = preexisting.get(permalink)

  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting_relationship:
    # if we're refetching blanks and this one is blank, do not return
    if refetch_blanks and not preexisting_relationship.syndication:
      logging.debug('ignoring blank relationship for original %s', permalink)
    else:
      return results

  syndication_urls = set()
  parsed = None
  try:
    logging.debug('fetching post permalink %s', permalink)
    permalink, _, type_ok = util.get_webmention_target(permalink)
    if type_ok:
      resp = requests.get(permalink, timeout=HTTP_TIMEOUT)
      resp.raise_for_status()
      parsed = mf2py.Parser(url=permalink, doc=resp.text).to_dict()
  except BaseException:
    # TODO limit the number of allowed failures
    logging.warning('Could not fetch permalink %s', permalink, exc_info=True)

  if parsed:
    relsynd = parsed.get('rels').get('syndication', [])
    logging.debug('rel-syndication links: %s', relsynd)
    syndication_urls.update(relsynd)

    # there should only be one h-entry on a permalink page, but
    # we'll check all of them just in case.
    for hentry in (item for item in parsed['items']
                   if 'h-entry' in item['type']):
      usynd = hentry.get('properties', {}).get('syndication', [])
      logging.debug('u-syndication links: %s', usynd)
      syndication_urls.update(usynd)

  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    # fixed: use a distinct name here; the original reused `parsed` and
    # clobbered the mf2py dict parsed above
    parsed_url = urlparse.urlparse(syndication_url)
    if util.domain_from_link(parsed_url.netloc) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.get_or_insert_by_syndication_url(
        source, syndication=syndication_url, original=permalink)
      results[syndication_url] = relationship

  if not results:
    logging.debug('no syndication links from %s to current source %s. '
                  'saving empty relationship so that it will not be '
                  'searched again', permalink, source.label())
    # remember that this post doesn't have syndication links for this
    # particular source
    SyndicatedPost(parent=source.key, original=permalink,
                   syndication=None).put()

  logging.debug('discovered relationships %s', results)
  return results
def dispatch_request(self, site):
  """Handles an incoming webmention targeting a post on a registered blog.

  Looks up the source account by the target URL's domain (falling back to
  rel-canonical links for multi-domain blogs), records a BlogWebmention
  entity, extracts the comment text and author h-card from the source page's
  microformats2, and publishes it via source.create_comment().

  NOTE(review): self.error() appears to abort the request (no `return` after
  its calls here) — confirm it raises.

  Args:
    site: string, key into models.sources, e.g. 'wordpress'
  """
  logger.info(f'Params: {list(request.values.items())}')

  # strip fragments from source and target url
  self.source_url = urllib.parse.urldefrag(request.form['source'])[0]
  self.target_url = urllib.parse.urldefrag(request.form['target'])[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    self.error(f'Could not parse target URL {self.target_url}')

  # look up source by domain
  source_cls = models.sources[site]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1]['rels'].get('canonical', []))
    if domains:
      # retry the lookup against any of the canonical domains
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    self.error(
      f'Could not find {source_cls.GR_CLASS.NAME} account for {domain}. Is it registered with Bridgy?')

  # check that the target URL path is supported
  target_path = urllib.parse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    # 202 so the sender doesn't keep retrying
    msg = 'Home page webmentions are not currently supported.'
    logger.info(msg)
    return {'error': msg}, 202
  for pattern in self.source.PATH_BLOCKLIST:
    if pattern.match(target_path):
      msg = f'{self.source.GR_CLASS.NAME} webmentions are not supported for URL path: {target_path}'
      logger.info(msg)
      return {'error': msg}, 202

  # create BlogWebmention entity
  id = f'{self.source_url} {self.target_url}'
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    return self.entity.published
  logger.debug(f'BlogWebmention entity: {self.entity.key.urlsafe().decode()}')

  # fetch source page
  fetched = self.fetch_mf2(self.source_url)
  if not fetched:
    return
  resp, mf2 = fetched

  item = self.find_mention_item(mf2.get('items', []))
  if not item:
    self.error(f'Could not find target URL {self.target_url} in source page {resp.url}',
               data=mf2, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = f'http://{domain}/'

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = get_first(props, 'author')
  if author:
    if isinstance(author, str):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = get_first(author_props, 'name')
      author_url = get_first(author_props, 'url')

  # if present, u-url overrides source url
  u_url = get_first(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += f' <br /> <a href="{source_url}">via {util.domain_from_link(source_url)}</a>'

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = f'Error: {code}: {e}; {body}'
    if code == '401':
      # auth failure: disable the source so we stop retrying
      logger.warning(f'Disabling source due to: {e}', exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      self.error(msg, status=code, report=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      self.error(msg, status=code, report=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      # transient failure; surface a retryable status
      self.error(msg, status=502, report=False)
    elif code or body:
      self.error(msg, status=code, report=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  return self.entity.published
def post(self, source_short_name):
  """Handles an incoming webmention targeting a post on a registered blog.

  Looks up the source account by the target URL's domain (falling back to
  rel-canonical links for multi-domain blogs), records a BlogWebmention
  entity, extracts the comment text and author h-card from the source page's
  microformats2, and publishes it via source.create_comment().

  Args:
    source_short_name: string, key into models.sources, e.g. 'wordpress'
  """
  # fixed: format string was 'Params: %self', a typo that rendered a stray
  # 'elf' after the values; '%s' is the intended specifier
  logging.info('Params: %s', self.request.params.items())

  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1].get('rels', {}).get('canonical', []))
    if domains:
      # retry the lookup against any of the canonical domains
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urlparse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    # 202 so the sender doesn't keep retrying
    return self.error('Home page webmentions are not currently supported.',
                      status=202)
  for pattern in self.source.PATH_BLACKLIST:
    if pattern.match(target_path):
      return self.error('%s webmentions are not supported for URL path: %s' %
                        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data.get('items', []))
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      # auth failure: disable the source so we stop retrying
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      # transient failure; surface a retryable status
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json.dumps(self.entity.published))
def test_follow_redirects_defaults_scheme_to_http(self):
  # a schemeless input URL should be fetched as http://
  self.expect_requests_head('http://foo/bar', redirected_url='http://final')
  self.mox.ReplayAll()

  resolved = util.follow_redirects('foo/bar')
  self.assert_equals('http://final', resolved.url)
def post(self, source_short_name):
  """Handles an incoming webmention targeting a post on a registered blog.

  Looks up the source account by the target URL's domain, records a
  BlogWebmention entity, extracts the comment text and author h-card from the
  source page's microformats2, and publishes it via source.create_comment().

  Args:
    source_short_name: string, key into models.sources, e.g. 'wordpress'
  """
  # fixed: format string was 'Params: %self', a typo that rendered a stray
  # 'elf' after the values; '%s' is the intended specifier
  logging.info('Params: %s', self.request.params.items())

  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(
    util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(
    util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query().filter(
    source_cls.domains == domain).filter(
      source_cls.features == 'webmention').filter(
        source_cls.status == 'enabled').get())
  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  if urlparse.urlparse(self.target_url).path in ('', '/'):
    return self.error(
      'Home page webmentions are not currently supported.')

  # create BlogWebmention entity
  id = u'%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data)
  if not item:
    return self.error(
      'Could not find target URL %s in source page %s' %
      (self.target_url, self.fetched.url),
      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][
    0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      # auth failure: disable the source so we stop retrying
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      # transient failure; surface a retryable status
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json.dumps(self.entity.published))