def submit_form(html):
  """Submits the first form on the page."""
  form = util.beautifulsoup_parse(html).form
  data = {input['name']: input['value'] for input in form.find_all('input')
          if input.get('name') and input.get('value')}
  return facebook.application.get_response(
    form['action'], method=form['method'].upper(),
    body=urllib.urlencode(data))
def submit_form(html):
    """Submits the first form on the page."""
    form = util.beautifulsoup_parse(html).form
    data = {
        input["name"]: input["value"]
        for input in form.find_all("input")
        if input.get("name") and input.get("value")
    }
    return facebook.application.get_response(
        form["action"], method=form["method"].upper(), body=urllib.urlencode(data)
    )
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080')
      or len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}'
    )
  elif source_cls == GooglePlusPage:
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  domain = domain.lower()
  sources = source_cls.query().filter(
    source_cls.domains == domain).fetch(100)
  if not sources:
    return self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
      {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

  current_url = ''
  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided.
      # look through each source to find the one with the closest match.
      for domain_url in source.domain_urls:
        if (url.lower().startswith(domain_url.lower().strip('/'))
            and len(domain_url) > len(current_url)):
          self.source = source
          current_url = domain_url

  if not self.source:
    return self.error(
      'Publish is not enabled for your account. Please visit https://brid.gy and sign up!'
    )

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    source_url_parts = urlparse.urlparse(self.source_url())
    if (source_url_parts.netloc == domain_url_parts.netloc
        and source_url_parts.path.strip('/') == domain_url_parts.path.strip('/')
        and not source_url_parts.query):
      return self.error(
        "Looks like that's your home page. Try one of your posts instead!"
      )

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview'
      and not self.PREVIEW and not appengine_config.DEBUG):
    return self.error(
      "Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!"
    )
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  resp = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info(
        'Object type(s) %s not supported; error=%s; trying next.',
        item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException, e:
      code, body = util.interpret_http_exception(e)
      return self.error('Error: %s %s' % (body or '', e),
                        status=code or 500, mail=True)
def fetch_mf2(self, url, require_mf2=True, raise_errors=False):
  """Fetches a URL and extracts its mf2 data.

  Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
  on errors.

  Args:
    url: string
    require_mf2: boolean, whether to return error if no mf2 are found
    raise_errors: boolean, whether to let error exceptions propagate up or
      handle them

  Returns:
    (:class:`requests.Response`, mf2 data dict) on success, None on failure
  """
  try:
    fetched = util.requests_get(url)
    fetched.raise_for_status()
  except BaseException as e:
    if raise_errors:
      raise
    util.interpret_http_exception(e)  # log exception
    return self.error('Could not fetch source URL %s' % url)

  if self.entity:
    self.entity.html = fetched.text

  # .text is decoded unicode string, .content is raw bytes. if the HTTP
  # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
  # can look for a <meta> tag with a charset and decode.
  text = (fetched.text if 'charset' in fetched.headers.get('content-type', '')
          else fetched.content)
  doc = util.beautifulsoup_parse(text)

  # parse microformats
  data = util.mf2py_parse(doc, fetched.url)

  # special case tumblr's markup: div#content > div.post > div.copy
  # convert to mf2 and re-parse
  if not data.get('items'):
    contents = doc.find_all(id='content')
    if contents:
      post = contents[0].find_next(class_='post')
      if post:
        post['class'] = 'h-entry'
        copy = post.find_next(class_='copy')
        if copy:
          copy['class'] = 'e-content'
        photo = post.find_next(class_='photo-wrapper')
        if photo:
          img = photo.find_next('img')
          if img:
            img['class'] = 'u-photo'
        doc = unicode(post)
        data = util.mf2py_parse(doc, fetched.url)

  logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
  items = data.get('items', [])
  if require_mf2 and (not items or not items[0]):
    return self.error('No microformats2 data found in ' + fetched.url,
                      data=data, html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

  return fetched, data
def fetch_mf2(self, url):
  """Fetches a URL and extracts its mf2 data.

  Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
  on errors.

  Args:
    url: string

  Returns:
    (:class:`requests.Response`, mf2 data dict) on success, None on failure
  """
  try:
    fetched = util.requests_get(url)
    fetched.raise_for_status()
  except BaseException as e:
    util.interpret_http_exception(e)  # log exception
    return self.error('Could not fetch source URL %s' % url)

  if self.entity:
    self.entity.html = fetched.text

  # .text is decoded unicode string, .content is raw bytes. if the HTTP
  # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
  # can look for a <meta> tag with a charset and decode.
  text = (fetched.text if 'charset' in fetched.headers.get('content-type', '')
          else fetched.content)
  doc = util.beautifulsoup_parse(text)

  # parse microformats, convert to ActivityStreams
  data = util.mf2py_parse(doc, fetched.url)

  # special case tumblr's markup: div#content > div.post > div.copy
  # convert to mf2 and re-parse
  if not data.get('items'):
    contents = doc.find_all(id='content')
    if contents:
      post = contents[0].find_next(class_='post')
      if post:
        post['class'] = 'h-entry'
        copy = post.find_next(class_='copy')
        if copy:
          copy['class'] = 'e-content'
        photo = post.find_next(class_='photo-wrapper')
        if photo:
          img = photo.find_next('img')
          if img:
            img['class'] = 'u-photo'
        doc = unicode(post)
        data = util.mf2py_parse(doc, fetched.url)

  logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
  items = data.get('items', [])
  if not items or not items[0]:
    return self.error('No microformats2 data found in ' + fetched.url,
                      data=data, html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

  return fetched, data
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                        author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logging.info('Hit cap of %d permalinks. Stopping.', max)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                        author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(
        feeditems, _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info(
            'rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logging.info('Hit cap of %d permalinks. Stopping.', max)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(SyndicatedPost.original.IN(
      permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]), ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(source, permalink, entry, refetch,
                                preexisting.get(permalink, []),
                                store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
  elif source_cls == GooglePlusPage:
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    return self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
      {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

  current_url = ''
  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided.
      # look through each source to find the one with the closest match.
      schemeless_url = util.schemeless(url.lower()).strip('/')
      for domain_url in source.domain_urls:
        schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
        if (schemeless_url.startswith(schemeless_domain_url) and
            len(domain_url) > len(current_url)):
          self.source = source
          current_url = domain_url

  if not self.source:
    return self.error(
      'Publish is not enabled for your account. Please visit https://brid.gy and sign up!')

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    source_url_parts = urlparse.urlparse(self.source_url())
    if (source_url_parts.netloc == domain_url_parts.netloc and
        source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
        not source_url_parts.query):
      return self.error(
        "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info(
        'Object type(s) %s not supported; error=%s; trying next.',
        item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException, e:
      code, body = util.interpret_http_exception(e)
      mail = True
      if (not code or code == 500) and util.is_connection_failure(e):
        code = 502
        mail = False
      msg = '%s API error: %s %s' % (self.source.GR_CLASS.NAME, body or '', e)
      return self.error(msg, status=code or 500, mail=mail)
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080')
      or len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,github,twitter}'
    )
  elif source_cls == Instagram:
    return self.error('Sorry, %s is not supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  self.source = self._find_source(source_cls, url, domain)
  if not self.source:
    return  # _find_source rendered the error

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    for source_url in url, self.source_url():
      parts = urlparse.urlparse(source_url)
      if (parts.netloc == domain_url_parts.netloc
          and parts.path.strip('/') == domain_url_parts.path.strip('/')
          and not parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!"
        )

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  self.entity = self.get_or_add_publish_entity(url)

  try:
    resp = self.fetch_mf2(url, raise_errors=True)
  except BaseException as e:
    status, body = util.interpret_http_exception(e)
    if status == '410':
      return self.delete(url)
    return self.error('Could not fetch source URL %s' % url)

  if not resp:
    return
  self.fetched, data = resp

  # create the Publish entity so we can store the result.
  if (self.entity.status == 'complete' and self.entity.type != 'preview'
      and not self.PREVIEW and not appengine_config.DEBUG):
    return self.error(
      "Sorry, you've already published that page, and Bridgy Publish doesn't support updating existing posts. Details: https://github.com/snarfed/bridgy/issues/84",
      extra_json={'original': self.entity.published})

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = urllib.parse.urljoin(url, shortlinks[0]['href'])

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      types = types.union(item_types)
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info(
        'Object type(s) %s not supported; error=%s; trying next.',
        item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException, e:
      code, body = util.interpret_http_exception(e)
      if code in self.source.DISABLE_HTTP_CODES or isinstance(
          e, models.DisableSource):
        # the user deauthorized the bridgy app, or the token expired, so
        # disable this source.
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        self.source.status = 'disabled'
        self.source.put()
        # TODO: eventually drop this to just if source.is_beta_user(). leaving
        # for everyone right now for initial monitoring.
        util.email_me(subject='Bridgy Publish: disabled %s' % self.source.label(),
                      body=body)
      if isinstance(e, (NotImplementedError, ValueError, urllib2.URLError)):
        code = '400'
      elif not code:
        raise
      msg = 'Error: %s %s' % (body or '', e)
      return self.error(msg, status=code,
                        mail=code not in ('400', '404', '502', '503', '504'))
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
  elif source_cls == GooglePlusPage:
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  self.source = self._find_source(source_cls, url, domain)
  if not self.source:
    return  # _find_source rendered the error

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    for source_url in url, self.source_url():
      parts = urlparse.urlparse(source_url)
      if (parts.netloc == domain_url_parts.netloc and
          parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Details: https://github.com/snarfed/bridgy/issues/84")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      types = types.union(item_types)
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info(
        'Object type(s) %s not supported; error=%s; trying next.',
        item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException, e:
      code, body = util.interpret_http_exception(e)
      if not code:
        raise
      msg = 'Error from %s API or your site: %s %s' % (
        self.source.GR_CLASS.NAME, body or '', e)
      return self.error(msg, status=code,
                        mail=code not in ('502', '503', '504'))