Example #1
 def submit_form(html):
   """Submits the first form on the page."""
   form = util.beautifulsoup_parse(html).form
   data = {input['name']: input['value'] for input in form.find_all('input')
           if input.get('name') and input.get('value')}
   return facebook.application.get_response(
     form['action'], method=form['method'].upper(), body=urllib.urlencode(data))
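
A minimal usage sketch for the helper above, relying only on what the code itself shows: that facebook.application exposes a webapp2-style get_response(path, method=..., body=...) test helper returning a webob-style response. The form markup and the /facebook/confirm path are made up for illustration.

html = """
<form action="/facebook/confirm" method="post">
  <input name="state" value="abc123">
  <input type="submit" value="go">   <!-- no name attribute, so it's filtered out -->
</form>
"""
resp = submit_form(html)                # POSTs state=abc123 to /facebook/confirm
assert resp.status_int in (200, 302)    # webob-style status check (assumption)
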
Example #2
 def submit_form(html):
     """Submits the first form on the page."""
     form = util.beautifulsoup_parse(html).form
     data = {
         input["name"]: input["value"]
         for input in form.find_all("input")
         if input.get("name") and input.get("value")
     }
     return facebook.application.get_response(
         form["action"], method=form["method"].upper(), body=urllib.urlencode(data)
     )
Example #3
    def _run(self):
        """Returns CreationResult on success, None otherwise."""
        logging.info('Params: %s', self.request.params.items())
        assert self.PREVIEW in (True, False)

        # parse and validate target URL
        try:
            parsed = urlparse.urlparse(self.target_url())
        except BaseException:
            return self.error('Could not parse target URL %s' %
                              self.target_url())

        domain = parsed.netloc
        path_parts = parsed.path.rsplit('/', 1)
        source_cls = SOURCE_NAMES.get(path_parts[-1])
        if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080')
                or len(path_parts) != 2 or path_parts[0] != '/publish'
                or not source_cls):
            return self.error(
                'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}'
            )
        elif source_cls == GooglePlusPage:
            return self.error('Sorry, %s is not yet supported.' %
                              source_cls.GR_CLASS.NAME)

        # resolve source URL
        url, domain, ok = util.get_webmention_target(
            self.source_url(), replace_test_domains=False)
        # show nice error message if they're trying to publish a silo post
        if domain in SOURCE_DOMAINS:
            return self.error(
                "Looks like that's a %s URL. Try one from your web site instead!"
                % SOURCE_DOMAINS[domain].GR_CLASS.NAME)
        elif not ok:
            return self.error('Unsupported source URL %s' % url)
        elif not domain:
            return self.error('Could not parse source URL %s' % url)

        # look up source by domain
        domain = domain.lower()
        sources = source_cls.query().filter(
            source_cls.domains == domain).fetch(100)
        if not sources:
            return self.error(
                "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again."
                % {
                    'type': source_cls.GR_CLASS.NAME,
                    'domain': domain
                })

        current_url = ''
        for source in sources:
            logging.info('Source: %s , features %s, status %s, poll status %s',
                         source.bridgy_url(self), source.features,
                         source.status, source.poll_status)
            if source.status != 'disabled' and 'publish' in source.features:
                # use a source that has a domain_url matching the url provided.
                # look through each source to find the one with the closest match.
                for domain_url in source.domain_urls:
                    if (url.lower().startswith(domain_url.lower().strip('/'))
                            and len(domain_url) > len(current_url)):
                        self.source = source
                        current_url = domain_url

        if not self.source:
            return self.error(
                'Publish is not enabled for your account. Please visit https://brid.gy and sign up!'
            )

        content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
        if content_param in self.request.params:
            return self.error('The %s parameter is not supported' %
                              content_param)

        # show nice error message if they're trying to publish their home page
        for domain_url in self.source.domain_urls:
            domain_url_parts = urlparse.urlparse(domain_url)
            source_url_parts = urlparse.urlparse(self.source_url())
            if (source_url_parts.netloc == domain_url_parts.netloc
                    and source_url_parts.path.strip('/')
                    == domain_url_parts.path.strip('/')
                    and not source_url_parts.query):
                return self.error(
                    "Looks like that's your home page. Try one of your posts instead!"
                )

        # done with the sanity checks, ready to fetch the source url. create the
        # Publish entity so we can store the result.
        entity = self.get_or_add_publish_entity(url)
        if (entity.status == 'complete' and entity.type != 'preview'
                and not self.PREVIEW and not appengine_config.DEBUG):
            return self.error(
                "Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!"
            )
        self.entity = entity

        # fetch source page
        resp = self.fetch_mf2(url)
        if not resp:
            return
        self.fetched, data = resp

        # find rel-shortlink, if any
        # http://microformats.org/wiki/rel-shortlink
        # https://github.com/snarfed/bridgy/issues/173
        soup = util.beautifulsoup_parse(self.fetched.text)
        shortlinks = (soup.find_all('link', rel='shortlink') +
                      soup.find_all('a', rel='shortlink') +
                      soup.find_all('a', class_='shortlink'))
        if shortlinks:
            self.shortlink = shortlinks[0]['href']

        # loop through each item and its children and try to preview/create it. if
        # it fails, try the next one. break after the first one that works.
        resp = None
        types = set()
        queue = collections.deque(data.get('items', []))
        while queue:
            item = queue.popleft()
            item_types = set(item.get('type'))
            if 'h-feed' in item_types and 'h-entry' not in item_types:
                queue.extend(item.get('children', []))
                continue
            elif not item_types & PUBLISHABLE_TYPES:
                continue

            try:
                result = self.attempt_single_item(item)
                if self.entity.published:
                    break
                if result.abort:
                    if result.error_plain:
                        self.error(result.error_plain,
                                   html=result.error_html,
                                   data=item)
                    return
                # try the next item
                for embedded in ('rsvp', 'invitee', 'repost', 'repost-of',
                                 'like', 'like-of', 'in-reply-to'):
                    if embedded in item.get('properties', []):
                        item_types.add(embedded)
                logging.info(
                    'Object type(s) %s not supported; error=%s; trying next.',
                    item_types, result.error_plain)
                types = types.union(item_types)
                queue.extend(item.get('children', []))
            except BaseException as e:
                code, body = util.interpret_http_exception(e)
                return self.error('Error: %s %s' % (body or '', e),
                                  status=code or 500,
                                  mail=True)
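
The while-queue loop at the end of _run above is the heart of the handler: walk the parsed mf2 items breadth-first, descend into an h-feed's children rather than publishing the feed itself, and stop at the first item whose type is publishable. A standalone sketch of just that traversal, with a made-up PUBLISHABLE_TYPES constant and input (the real handler additionally retries sibling items and reports errors):

import collections

PUBLISHABLE_TYPES = {'h-entry', 'h-event'}  # stand-in for the real constant

def first_publishable(mf2_items):
    queue = collections.deque(mf2_items)
    while queue:
        item = queue.popleft()
        types = set(item.get('type', []))
        if 'h-feed' in types and 'h-entry' not in types:
            # don't publish the feed itself; look at its children instead
            queue.extend(item.get('children', []))
        elif types & PUBLISHABLE_TYPES:
            return item
    return None

items = [{'type': ['h-feed'],
          'children': [{'type': ['h-entry'],
                        'properties': {'name': ['a post']}}]}]
print(first_publishable(items))  # the inner h-entry
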
Example #4
    def fetch_mf2(self, url, require_mf2=True, raise_errors=False):
        """Fetches a URL and extracts its mf2 data.

    Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
    on errors.

    Args:
      url: string
      require_mf2: boolean, whether to return error if no mf2 are found
      raise_errors: boolean, whether to let error exceptions propagate up or
        handle them

    Returns:
      (:class:`requests.Response`, mf2 data dict) on success, None on failure
    """
        try:
            fetched = util.requests_get(url)
            fetched.raise_for_status()
        except BaseException as e:
            if raise_errors:
                raise
            util.interpret_http_exception(e)  # log exception
            return self.error('Could not fetch source URL %s' % url)

        if self.entity:
            self.entity.html = fetched.text

        # .text is decoded unicode string, .content is raw bytes. if the HTTP
        # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
        # can look for a <meta> tag with a charset and decode.
        text = (fetched.text if 'charset' in fetched.headers.get(
            'content-type', '') else fetched.content)
        doc = util.beautifulsoup_parse(text)

        # parse microformats
        data = util.mf2py_parse(doc, fetched.url)

        # special case tumblr's markup: div#content > div.post > div.copy
        # convert to mf2 and re-parse
        if not data.get('items'):
            contents = doc.find_all(id='content')
            if contents:
                post = contents[0].find_next(class_='post')
                if post:
                    post['class'] = 'h-entry'
                    copy = post.find_next(class_='copy')
                    if copy:
                        copy['class'] = 'e-content'
                    photo = post.find_next(class_='photo-wrapper')
                    if photo:
                        img = photo.find_next('img')
                        if img:
                            img['class'] = 'u-photo'
                    doc = unicode(post)
                    data = util.mf2py_parse(doc, fetched.url)

        logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
        items = data.get('items', [])
        if require_mf2 and (not items or not items[0]):
            return self.error('No microformats2 data found in ' + fetched.url,
                              data=data,
                              html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

        return fetched, data
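
The charset handling above is worth calling out: requests' .text is already-decoded unicode, while .content is raw bytes, and when the Content-Type header declares no charset it is better to hand BeautifulSoup the bytes so it can sniff a <meta> charset itself. A standalone sketch of that decision using requests and BeautifulSoup directly, in place of the util wrappers; the URL is a placeholder.

import requests
from bs4 import BeautifulSoup

resp = requests.get('https://example.com/')
if 'charset' in resp.headers.get('content-type', ''):
    markup = resp.text     # header declared an encoding; already decoded
else:
    markup = resp.content  # raw bytes; let BeautifulSoup look for <meta charset>
soup = BeautifulSoup(markup, 'html.parser')
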
Example #5
  def fetch_mf2(self, url):
    """Fetches a URL and extracts its mf2 data.

    Side effects: sets :attr:`entity`\ .html on success, calls :attr:`error()`
    on errors.

    Args:
      url: string

    Returns:
      (:class:`requests.Response`, mf2 data dict) on success, None on failure
    """
    try:
      fetched = util.requests_get(url)
      fetched.raise_for_status()
    except BaseException as e:
      util.interpret_http_exception(e)  # log exception
      return self.error('Could not fetch source URL %s' % url)

    if self.entity:
      self.entity.html = fetched.text

    # .text is decoded unicode string, .content is raw bytes. if the HTTP
    # headers didn't specify a charset, pass raw bytes to BeautifulSoup so it
    # can look for a <meta> tag with a charset and decode.
    text = (fetched.text if 'charset' in fetched.headers.get('content-type', '')
            else fetched.content)
    doc = util.beautifulsoup_parse(text)

    # parse microformats, convert to ActivityStreams
    data = util.mf2py_parse(doc, fetched.url)

    # special case tumblr's markup: div#content > div.post > div.copy
    # convert to mf2 and re-parse
    if not data.get('items'):
      contents = doc.find_all(id='content')
      if contents:
        post = contents[0].find_next(class_='post')
        if post:
          post['class'] = 'h-entry'
          copy = post.find_next(class_='copy')
          if copy:
            copy['class'] = 'e-content'
          photo = post.find_next(class_='photo-wrapper')
          if photo:
            img = photo.find_next('img')
            if img:
              img['class'] = 'u-photo'
          doc = unicode(post)
          data = util.mf2py_parse(doc, fetched.url)

    logging.debug('Parsed microformats2: %s', json.dumps(data, indent=2))
    items = data.get('items', [])
    if not items or not items[0]:
      return self.error('No microformats2 data found in ' + fetched.url,
                        data=data, html="""
No <a href="http://microformats.org/get-started">microformats</a> or
<a href="http://microformats.org/wiki/microformats2">microformats2</a> found in
<a href="%s">%s</a>! See <a href="http://indiewebify.me/">indiewebify.me</a>
for details (skip to level 2, <em>Publishing on the IndieWeb</em>).
""" % (fetched.url, util.pretty_link(fetched.url)))

    return fetched, data
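
The Tumblr special case in both fetch_mf2 versions follows a simple trick: if no mf2 items were found, retag Tumblr's div#content > div.post > div.copy markup with h-entry / e-content classes and re-parse. A standalone sketch of that retag-and-reparse step, calling mf2py directly instead of util.mf2py_parse; the markup and URL are made up.

from bs4 import BeautifulSoup
import mf2py

html = ('<div id="content"><div class="post">'
        '<div class="copy">hello world</div></div></div>')
doc = BeautifulSoup(html, 'html.parser')

post = doc.find(id='content').find_next(class_='post')
post['class'] = 'h-entry'                             # mark the post as an h-entry
post.find_next(class_='copy')['class'] = 'e-content'  # and its body as e-content
data = mf2py.parse(doc=str(post), url='https://example.tumblr.com/post/123')
print(data['items'][0]['properties']['content'][0]['value'])  # 'hello world'
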
Example #6
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logging.info('Hit cap of %d permalinks. Stopping.', max)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
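
The rel=feed discovery in the middle of _process_author is self-contained enough to sketch on its own: collect link/a elements with rel="feed", resolve each href against the author page, and keep only candidates that are (or default to) text/html. A hedged sketch using BeautifulSoup and urljoin directly; the markup and base URL are made up, and the real code additionally runs each candidate through util.get_webmention_target.

from bs4 import BeautifulSoup
try:
    from urlparse import urljoin        # Python 2
except ImportError:
    from urllib.parse import urljoin    # Python 3

author_url = 'https://example.com/'
html = ('<link rel="feed" type="text/html" href="/articles">'
        '<a rel="feed" type="application/atom+xml" '
        'href="https://feeds.example.com/all.atom">atom</a>')
soup = BeautifulSoup(html, 'html.parser')

feed_urls = set()
for node in soup.find_all('link', rel='feed') + soup.find_all('a', rel='feed'):
    href = node.get('href')
    feed_type = node.get('type')
    if not href or (feed_type and feed_type != 'text/html'):
        continue  # skip missing hrefs and non-HTML feeds, like the Atom link
    feed_urls.add(urljoin(author_url, href))

print(feed_urls)  # only the text/html candidate survives
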
Example #7
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    try:
        logging.debug('fetching author url %s', author_url)
        author_resp = util.requests_get(author_url)
        # TODO for error codes that indicate a temporary error, should we make
        # a certain number of retries before giving up forever?
        author_resp.raise_for_status()
        author_dom = util.beautifulsoup_parse(author_resp.text)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logging.info('Could not fetch author url %s',
                     author_url,
                     exc_info=True)
        return {}

    feeditems = _find_feed_items(author_url, author_dom)

    # look for all other feed urls using rel='feed', type='text/html'
    feed_urls = set()
    for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                          author_dom.find_all('a', rel='feed')):
        feed_url = rel_feed_node.get('href')
        if not feed_url:
            continue

        feed_url = urlparse.urljoin(author_url, feed_url)
        feed_type = rel_feed_node.get('type')
        if feed_type and feed_type != 'text/html':
            feed_ok = False
        else:
            # double check that it's text/html, not too big, etc
            feed_url, _, feed_ok = util.get_webmention_target(feed_url)

        if feed_url == author_url:
            logging.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logging.debug('skipping feed of type %s', feed_type)
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logging.debug("fetching author's rel-feed %s", feed_url)
            feed_resp = util.requests_get(feed_url)
            feed_resp.raise_for_status()
            logging.debug("author's rel-feed fetched successfully %s",
                          feed_url)
            feeditems = _merge_hfeeds(
                feeditems, _find_feed_items(feed_url, feed_resp.text))

            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logging.info(
                        'rel-feed found new domain %s! adding to source',
                        domain)
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logging.info('Could not fetch h-feed url %s.',
                         feed_url,
                         exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published')

    feeditems.sort(key=updated_or_published, reverse=True)

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logging.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, basestring):
                    permalink_to_entry[permalink] = child
                else:
                    logging.warn('unexpected non-string "url" property: %s',
                                 permalink)

        max = (MAX_PERMALINK_FETCHES_BETA
               if source.is_beta_user() else MAX_PERMALINK_FETCHES)
        if len(permalink_to_entry) >= max:
            logging.info('Hit cap of %d permalinks. Stopping.', max)
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.iteritems():
        logging.debug('processing permalink: %s', permalink)
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.iteritems():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
Example #8
  def _run(self):
    """Returns CreationResult on success, None otherwise."""
    logging.info('Params: %s', self.request.params.items())
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
        len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
    elif source_cls == GooglePlusPage:
      return self.error('Sorry, %s is not yet supported.' %
                        source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    domain = domain.lower()
    sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
    if not sources:
      return self.error("Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
        {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

    current_url = ''
    for source in sources:
      logging.info('Source: %s , features %s, status %s, poll status %s',
                   source.bridgy_url(self), source.features, source.status,
                   source.poll_status)
      if source.status != 'disabled' and 'publish' in source.features:
        # use a source that has a domain_url matching the url provided.
        # look through each source to find the one with the closest match.
        schemeless_url = util.schemeless(url.lower()).strip('/')
        for domain_url in source.domain_urls:
          schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
          if (schemeless_url.startswith(schemeless_domain_url) and
              len(domain_url) > len(current_url)):
            self.source = source
            current_url = domain_url

    if not self.source:
      return self.error(
        'Publish is not enabled for your account. Please visit https://brid.gy and sign up!')

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      source_url_parts = urlparse.urlparse(self.source_url())
      if (source_url_parts.netloc == domain_url_parts.netloc and
          source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not source_url_parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    entity = self.get_or_add_publish_entity(url)
    if (entity.status == 'complete' and entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!")
    self.entity = entity

    # fetch source page
    resp = self.fetch_mf2(url)
    if not resp:
      return
    self.fetched, data = resp

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    soup = util.beautifulsoup_parse(self.fetched.text)
    shortlinks = (soup.find_all('link', rel='shortlink') +
                  soup.find_all('a', rel='shortlink') +
                  soup.find_all('a', class_='shortlink'))
    if shortlinks:
      self.shortlink = shortlinks[0]['href']

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    result = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      item_types = set(item.get('type'))
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException as e:
        code, body = util.interpret_http_exception(e)
        mail = True
        if (not code or code == 500) and util.is_connection_failure(e):
          code = 502
          mail = False
        msg = '%s API error: %s %s' % (self.source.GR_CLASS.NAME, body or '', e)
        return self.error(msg, status=code or 500, mail=mail)
Example #9
    def _run(self):
        """Returns CreationResult on success, None otherwise."""
        logging.info('Params: %s', self.request.params.items())
        assert self.PREVIEW in (True, False)

        # parse and validate target URL
        try:
            parsed = urlparse.urlparse(self.target_url())
        except BaseException:
            return self.error('Could not parse target URL %s' %
                              self.target_url())

        domain = parsed.netloc
        path_parts = parsed.path.rsplit('/', 1)
        source_cls = SOURCE_NAMES.get(path_parts[-1])
        if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080')
                or len(path_parts) != 2 or path_parts[0] != '/publish'
                or not source_cls):
            return self.error(
                'Target must be brid.gy/publish/{facebook,flickr,github,twitter}'
            )
        elif source_cls == Instagram:
            return self.error('Sorry, %s is not supported.' %
                              source_cls.GR_CLASS.NAME)

        # resolve source URL
        url, domain, ok = util.get_webmention_target(
            self.source_url(), replace_test_domains=False)
        # show nice error message if they're trying to publish a silo post
        if domain in SOURCE_DOMAINS:
            return self.error(
                "Looks like that's a %s URL. Try one from your web site instead!"
                % SOURCE_DOMAINS[domain].GR_CLASS.NAME)
        elif not ok:
            return self.error('Unsupported source URL %s' % url)
        elif not domain:
            return self.error('Could not parse source URL %s' % url)

        # look up source by domain
        self.source = self._find_source(source_cls, url, domain)
        if not self.source:
            return  # _find_source rendered the error

        content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
        if content_param in self.request.params:
            return self.error('The %s parameter is not supported' %
                              content_param)

        # show nice error message if they're trying to publish their home page
        for domain_url in self.source.domain_urls:
            domain_url_parts = urlparse.urlparse(domain_url)
            for source_url in url, self.source_url():
                parts = urlparse.urlparse(source_url)
                if (parts.netloc == domain_url_parts.netloc
                        and parts.path.strip('/')
                        == domain_url_parts.path.strip('/')
                        and not parts.query):
                    return self.error(
                        "Looks like that's your home page. Try one of your posts instead!"
                    )

        # done with the sanity checks, ready to fetch the source url. create the
        # Publish entity so we can store the result.
        self.entity = self.get_or_add_publish_entity(url)
        try:
            resp = self.fetch_mf2(url, raise_errors=True)
        except BaseException as e:
            status, body = util.interpret_http_exception(e)
            if status == '410':
                return self.delete(url)
            return self.error('Could not fetch source URL %s' % url)

        if not resp:
            return
        self.fetched, data = resp

        # create the Publish entity so we can store the result.
        if (self.entity.status == 'complete' and self.entity.type != 'preview'
                and not self.PREVIEW and not appengine_config.DEBUG):
            return self.error(
                "Sorry, you've already published that page, and Bridgy Publish doesn't support updating existing posts. Details: https://github.com/snarfed/bridgy/issues/84",
                extra_json={'original': self.entity.published})

        # find rel-shortlink, if any
        # http://microformats.org/wiki/rel-shortlink
        # https://github.com/snarfed/bridgy/issues/173
        soup = util.beautifulsoup_parse(self.fetched.text)
        shortlinks = (soup.find_all('link', rel='shortlink') +
                      soup.find_all('a', rel='shortlink') +
                      soup.find_all('a', class_='shortlink'))
        if shortlinks:
            self.shortlink = urlparse.urljoin(url, shortlinks[0]['href'])

        # loop through each item and its children and try to preview/create it. if
        # it fails, try the next one. break after the first one that works.
        result = None
        types = set()
        queue = collections.deque(data.get('items', []))
        while queue:
            item = queue.popleft()
            item_types = set(item.get('type'))
            if 'h-feed' in item_types and 'h-entry' not in item_types:
                queue.extend(item.get('children', []))
                continue
            elif not item_types & PUBLISHABLE_TYPES:
                types = types.union(item_types)
                continue

            try:
                result = self.attempt_single_item(item)
                if self.entity.published:
                    break
                if result.abort:
                    if result.error_plain:
                        self.error(result.error_plain,
                                   html=result.error_html,
                                   data=item)
                    return
                # try the next item
                for embedded in ('rsvp', 'invitee', 'repost', 'repost-of',
                                 'like', 'like-of', 'in-reply-to'):
                    if embedded in item.get('properties', []):
                        item_types.add(embedded)
                logging.info(
                    'Object type(s) %s not supported; error=%s; trying next.',
                    item_types, result.error_plain)
                types = types.union(item_types)
                queue.extend(item.get('children', []))
            except BaseException as e:
                code, body = util.interpret_http_exception(e)
                if code in self.source.DISABLE_HTTP_CODES or isinstance(
                        e, models.DisableSource):
                    # the user deauthorized the bridgy app, or the token expired, so
                    # disable this source.
                    logging.warning('Disabling source due to: %s' % e,
                                    exc_info=True)
                    self.source.status = 'disabled'
                    self.source.put()
                    # TODO: eventually drop this to just if source.is_beta_user(). leaving
                    # for everyone right now for initial monitoring.
                    util.email_me(subject='Bridgy Publish: disabled %s' %
                                  self.source.label(),
                                  body=body)
                if isinstance(
                        e,
                    (NotImplementedError, ValueError, urllib2.URLError)):
                    code = '400'
                elif not code:
                    raise
                msg = 'Error: %s %s' % (body or '', e)
                return self.error(msg,
                                  status=code,
                                  mail=code
                                  not in ('400', '404', '502', '503', '504'))
Example #10
  def _run(self):
    """Returns CreationResult on success, None otherwise."""
    logging.info('Params: %s', self.request.params.items())
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
        len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
    elif source_cls == GooglePlusPage:
      return self.error('Sorry, %s is not yet supported.' %
                        source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    self.source = self._find_source(source_cls, url, domain)
    if not self.source:
      return  # _find_source rendered the error

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      for source_url in url, self.source_url():
        parts = urlparse.urlparse(source_url)
        if (parts.netloc == domain_url_parts.netloc and
            parts.path.strip('/') == domain_url_parts.path.strip('/') and
            not parts.query):
          return self.error(
            "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    entity = self.get_or_add_publish_entity(url)
    if (entity.status == 'complete' and entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Details: https://github.com/snarfed/bridgy/issues/84")
    self.entity = entity

    # fetch source page
    resp = self.fetch_mf2(url)
    if not resp:
      return
    self.fetched, data = resp

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    soup = util.beautifulsoup_parse(self.fetched.text)
    shortlinks = (soup.find_all('link', rel='shortlink') +
                  soup.find_all('a', rel='shortlink') +
                  soup.find_all('a', class_='shortlink'))
    if shortlinks:
      self.shortlink = shortlinks[0]['href']

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    result = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      item_types = set(item.get('type'))
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        types = types.union(item_types)
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException as e:
        code, body = util.interpret_http_exception(e)
        if not code:
          raise
        msg = 'Error from %s API or your site: %s %s' % (
          self.source.GR_CLASS.NAME, body or '', e)
        return self.error(msg, status=code, mail=code not in ('502', '503', '504'))