Example #1
def requests_get(url, **kwargs):
  """Wraps :func:`requests.get` with extra semantics and our user agent.

  If a server tells us a response will be too big (based on Content-Length), we
  hijack the response and return 599 and an error response body instead. We pass
  stream=True to :func:`requests.get` so that it doesn't fetch the response body
  until we access :attr:`requests.Response.content` (or
  :attr:`requests.Response.text`).

  http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
  """
  if url in URL_BLACKLIST:
    resp = requests.Response()
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = 'Sorry, Bridgy has blacklisted this URL.'
    return resp

  kwargs.setdefault('headers', {}).update(request_headers(url=url))
  resp = util.requests_get(url, stream=True, **kwargs)

  length = resp.headers.get('Content-Length', 0)
  if util.is_int(length) and int(length) > MAX_HTTP_RESPONSE_SIZE:
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = ('Content-Length %s is larger than our limit %s.' %
                                  (length, MAX_HTTP_RESPONSE_SIZE))

  return resp
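
A hedged usage sketch for the wrapper above; the URL is a placeholder, and HTTP_REQUEST_REFUSED_STATUS_CODE is the same module-level constant the example already uses.

resp = requests_get('https://example.com/big-download')  # placeholder URL
if resp.status_code == HTTP_REQUEST_REFUSED_STATUS_CODE:
  # the wrapper refused the fetch: blacklisted URL or Content-Length over the limit
  error_message = resp.content
else:
  body = resp.text  # with stream=True, the body is only downloaded here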
Example #2
    def upload_media(self, media):
        """Uploads one or more images or videos from web URLs.

    https://docs.joinmastodon.org/api/rest/media/

    Args:
      media: sequence of AS image or stream objects, eg:
        [{'url': 'http://picture', 'displayName': 'a thing'}, ...]

    Returns: list of string media ids for uploaded files
    """
        uploaded = set()  # URLs uploaded so far; for de-duping
        ids = []

        for obj in media:
            url = util.get_url(obj, key='stream') or util.get_url(obj)
            if not url or url in uploaded:
                continue

            data = {}
            alt = obj.get('displayName')
            if alt:
                data['description'] = util.ellipsize(alt, chars=MAX_ALT_LENGTH)

            # TODO: mime type check?
            with util.requests_get(url, stream=True) as fetch:
                fetch.raise_for_status()
                upload = self._post(API_MEDIA, files={'file': fetch.raw})

            logging.info('Got: %s', upload)
            media_id = upload['id']
            ids.append(media_id)
            uploaded.add(url)

        return ids
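
A hedged usage sketch, assuming masto is an instance of the class this method belongs to; the URLs are placeholders, and the stream-object shape is an assumption about the AS format described in the docstring.

media_ids = masto.upload_media([
    {'url': 'http://example.com/picture.jpg', 'displayName': 'a thing'},
    {'stream': {'url': 'http://example.com/video.mp4'}},  # assumed stream object shape
])
# duplicates and objects without a URL are skipped by the loop above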
Example #3
def requests_get(url, **kwargs):
    """Wraps requests.get with extra semantics and our user agent.

  If a server tells us a response will be too big (based on Content-Length), we
  hijack the response and return 599 and an error response body instead. We pass
  stream=True to requests.get so that it doesn't fetch the response body until
  we access response.content (or .text).

  http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
  """
    if url in URL_BLACKLIST:
        resp = requests.Response()
        resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
        resp._text = resp._content = 'Sorry, Bridgy has blacklisted this URL.'
        return resp

    kwargs.setdefault('headers', {}).update(USER_AGENT_HEADER)
    resp = util.requests_get(url, stream=True, **kwargs)

    length = resp.headers.get('Content-Length', 0)
    if util.is_int(length) and int(length) > MAX_HTTP_RESPONSE_SIZE:
        resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
        resp._text = resp._content = (
            'Content-Length %s is larger than our limit %s.' %
            (length, MAX_HTTP_RESPONSE_SIZE))

    return resp
Example #4
  def _fetch(self, url):
    """Fetches url and returns (string final url, unicode body)."""
    try:
      resp = util.requests_get(url, stream=True)
    except (ValueError, requests.URLRequired, requests.TooManyRedirects) as e:
      self.abort(400, str(e))
      # other exceptions are handled by webutil.handlers.handle_exception(),
      # which uses interpret_http_exception(), etc.

    if url != resp.url:
      url = resp.url
      logging.info('Redirected to %s', url)

    body = resp.text

    length = resp.headers.get('Content-Length')
    if util.is_int(length):
      length = int(length)
    if not length:
      length = len(body)
    if length > MAX_HTTP_RESPONSE_SIZE:
      self.abort(HTTP_RESPONSE_TOO_BIG_STATUS_CODE,
                 'Content-Length %s for %s is larger than our limit of %s.' %
                 (length, url, MAX_HTTP_RESPONSE_SIZE))

    return url, body
Example #5
    def _fetch(self, url):
        """Fetches url and returns (string final url, unicode body)."""
        try:
            resp = util.requests_get(url, stream=True)
        except (ValueError, requests.URLRequired,
                requests.TooManyRedirects) as e:
            self.abort(400, str(e))
            # other exceptions are handled by webutil.handlers.handle_exception(),
            # which uses interpret_http_exception(), etc.

        if url != resp.url:
            url = resp.url
            logging.info('Redirected to %s', url)

        body = resp.text

        length = resp.headers.get('Content-Length')
        if util.is_int(length):
            length = int(length)
        if not length:
            length = len(body)
        if length > MAX_HTTP_RESPONSE_SIZE:
            self.abort(
                HTTP_RESPONSE_TOO_BIG_STATUS_CODE,
                'Content-Length %s for %s is larger than our limit of %s.' %
                (length, url, MAX_HTTP_RESPONSE_SIZE))

        return url, body
Example #6
  def rest(self, url, data=None, parse_json=True, **kwargs):
    """Makes a v3 REST API call.

    Uses HTTP POST if data is provided, otherwise GET.

    Args:
      url: string
      data: dict, JSON payload for POST requests
      parse_json: boolean, whether to parse the response body as JSON and
        return it as a dict. If False, a :class:`requests.Response` is
        returned instead.

    Returns: dict decoded from the JSON response if parse_json=True, otherwise
      :class:`requests.Response`
    """
    kwargs['headers'] = kwargs.get('headers') or {}
    kwargs['headers'].update({
      'Authorization': 'token %s' % self.access_token,
      # enable the beta Reactions API
      # https://developer.github.com/v3/reactions/
      'Accept': 'application/vnd.github.squirrel-girl-preview+json',
    })

    if data is None:
      resp = util.requests_get(url, **kwargs)
    else:
      resp = util.requests_post(url, json=data, **kwargs)
    resp.raise_for_status()

    return json_loads(resp.text) if parse_json else resp
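
A hedged sketch of the two branches; gh, the URLs, and the payload are placeholders, not part of the example.

issue = gh.rest('https://api.github.com/repos/OWNER/REPO/issues/1')  # GET, parsed JSON dict
resp = gh.rest('https://api.github.com/repos/OWNER/REPO/issues',
               data={'title': 'hello'}, parse_json=False)  # POST, raw requests.Response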
Example #7
    def _fetch(self, url):
        """Fetches url and returns (string final url, unicode body)."""
        try:
            resp = util.requests_get(url)
        except (ValueError, requests.URLRequired) as e:
            self.abort(400, str(e))
            # other exceptions are handled by webutil.handlers.handle_exception(),
            # which uses interpret_http_exception(), etc.

        if url != resp.url:
            url = resp.url
            logging.info('Redirected to %s', url)
        body = resp.text

        return url, body
Example #8
def requests_get(url, **kwargs):
    """Wraps :func:`requests.get` with extra semantics and our user agent.

  If a server tells us a response will be too big (based on Content-Length), we
  hijack the response and return 599 and an error response body instead. We pass
  stream=True to :func:`requests.get` so that it doesn't fetch the response body
  until we access :attr:`requests.Response.content` (or
  :attr:`requests.Response.text`).

  http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
  """
    if url in URL_BLACKLIST:
        resp = requests.Response()
        resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
        resp._text = resp._content = 'Sorry, Bridgy has blacklisted this URL.'
        return resp

    kwargs.setdefault('headers', {}).update(request_headers(url=url))
    return util.requests_get(url, **kwargs)
Example #9
    def _scrape_json(url, cookie=None):
        """Fetches and returns JSON from www.instagram.com."""
        headers = {}
        if cookie:
            if not cookie.startswith('sessionid='):
                cookie = 'sessionid=' + cookie
            headers = {'Cookie': cookie}

        resp = util.requests_get(url, allow_redirects=False, headers=headers)
        resp.raise_for_status()

        try:
            return resp.json()
        except ValueError as e:
            msg = "Couldn't decode response as JSON:\n%s" % resp.text
            logging.exception(msg)
            resp.status_code = 504
            raise requests.HTTPError('504 Bad response from Instagram\n' + msg,
                                     response=resp)
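
A hedged usage sketch; the '?__a=1' JSON endpoint is mentioned in Example #17 below, and HTML_BASE_URL and the cookie value are assumptions borrowed from the other examples.

# hedged sketch: the exact URL and response structure are assumptions
data = self._scrape_json(HTML_BASE_URL + '?__a=1', cookie=sessionid_cookie)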
Example #10
  def rest(self, url, data=None, **kwargs):
    """Makes a v3 REST API call.

    Uses HTTP POST if data is provided, otherwise GET.

    Args:
      data: dict, JSON payload for POST requests

    Returns: `requests.Response`
    """
    kwargs['headers'] = kwargs.get('headers') or {}
    kwargs['headers'].update({
      'Authorization': 'token %s' % self.access_token,
      # enable the beta Reactions API
      # https://developer.github.com/v3/reactions/
      'Accept': 'application/vnd.github.squirrel-girl-preview+json',
    })

    if data is None:
      resp = util.requests_get(url, **kwargs)
    else:
      resp = util.requests_post(url, json=data, **kwargs)
    resp.raise_for_status()
    return resp
Example #11
 def fetch(url):
     return mf2py.parse(util.requests_get(url).text, url=url)
Example #12
 def fetch(url):
   return mf2py.parse(util.requests_get(url).text, url=url, img_with_alt=True)
Example #13
    def get(self):
        input = util.get_required_param(self, 'input')
        if input not in INPUTS:
            raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                                     (input, INPUTS))

        orig_url = util.get_required_param(self, 'url')
        fragment = urllib.parse.urlparse(orig_url).fragment
        if fragment and input != 'html':
            raise exc.HTTPBadRequest(
                'URL fragments only supported with input=html.')

        resp = util.requests_get(orig_url, gateway=True)
        final_url = resp.url

        # decode data
        if input in ('activitystreams', 'as1', 'as2', 'mf2-json', 'json-mf2',
                     'jsonfeed'):
            try:
                body_json = json_loads(resp.text)
                body_items = (body_json if isinstance(body_json, list) else
                              body_json.get('items') or [body_json])
            except (TypeError, ValueError):
                raise exc.HTTPBadRequest('Could not decode %s as JSON' %
                                         final_url)

        mf2 = None
        if input == 'html':
            mf2 = util.parse_mf2(resp, id=fragment)
            if fragment and not mf2:
                raise exc.HTTPBadRequest(
                    'Got fragment %s but no element found with that id.' %
                    fragment)
        elif input in ('mf2-json', 'json-mf2'):
            mf2 = body_json
            if not hasattr(mf2, 'get'):
                raise exc.HTTPBadRequest(
                    'Expected microformats2 JSON input to be dict, got %s' %
                    mf2.__class__.__name__)
            mf2.setdefault('rels', {})  # mf2util expects rels

        actor = None
        title = None
        hfeed = None
        if mf2:

            def fetch_mf2_func(url):
                if util.domain_or_parent_in(
                        urllib.parse.urlparse(url).netloc, SILO_DOMAINS):
                    return {
                        'items': [{
                            'type': ['h-card'],
                            'properties': {
                                'url': [url]
                            }
                        }]
                    }
                return util.fetch_mf2(url, gateway=True)

            try:
                actor = microformats2.find_author(
                    mf2, fetch_mf2_func=fetch_mf2_func)
                title = microformats2.get_title(mf2)
                hfeed = mf2util.find_first_entry(mf2, ['h-feed'])
            except (KeyError, ValueError) as e:
                raise exc.HTTPBadRequest('Could not parse %s as %s: %s' %
                                         (final_url, input, e))

        try:
            if input in ('as1', 'activitystreams'):
                activities = body_items
            elif input == 'as2':
                activities = [as2.to_as1(obj) for obj in body_items]
            elif input == 'atom':
                try:
                    activities = atom.atom_to_activities(resp.text)
                except ElementTree.ParseError as e:
                    raise exc.HTTPBadRequest('Could not parse %s as XML: %s' %
                                             (final_url, e))
                except ValueError as e:
                    raise exc.HTTPBadRequest('Could not parse %s as Atom: %s' %
                                             (final_url, e))
            elif input == 'html':
                activities = microformats2.html_to_activities(resp,
                                                              url=final_url,
                                                              id=fragment,
                                                              actor=actor)
            elif input in ('mf2-json', 'json-mf2'):
                activities = [
                    microformats2.json_to_object(item, actor=actor)
                    for item in mf2.get('items', [])
                ]
            elif input == 'jsonfeed':
                activities, actor = jsonfeed.jsonfeed_to_activities(body_json)
        except ValueError as e:
            logging.warning('parsing input failed', stack_info=True)
            self.abort(
                400,
                'Could not parse %s as %s: %s' % (final_url, input, str(e)))

        self.write_response(
            source.Source.make_activities_base_response(activities),
            url=final_url,
            actor=actor,
            title=title,
            hfeed=hfeed)
Example #14
    def _scrape(self,
                user_id=None,
                activity_id=None,
                cookie=None,
                fetch_extras=False,
                cache=None):
        """Scrapes a user's profile or feed and converts the media to activities.

    Args:
      user_id: string
      activity_id: string
      fetch_extras: boolean
      cookie: string

    Returns: dict activities API response
    """
        assert user_id or activity_id or cookie

        url = (HTML_MEDIA % self.id_to_shortcode(activity_id) if activity_id
               else self.user_url(user_id) if user_id else self.BASE_URL)
        kwargs = {}
        if cookie:
            kwargs = {'headers': {'Cookie': cookie}}
        resp = util.requests_get(url, allow_redirects=False, **kwargs)
        if ((cookie and 'not-logged-in' in resp.text) or
            (resp.status_code in (301, 302)
             and '/accounts/login' in resp.headers.get('Location', ''))):
            resp.status_code = 401
            raise requests.HTTPError('401 Unauthorized', response=resp)

        activities, actor = self.html_to_activities(resp.text)

        if fetch_extras and not activity_id:
            # batch get cached counts of comments and likes for all activities
            cached = {}
            # don't update the cache until the end, in case we hit an error before
            cache_updates = {}
            if cache is not None:
                keys = []
                for activity in activities:
                    _, id = util.parse_tag_uri(activity['id'])
                    keys.extend(['AIL ' + id, 'AIC ' + id])
                cached = cache.get_multi(keys)

            for i, activity in enumerate(activities):
                obj = activity['object']
                _, id = util.parse_tag_uri(activity['id'])
                likes = obj.get('ig_like_count') or 0
                comments = obj.get('replies', {}).get('totalItems') or 0
                likes_key = 'AIL %s' % id
                comments_key = 'AIC %s' % id

                if (likes and likes != cached.get(likes_key)
                        or comments and comments != cached.get(comments_key)):
                    full_activity, _ = self.html_to_activities(
                        util.requests_get(activity['url']).text)
                    if full_activity:
                        activities[i] = full_activity[0]
                        cache_updates.update({
                            likes_key: likes,
                            comments_key: comments
                        })

            if cache_updates and cache is not None:
                cache.set_multi(cache_updates)

        resp = self.make_activities_base_response(activities)
        resp['actor'] = actor
        return resp
Example #15
  def _scrape(self, user_id=None, activity_id=None, cookie=None,
              fetch_extras=False, cache=None, shortcode=None):
    """Scrapes a user's profile or feed and converts the media to activities.

    Args:
      user_id: string
      activity_id: string, e.g. '1020355224898358984_654594'
      fetch_extras: boolean
      cookie: string
      shortcode: string, e.g. '4pB6vEx87I'

    Returns:
      dict activities API response
    """
    assert user_id or activity_id or shortcode or cookie
    assert not (activity_id and shortcode)

    if not shortcode:
      shortcode = self.id_to_shortcode(activity_id)

    url = (HTML_MEDIA % shortcode if shortcode
           else HTML_PROFILE % user_id if user_id
           else HTML_BASE_URL)
    kwargs = {}
    if cookie:
      kwargs = {'headers': {'Cookie': cookie}}
    resp = util.requests_get(url, allow_redirects=False, **kwargs)
    if ((cookie and 'not-logged-in' in resp.text) or
        (resp.status_code in (301, 302) and
         '/accounts/login' in resp.headers.get('Location', ''))):
      resp.status_code = 401
      raise requests.HTTPError('401 Unauthorized', response=resp)
    elif resp.status_code == 404:
      if activity_id:
        return self._scrape(shortcode=activity_id, cookie=cookie)
      # otherwise not found, fall through and return empty response
    else:
      resp.raise_for_status()

    activities, actor = self.html_to_activities(resp.text)

    if fetch_extras and not activity_id:
      # batch get cached counts of comments and likes for all activities
      cached = {}
      # don't update the cache until the end, in case we hit an error before
      cache_updates = {}
      if cache is not None:
        keys = []
        for activity in activities:
          _, id = util.parse_tag_uri(activity['id'])
          keys.extend(['AIL ' + id, 'AIC ' + id])
        cached = cache.get_multi(keys)

      for i, activity in enumerate(activities):
        obj = activity['object']
        _, id = util.parse_tag_uri(activity['id'])
        likes = obj.get('ig_like_count') or 0
        comments = obj.get('replies', {}).get('totalItems') or 0
        likes_key = 'AIL %s' % id
        comments_key = 'AIC %s' % id

        if (likes and likes != cached.get(likes_key) or
            comments and comments != cached.get(comments_key)):
          url = activity['url'].replace(self.BASE_URL, HTML_BASE_URL)
          resp = util.requests_get(url)
          resp.raise_for_status()
          full_activity, _ = self.html_to_activities(resp.text)
          if full_activity:
            activities[i] = full_activity[0]
            cache_updates.update({likes_key: likes, comments_key: comments})

      if cache_updates and cache is not None:
        cache.set_multi(cache_updates)

    resp = self.make_activities_base_response(activities)
    resp['actor'] = actor
    return resp
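
A hedged usage sketch for the scraper above, assuming make_activities_base_response exposes the converted activities under 'items'; the user id, cookie, and cache client are placeholders.

resp = self._scrape(user_id='USER_ID', cookie=COOKIE, fetch_extras=True,
                    cache=memcache_client)
activities = resp.get('items', [])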
Example #16
    def _scrape(self,
                user_id=None,
                group_id=None,
                activity_id=None,
                cookie=None,
                count=None,
                fetch_extras=False,
                cache=None,
                shortcode=None):
        """Scrapes a user's profile or feed and converts the media to activities.

    Args:
      user_id: string
      activity_id: string, e.g. '1020355224898358984_654594'
      count: integer, number of activities to fetch and return, None for all
      fetch_extras: boolean
      cookie: string
      shortcode: string, e.g. '4pB6vEx87I'

    Returns:
      dict activities API response
    """
        assert user_id or activity_id or shortcode or cookie
        assert not (activity_id and shortcode)

        if not shortcode:
            shortcode = self.id_to_shortcode(activity_id)

        url = (
            HTML_MEDIA % shortcode if shortcode else HTML_PROFILE %
            user_id if user_id and group_id == source.SELF else HTML_BASE_URL)
        kwargs = {}
        if cookie:
            if not cookie.startswith('sessionid='):
                cookie = 'sessionid=' + cookie
            kwargs = {'headers': {'Cookie': cookie}}
        resp = util.requests_get(url, allow_redirects=False, **kwargs)
        if ((cookie and 'not-logged-in' in resp.text) or
            (resp.status_code in (301, 302)
             and '/accounts/login' in resp.headers.get('Location', ''))):
            resp.status_code = 401
            raise requests.HTTPError('401 Unauthorized', response=resp)
        elif resp.status_code == 404:
            if activity_id:
                return self._scrape(shortcode=activity_id,
                                    cookie=cookie,
                                    count=count)
            # otherwise not found, fall through and return empty response
        else:
            resp.raise_for_status()

        activities, actor = self.html_to_activities(resp.text,
                                                    cookie=cookie,
                                                    count=count)

        if fetch_extras:
            # batch get cached counts of comments and likes for all activities
            cached = {}
            # don't update the cache until the end, in case we hit an error before
            cache_updates = {}
            if cache is not None:
                keys = []
                for activity in activities:
                    _, id = util.parse_tag_uri(activity['id'])
                    keys.extend(['AIL ' + id, 'AIC ' + id])
                cached = cache.get_multi(keys)

            for i, activity in enumerate(activities):
                obj = activity['object']
                _, id = util.parse_tag_uri(activity['id'])
                likes = obj.get('ig_like_count') or 0
                comments = obj.get('replies', {}).get('totalItems') or 0
                likes_key = 'AIL %s' % id
                comments_key = 'AIC %s' % id

                if (likes and likes != cached.get(likes_key)
                        or comments and comments != cached.get(comments_key)):
                    if not activity_id and not shortcode:
                        url = activity['url'].replace(self.BASE_URL,
                                                      HTML_BASE_URL)
                        resp = util.requests_get(url)
                        resp.raise_for_status()
                    # otherwise resp is a fetch of just this activity; reuse it

                    full_activity, _ = self.html_to_activities(
                        resp.text,
                        cookie=cookie,
                        count=count,
                        fetch_extras=fetch_extras)
                    if full_activity:
                        activities[i] = full_activity[0]
                        cache_updates.update({
                            likes_key: likes,
                            comments_key: comments
                        })

            if cache_updates and cache is not None:
                cache.set_multi(cache_updates)

        resp = self.make_activities_base_response(activities)
        resp['actor'] = actor
        return resp
Example #17
    def html_to_activities(self, html, cookie=None):
        """Converts Instagram HTML to ActivityStreams activities.

    The input HTML may be from:

    * a user's feed, eg https://www.instagram.com/ while logged in
    * a user's profile, eg https://www.instagram.com/snarfed/
    * a photo or video, eg https://www.instagram.com/p/BBWCSrfFZAk/

    Args:
      html: unicode string
      cookie: string, optional sessionid cookie to be used for subsequent HTTP
        fetches, if necessary.

    Returns:
      tuple, ([ActivityStreams activities], ActivityStreams viewer actor)
    """
        # extract JSON data blob
        # (can also get just this JSON by adding ?__a=1 to any IG URL.)
        script_start = '<script type="text/javascript">window._sharedData = '
        start = html.find(script_start)
        if start == -1:
            # Instagram sometimes returns 200 with incomplete HTML. often it stops at
            # the end of one of the <style> tags inside <head>. not sure why.
            logging.warning('JSON script tag not found!')
            return [], None

        # App Engine's Python 2.7.5 json module doesn't support unpaired surrogate
        # Unicode chars, so it chokes on some JSON docs. Monkey patch in simplejson
        # to fix that.
        # https://code.google.com/p/googleappengine/issues/detail?id=12823
        # http://stackoverflow.com/questions/15236742
        try:
            import simplejson
            json_module = simplejson
        except ImportError:
            json_module = json

        start += len(script_start)
        end = html.find(';</script>', start)
        if end == -1:
            # as mentioned above, Instagram sometimes returns 200 with incomplete HTML
            logging.warning('JSON script close tag not found!')
            return [], None
        data = util.trim_nulls(json_module.loads(html[start:end]))

        entry_data = data.get('entry_data', {})
        activities = []

        # find media
        medias = []
        profile_user = None

        # home page ie news feed
        for page in entry_data.get('FeedPage', []):
            edges = page.get('graphql', {}).get('user', {})\
                        .get('edge_web_feed_timeline', {}).get('edges', [])
            medias.extend(
                e.get('node') for e in edges
                if e.get('node', {}).get('__typename') not in (
                    'GraphSuggestedUserFeedUnit', ))

        # profiles
        for page in entry_data.get('ProfilePage', []):
            profile_user = page.get('graphql', {}).get('user', {})
            medias.extend(edge['node'] for edge in profile_user.get(
                'edge_owner_to_timeline_media', {}).get('edges', [])
                          if edge.get('node'))

        # individual photo/video permalinks
        for page in entry_data.get('PostPage', []):
            media = page.get('graphql', {}).get('shortcode_media')
            if media:
                medias.append(media)

        if not medias:
            # As of 2018-02-15, embedded JSON in logged in https://www.instagram.com/
            # no longer has any useful data. Need to do a second header link fetch.
            soup = BeautifulSoup(html)
            link = soup.find('link', href=HTML_PRELOAD_RE)
            if link:
                url = urllib.parse.urljoin(HTML_BASE_URL, link['href'])
                headers = {'Cookie': cookie} if cookie else None
                resp = util.requests_get(url,
                                         allow_redirects=False,
                                         headers=headers)

                try:
                    data = resp.json()
                except ValueError as e:
                    msg = "Couldn't decode response as JSON:\n%s" % resp.text
                    logging.exception(msg)
                    resp.status_code = 504
                    raise requests.HTTPError(
                        '504 Bad response from Instagram\n' + msg,
                        response=resp)

                edges = data.get('data', {}).get('user', {})\
                            .get('edge_web_feed_timeline', {}).get('edges', [])
                medias = [e.get('node') for e in edges]

        for media in util.trim_nulls(medias):
            activities.append(
                self._json_media_node_to_activity(media, user=profile_user))

        actor = None
        user = self._json_user_to_user(
            data.get('config', {}).get('viewer') or profile_user)
        if user:
            actor = self.user_to_actor(user)

        return activities, actor
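
A hedged sketch of feeding this converter from util.requests_get, as the _scrape examples above do; HTML_BASE_URL and the cookie value are placeholders borrowed from those examples.

html = util.requests_get(HTML_BASE_URL, allow_redirects=False,
                         headers={'Cookie': cookie}).text
activities, viewer = self.html_to_activities(html, cookie=cookie)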