Example 1
def requests_get(url, **kwargs):
  """Wraps requests.get and injects our timeout and user agent.

  If a server tells us a response will be too big (based on Content-Length), we
  hijack the response and return 599 and an error response body instead. We pass
  stream=True to requests.get so that it doesn't fetch the response body until
  we access response.content (or .text).

  http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
  """
  if url in URL_BLACKLIST:
    resp = requests.Response()
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = 'Sorry, Bridgy has blacklisted this URL.'
    return resp

  kwargs.setdefault('headers', {}).update(USER_AGENT_HEADER)
  kwargs.setdefault('timeout', HTTP_TIMEOUT)
  resp = requests.get(url, stream=True, **kwargs)

  length = resp.headers.get('Content-Length', 0)
  if util.is_int(length) and int(length) > MAX_HTTP_RESPONSE_SIZE:
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = ('Content-Length %s is larger than our limit %s.' %
                                  (length, MAX_HTTP_RESPONSE_SIZE))

  return resp
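
Every example on this page leans on util.is_int from webutil, which isn't shown here. A minimal sketch of what such a helper might look like (hypothetical, not the actual webutil implementation):

def is_int(arg):
  """Returns True if arg converts cleanly to an int (hypothetical sketch)."""
  try:
    return int(arg) == float(arg)  # rejects values like 1.5 that would truncate
  except (ValueError, TypeError):
    return False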
Example 2
    def _fetch(self, url):
        """Fetches url and returns (string final url, unicode body)."""
        try:
            resp = util.requests_get(url, stream=True)
        except (ValueError, requests.URLRequired,
                requests.TooManyRedirects) as e:
            self.abort(400, str(e))
            # other exceptions are handled by webutil.handlers.handle_exception(),
            # which uses interpret_http_exception(), etc.

        if url != resp.url:
            url = resp.url
            logging.info('Redirected to %s', url)

        body = resp.text

        length = resp.headers.get('Content-Length')
        if util.is_int(length):
            length = int(length)
        if not length:
            length = len(body)
        if length > MAX_HTTP_RESPONSE_SIZE:
            self.abort(
                HTTP_RESPONSE_TOO_BIG_STATUS_CODE,
                'Content-Length %s for %s is larger than our limit of %s.' %
                (length, url, MAX_HTTP_RESPONSE_SIZE))

        return url, body
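
The length check above prefers the Content-Length header and falls back to the decoded body length. The same logic in isolation, as a hypothetical standalone helper:

def effective_length(headers, body):
  """Mirrors the fallback: trust a numeric Content-Length, else len(body)."""
  length = headers.get('Content-Length')
  if length is not None and str(length).isdigit():
    return int(length)
  return len(body)

print(effective_length({'Content-Length': '12'}, 'ignored body'))  # 12
print(effective_length({}, 'hello'))                               # 5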
Example 3
def requests_get(url, **kwargs):
    """Wraps requests.get with extra semantics and our user agent.

    If a server tells us a response will be too big (based on Content-Length), we
    hijack the response and return 599 and an error response body instead. We pass
    stream=True to requests.get so that it doesn't fetch the response body until
    we access response.content (or .text).

    http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
    """
    if url in URL_BLACKLIST:
        resp = requests.Response()
        resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
        resp._text = resp._content = 'Sorry, Bridgy has blacklisted this URL.'
        return resp

    kwargs.setdefault('headers', {}).update(USER_AGENT_HEADER)
    resp = util.requests_get(url, stream=True, **kwargs)

    length = resp.headers.get('Content-Length', 0)
    if util.is_int(length) and int(length) > MAX_HTTP_RESPONSE_SIZE:
        resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
        resp._text = resp._content = (
            'Content-Length %s is larger than our limit %s.' %
            (length, MAX_HTTP_RESPONSE_SIZE))

    return resp
Example 4
def requests_get(url, **kwargs):
  """Wraps :func:`requests.get` with extra semantics and our user agent.

  If a server tells us a response will be too big (based on Content-Length), we
  hijack the response and return 599 and an error response body instead. We pass
  stream=True to :func:`requests.get` so that it doesn't fetch the response body
  until we access :attr:`requests.Response.content` (or
  :attr:`requests.Response.text`).

  http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
  """
  if url in URL_BLACKLIST:
    resp = requests.Response()
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = 'Sorry, Bridgy has blacklisted this URL.'
    return resp

  kwargs.setdefault('headers', {}).update(request_headers(url=url))
  resp = util.requests_get(url, stream=True, **kwargs)

  length = resp.headers.get('Content-Length', 0)
  if util.is_int(length) and int(length) > MAX_HTTP_RESPONSE_SIZE:
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = ('Content-Length %s is larger than our limit %s.' %
                                  (length, MAX_HTTP_RESPONSE_SIZE))

  return resp
Example 5
  def _fetch(self, url):
    """Fetches url and returns (string final url, unicode body)."""
    try:
      resp = util.requests_get(url, stream=True)
    except (ValueError, requests.URLRequired, requests.TooManyRedirects) as e:
      self.abort(400, str(e))
      # other exceptions are handled by webutil.handlers.handle_exception(),
      # which uses interpret_http_exception(), etc.

    if url != resp.url:
      url = resp.url
      logging.info('Redirected to %s', url)

    body = resp.text

    length = resp.headers.get('Content-Length')
    if util.is_int(length):
      length = int(length)
    if not length:
      length = len(body)
    if length > MAX_HTTP_RESPONSE_SIZE:
      self.abort(HTTP_RESPONSE_TOO_BIG_STATUS_CODE,
                 'Content-Length %s for %s is larger than our limit of %s.' %
                 (length, url, MAX_HTTP_RESPONSE_SIZE))

    return url, body
Example 6
  def user_to_actor(self, user):
    """Converts a user or page to an actor.

    Args:
      user: dict, a decoded JSON Facebook user or page

    Returns:
      an ActivityStreams actor dict, ready to be JSON-encoded
    """
    if not user:
      return {}

    id = user.get('id')
    username = user.get('username')
    handle = username or id
    if not handle:
      return {}

    # facebook implements this as a 302 redirect
    actor = {
      # FB only returns the type field if you fetch the object with ?metadata=1
      # https://developers.facebook.com/docs/graph-api/using-graph-api/v2.2#introspection
      'objectType': 'page' if user.get('type') == 'page' else 'person',
      'displayName': user.get('name') or username,
      'id': self.tag_uri(handle),
      'updated': util.maybe_iso8601_to_rfc3339(user.get('updated_time')),
      'username': username,
      'description': user.get('bio') or user.get('description'),
      'summary': user.get('about'),
      }

    # numeric_id is our own custom field that always has the source's numeric
    # user id, if available.
    if util.is_int(id):
      actor.update({
        'numeric_id': id,
        'image': {
          'url': 'https://graph.facebook.com/v2.2/%s/picture?type=large' % id,
        },
      })

    # extract web site links. extract_links uniquifies and preserves order
    urls = util.extract_links(user.get('website'))
    if not urls:
      urls = util.extract_links(user.get('link')) or [self.user_url(handle)]
    actor['url'] = urls[0]
    if len(urls) > 1:
      actor['urls'] = [{'value': u} for u in urls]

    location = user.get('location')
    if location:
      actor['location'] = {'id': location.get('id'),
                           'displayName': location.get('name')}

    return util.trim_nulls(actor)
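
A hypothetical input and the rough shape of the output (all field values illustrative):

user = {
  'id': '212038',
  'username': 'snarfed.org',
  'name': 'Ryan Barrett',
  'bio': 'a short bio',
}
# user_to_actor(user) would return roughly:
# {
#   'objectType': 'person',
#   'displayName': 'Ryan Barrett',
#   'id': self.tag_uri('snarfed.org'),
#   'username': 'snarfed.org',
#   'description': 'a short bio',
#   'numeric_id': '212038',
#   'image': {'url': 'https://graph.facebook.com/v2.2/212038/picture?type=large'},
#   'url': self.user_url('snarfed.org'),
# }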
Example 7
  def base_id(cls, url):
    """Extracts and returns a USERNAME:REPO:ID id for an issue or PR.

    Args:
      url: string

    Returns:
      string, or None
    """
    parts = urllib.parse.urlparse(url).path.strip('/').split('/')
    if len(parts) == 4 and util.is_int(parts[3]):
      return ':'.join((parts[0], parts[1], parts[3]))
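
Illustrative calls (URLs hypothetical; base_id is defined on the GitHub source class):

base_id('https://github.com/snarfed/bridgy/issues/456789')
# -> 'snarfed:bridgy:456789', since parts[3] ('456789') passes util.is_int
base_id('https://github.com/snarfed/bridgy')
# -> None (implicitly): only two path parts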
Example 8
  def user_to_actor(self, account):
    """Converts a Mastodon account to an AS1 actor.

    Args:
      account: dict, Mastodon account

    Returns: dict, AS1 actor
    """
    domain = self.DOMAIN
    username = account.get('username')

    # parse acct. it's just username for local accounts but fully qualified
    # address for remote accounts, e.g. user@instance.com.
    acct = account.get('acct') or ''
    split = acct.split('@')
    if len(split) in (2, 3):
      acct_username, acct_domain = split[-2:]
      if acct_domain:
        domain = acct_domain
      if not username:
        username = acct_username
      elif acct_username and username != acct_username:
        raise ValueError('username %s and acct %s conflict!' % (username, acct))

    if not username:
      return {}

    url = account.get('url')
    # mastodon's 'Web site' fields are HTML links, so extract their URLs
    web_sites = sum((util.extract_links(f.get('value'))
                     for f in (account.get('fields') or [])), [])

    # account.created_at is string ISO8601 in Mastodon, int timestamp in Pixelfed
    published = account.get('created_at')
    if util.is_int(published) or util.is_float(published):
      published = util.maybe_timestamp_to_iso8601(published)

    return util.trim_nulls({
      'objectType': 'person',
      'id': util.tag_uri(domain, username),
      'numeric_id': account.get('id'),
      'username': username,
      'displayName': account.get('display_name') or acct or username,
      'url': url,
      'urls': [{'value': u} for u in [url] + web_sites],
      'image': {'url': account.get('avatar')},
      'published': published,
      'description': account.get('note'),
    })
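
A sketch of how a remote account flows through (values hypothetical):

account = {
  'id': '23507',
  'username': 'alice',
  'acct': 'alice@instance.com',
  'url': 'https://instance.com/@alice',
  'avatar': 'https://instance.com/avatar.jpg',
}
# acct splits into ('alice', 'instance.com'), so domain becomes 'instance.com'
# and the actor id is util.tag_uri('instance.com', 'alice').
# user_to_actor(account) returns roughly:
# {'objectType': 'person', 'numeric_id': '23507', 'username': 'alice',
#  'displayName': 'alice@instance.com', 'url': 'https://instance.com/@alice',
#  'image': {'url': 'https://instance.com/avatar.jpg'}, ...}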
Example 9
def size_to_bytes(size):
  """Converts a string file size to an integer number of bytes.

  Args:
    size: string, may be either integer bytes or a human-readable approximation,
      eg 7MB or 1.23 kb

  Returns: integer, bytes, or None if size can't be parsed
  """
  if util.is_int(size):
    return int(size)

  if not size:
    return None

  try:
    return humanfriendly.parse_size(size)
  except humanfriendly.InvalidSize:
    logging.debug("Couldn't parse size %r", size)
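
Illustrative inputs, assuming humanfriendly's default decimal units (1 MB = 10^6 bytes):

size_to_bytes('12345')    # 12345, via the util.is_int fast path
size_to_bytes('7MB')      # 7000000, via humanfriendly.parse_size
size_to_bytes(None)       # None
size_to_bytes('garbage')  # None, after logging a debug message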
Example 10
  def get_comment(self, comment_id, **kwargs):
    """Fetches and returns a comment.

    Args:
      comment_id: string comment id (either REST or GraphQL), of the form
        USER:REPO:COMMENT_ID, e.g. snarfed:bridgy:456789

    Returns: dict, an ActivityStreams comment object
    """
    parts = tuple(comment_id.split(':'))
    if len(parts) != 3:
      raise ValueError('GitHub comment ids must be of the form USER:REPO:COMMENT_ID')

    if util.is_int(parts[2]):  # REST API id
      comment = self.rest(REST_API_COMMENT % parts).json()
    else:  # GraphQL node id
      comment = self.graphql(GRAPHQL_COMMENT, {'id': parts[2]})['node']

    return self.comment_to_object(comment)
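
The third part of the id picks the API (ids illustrative; source is an instance of this class):

source.get_comment('snarfed:bridgy:456789')                    # numeric id -> REST API
source.get_comment('snarfed:bridgy:MDEyOklzc3VlQ29tbWVudDE=')  # node id -> GraphQL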
Example 11
    def id_to_shortcode(id):
        """Converts a media id to the shortcode used in its instagram.com URL.

        Based on http://carrot.is/coding/instagram-ids , which determined that
        shortcodes are just URL-safe base64 encoded ids.
        """
        if not id:
            return None

        if isinstance(id, basestring):
            parts = id.split('_')
            if not util.is_int(parts[0]):
                return id
            id = int(parts[0])

        chars = []
        while id > 0:
            id, rem = divmod(id, 64)
            chars.append(BASE64[rem])

        return ''.join(reversed(chars))
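
The snippet assumes a module-level BASE64 alphabet. Per the carrot.is article it references, shortcodes use URL-safe base64, so it would presumably be:

BASE64 = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
          'abcdefghijklmnopqrstuvwxyz'
          '0123456789-_')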
Example 12
def get_webmention_target(url, resolve=True, replace_test_domains=True):
    """Resolves a URL and decides whether we should try to send it a webmention.

    Note that this ignores failed HTTP requests, ie the boolean in the returned
    tuple will be true! TODO: check callers and reconsider this.

    Args:
      url: string
      resolve: whether to follow redirects
      replace_test_domains: whether to replace test user domains with localhost

    Returns:
      (string url, string pretty domain, boolean) tuple. The boolean is
      True if we should send a webmention, False otherwise, e.g. if it's a bad
      URL, not text/html, or in the blacklist.
    """
    url = util.clean_url(url)
    try:
        domain = domain_from_link(url).lower()
    except BaseException:
        logging.info('Dropping bad URL %s.', url)
        return url, None, False

    send = True
    if resolve:
        # this follows *all* redirects, until the end
        resolved = follow_redirects(url, cache=memcache)
        html = resolved.headers.get('content-type', '').startswith('text/html')
        length = resolved.headers.get('Content-Length', 0)
        too_big = util.is_int(length) and int(length) > MAX_HTTP_RESPONSE_SIZE
        send = html and not too_big
        url, domain, _ = get_webmention_target(
            resolved.url,
            resolve=False,
            replace_test_domains=replace_test_domains)

    send = send and domain and not in_webmention_blacklist(domain)
    if replace_test_domains:
        url = replace_test_domains_with_localhost(url)
    return url, domain, send
Example 13
  def id_to_shortcode(id):
    """Converts a media id to the shortcode used in its instagram.com URL.

    Based on http://carrot.is/coding/instagram-ids , which determined that
    shortcodes are just URL-safe base64 encoded ids.
    """
    if not id:
      return None

    if isinstance(id, basestring):
      parts = id.split('_')
      if not util.is_int(parts[0]):
        return id
      id = int(parts[0])

    chars = []
    while id > 0:
      id, rem = divmod(id, 64)
      chars.append(BASE64[rem])

    return ''.join(reversed(chars))
Example 14
def get_webmention_target(url, resolve=True, replace_test_domains=True):
  """Resolves a URL and decides whether we should try to send it a webmention.

  Note that this ignores failed HTTP requests, ie the boolean in the returned
  tuple will be true! TODO: check callers and reconsider this.

  Args:
    url: string
    resolve: whether to follow redirects
    replace_test_domains: whether to replace test user domains with localhost

  Returns:
    (string url, string pretty domain, boolean) tuple. The boolean is
    True if we should send a webmention, False otherwise, e.g. if it's a bad
    URL, not text/html, or in the blacklist.
  """
  url = util.clean_url(url)
  try:
    domain = domain_from_link(url).lower()
  except BaseException:
    logging.info('Dropping bad URL %s.', url)
    return url, None, False

  send = True
  if resolve:
    # this follows *all* redirects, until the end
    resolved = follow_redirects(url, cache=memcache)
    html = resolved.headers.get('content-type', '').startswith('text/html')
    length = resolved.headers.get('Content-Length', 0)
    too_big = util.is_int(length) and int(length) > MAX_HTTP_RESPONSE_SIZE
    send = html and not too_big
    url, domain, _ = get_webmention_target(
      resolved.url, resolve=False, replace_test_domains=replace_test_domains)

  send = send and domain and not in_webmention_blacklist(domain)
  if replace_test_domains:
    url = replace_test_domains_with_localhost(url)
  return url, domain, send
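
Typical use (URL illustrative):

url, domain, send = get_webmention_target('http://example.com/post')
if send:
  print('send a webmention to %s (domain %s)' % (url, domain))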
Example 15
  def upload_video(self, url):
    """Uploads a video from web URLs using the chunked upload process.

    Chunked upload consists of multiple API calls:
    * command=INIT, which allocates the media id
    * command=APPEND for each 5MB block, up to 15MB total
    * command=FINALIZE

    https://dev.twitter.com/rest/reference/post/media/upload-chunked
    https://dev.twitter.com/rest/public/uploading-media#chunkedupload

    Args:
      url: string URL of the video

    Returns: string media id or CreationResult on error
    """
    video_resp = util.urlopen(url)
    bad_type = self._check_mime_type(url, video_resp, VIDEO_MIME_TYPES, 'MP4 videos')
    if bad_type:
      return bad_type

    length = video_resp.headers.get('Content-Length')
    if not util.is_int(length):
      msg = "Couldn't determine your video's size."
      return source.creation_result(abort=True, error_plain=msg, error_html=msg)

    length = int(length)
    if length > MAX_VIDEO_SIZE:
      msg = "Your %sMB video is larger than Twitter's %dMB limit." % (
        length // MB, MAX_VIDEO_SIZE // MB)
      return source.creation_result(abort=True, error_plain=msg, error_html=msg)

    # INIT
    media_id = self.urlopen(API_UPLOAD_MEDIA, data=urllib.urlencode({
      'command': 'INIT',
      'media_type': 'video/mp4',
      'total_bytes': length,
    }))['media_id_string']

    # APPEND
    headers = twitter_auth.auth_header(
      API_UPLOAD_MEDIA, self.access_token_key, self.access_token_secret, 'POST')

    i = 0
    while True:
      chunk = util.FileLimiter(video_resp, UPLOAD_CHUNK_SIZE)
      data = {
        'command': 'APPEND',
        'media_id': media_id,
        'segment_index': i,
      }
      resp = util.requests_post(API_UPLOAD_MEDIA, data=data,
                                files={'media': chunk}, headers=headers)
      resp.raise_for_status()

      if chunk.ateof:
        break
      i += 1

    # FINALIZE
    self.urlopen(API_UPLOAD_MEDIA, data=urllib.urlencode({
      'command': 'FINALIZE',
      'media_id': media_id,
    }))

    return media_id
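
A standalone sketch of the APPEND chunking idea with a plain file object (the UPLOAD_CHUNK_SIZE value is assumed here, matching the 5MB blocks the docstring mentions):

import io

UPLOAD_CHUNK_SIZE = 5 * 1024 * 1024  # assumed value

def iter_chunks(fileobj, size=UPLOAD_CHUNK_SIZE):
  """Yields successive reads of at most size bytes until EOF."""
  while True:
    chunk = fileobj.read(size)
    if not chunk:
      return
    yield chunk

for i, chunk in enumerate(iter_chunks(io.BytesIO(b'fake video bytes'))):
  print('APPEND segment_index=%d: %d bytes' % (i, len(chunk)))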
Example 16
  def base_object(self, obj, verb=None, resolve_numeric_id=False):
    """Returns the 'base' silo object that an object operates on.

    This is mostly a big bag of heuristics for reverse engineering and
    parsing Facebook URLs. Whee.

    Args:
      obj: ActivityStreams object
      verb: string, optional
      resolve_numeric_id: if True, tries harder to populate the numeric_id field
        by making an additional API call to look up the object if necessary.

    Returns: dict, minimal ActivityStreams object. Usually has at least id,
      numeric_id, and url fields; may also have author.
    """
    base_obj = super(Facebook, self).base_object(obj)

    url = base_obj.get('url')
    if not url:
      return base_obj

    author = base_obj.setdefault('author', {})
    base_id = base_obj.get('id')
    if base_id and not base_obj.get('numeric_id'):
      if util.is_int(base_id):
        base_obj['numeric_id'] = base_id
      elif resolve_numeric_id:
        base_obj = self.user_to_actor(self.urlopen(base_id))

    try:
      parsed = urlparse.urlparse(url)
      params = urlparse.parse_qs(parsed.query)
      assert parsed.path.startswith('/')
      path = parsed.path.strip('/')
      path_parts = path.split('/')

      if len(path_parts) == 1:
        if not base_obj.get('objectType'):
          base_obj['objectType'] = 'person'  # or page
        if not base_id:
          base_id = base_obj['id'] = path_parts[0]
        # this is a gross hack - adding the FB username field to an AS object
        # and then re-running user_to_actor - but it's an easy/reusable way to
        # populate image, displayName, etc.
        if not base_obj.get('username') and not util.is_int(base_id):
          base_obj['username'] = base_id
        base_obj.update({k: v for k, v in self.user_to_actor(base_obj).items()
                         if k not in base_obj})

      elif len(path_parts) >= 3 and path_parts[1] == 'posts':
        author_id = path_parts[0]
        if not author.get('id'):
          author['id'] = author_id
        if util.is_int(author_id) and not author.get('numeric_id'):
          author['numeric_id'] = author_id

      # photo URLs look like:
      # https://www.facebook.com/photo.php?fbid=123&set=a.4.5.6&type=1
      # https://www.facebook.com/user/photos/a.12.34.56/78/?type=1&offset=0
      if path == 'photo.php':
        fbids = params.get('fbid')
        if fbids:
          base_obj['id'] = fbids[0]

      # photo album URLs look like this:
      # https://www.facebook.com/media/set/?set=a.12.34.56
      # c.f. http://stackoverflow.com/questions/18549744
      elif path == 'media/set':
        set_id = params.get('set')
        if set_id and set_id[0].startswith('a.'):
          base_obj['id'] = set_id[0].split('.')[1]

      comment_id = params.get('comment_id') or params.get('reply_comment_id')
      if comment_id:
        base_obj['id'] += '_' + comment_id[0]
        base_obj['objectType'] = 'comment'

      if '_' not in base_id and author.get('numeric_id'):
        # add author user id prefix. https://github.com/snarfed/bridgy/issues/229
        base_obj['id'] = '%s_%s' % (author['numeric_id'], base_id)

    except BaseException as e:
      logging.error(
        "Couldn't parse object URL %s : %s. Falling back to default logic.",
        url, e)

    return base_obj
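
The query-parameter handling above boils down to this (runnable on Python 2, where these names live in the urlparse module; urllib.parse on Python 3):

import urlparse

parsed = urlparse.urlparse(
  'https://www.facebook.com/photo.php?fbid=123&set=a.4.5.6&type=1')
params = urlparse.parse_qs(parsed.query)
print(parsed.path)                      # /photo.php
print(params['fbid'])                   # ['123']
print(params['set'][0].split('.')[1])   # 'a.4.5.6' -> '4'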
Example 17
def object_to_json(obj, trim_nulls=True, entry_class='h-entry',
                   default_object_type=None, synthesize_content=True):
  """Converts an ActivityStreams object to microformats2 JSON.

  Args:
    obj: dict, a decoded JSON ActivityStreams object
    trim_nulls: boolean, whether to remove elements with null or empty values
    entry_class: string or sequence, the mf2 class(es) that entries should be
      given (e.g. 'h-cite' when parsing a reference to a foreign entry).
      defaults to 'h-entry'
    default_object_type: string, the ActivityStreams objectType to use if one
      is not present. defaults to None
    synthesize_content: whether to generate synthetic content if the object
      doesn't have its own, e.g. 'likes this.' or 'shared this.'

  Returns:
    dict, decoded microformats2 JSON
  """
  if not obj or not isinstance(obj, dict):
    return {}

  obj_type = source.object_type(obj) or default_object_type
  # if the activity type is a post, then it's really just a conduit
  # for the object. for other verbs, the activity itself is the
  # interesting thing
  if obj_type == 'post':
    primary = obj.get('object', {})
    obj_type = source.object_type(primary) or default_object_type
  else:
    primary = obj

  # TODO: extract snippet
  name = primary.get('displayName', primary.get('title'))
  summary = primary.get('summary')
  author = obj.get('author', obj.get('actor', {}))

  in_reply_tos = obj.get('inReplyTo') or []
  if not in_reply_tos:
    context = obj.get('context')
    if context and isinstance(context, dict):
      in_reply_tos = context.get('inReplyTo') or []

  is_rsvp = obj_type in ('rsvp-yes', 'rsvp-no', 'rsvp-maybe')
  if (is_rsvp or obj_type == 'react') and obj.get('object'):
    objs = obj['object']
    in_reply_tos.extend(objs if isinstance(objs, list) else [objs])

  # maps objectType to list of objects
  attachments = defaultdict(list)
  for prop in 'attachments', 'tags':
    for elem in get_list(primary, prop):
      attachments[elem.get('objectType')].append(elem)

  # prefer duration and size from object's stream, then first video, then first
  # audio
  stream = {}
  for candidate in [obj] + attachments['video'] + attachments['audio']:
    for stream in get_list(candidate, 'stream'):
      if stream:
        break

  duration = stream.get('duration')
  if duration is not None:
    if util.is_int(duration):
      duration = str(duration)
    else:
      logging.warning('Ignoring duration %r; expected int, got %s',
                      duration, duration.__class__)
      duration = None

  sizes = []
  size = stream.get('size')
  if size:
    sizes = [str(size)]

  # construct mf2!
  ret = {
    'type': (AS_TO_MF2_TYPE.get(obj_type) or
             ([entry_class] if isinstance(entry_class, str)
              else list(entry_class))),
    'properties': {
      'uid': [obj.get('id') or ''],
      'numeric-id': [obj.get('numeric_id') or ''],
      'name': [name],
      'nickname': [obj.get('username') or ''],
      'summary': [summary],
      'url': (list(object_urls(obj) or object_urls(primary)) +
              obj.get('upstreamDuplicates', [])),
      # photo is special cased below, to handle alt
      'video': dedupe_urls(get_urls(attachments, 'video', 'stream') +
                           get_urls(primary, 'stream')),
      'audio': get_urls(attachments, 'audio', 'stream'),
      'duration': [duration],
      'size': sizes,
      'published': [obj.get('published', primary.get('published', ''))],
      'updated': [obj.get('updated', primary.get('updated', ''))],
      'in-reply-to': util.trim_nulls([o.get('url') for o in in_reply_tos]),
      'author': [object_to_json(
        author, trim_nulls=False, default_object_type='person')],
      'location': [object_to_json(
        primary.get('location', {}), trim_nulls=False,
        default_object_type='place')],
      'comment': [object_to_json(c, trim_nulls=False, entry_class='h-cite')
                  for c in obj.get('replies', {}).get('items', [])],
      'start': [primary.get('startTime')],
      'end': [primary.get('endTime')],
    },
    'children': (
      # silly hack: i haven't found anywhere in AS1 or AS2 to indicate that
      # something is being "quoted," like in a quote tweet, so i cheat and use
      # extra knowledge here that quoted tweets are converted to note
      # attachments, but URLs in the tweet text are converted to article tags.
      [object_to_json(a, trim_nulls=False, entry_class=['u-quotation-of', 'h-cite'])
       for a in attachments['note'] if 'startIndex' not in a] +
      [object_to_json(a, trim_nulls=False, entry_class=['h-cite'])
       for a in attachments['article'] if 'startIndex' not in a])
  }

  # content. emulate e- vs p- microformats2 parsing: e- if there are HTML tags,
  # otherwise p-.
  # https://indiewebcamp.com/note#Indieweb_whitespace_thinking
  text = xml.sax.saxutils.unescape(primary.get('content', ''))
  html = render_content(primary, include_location=False,
                        synthesize_content=synthesize_content)
  if '<' in html:
    ret['properties']['content'] = [{'value': text, 'html': html}]
  else:
    ret['properties']['content'] = [text]

  # photos, including alt text
  photo_urls = set()
  ret['properties']['photo'] = []
  for image in get_list(attachments, 'image') + [primary]:
    for url in get_urls(image, 'image'):
      if url and url not in photo_urls:
        photo_urls.add(url)
        name = get_first(image, 'image', {}).get('displayName')
        ret['properties']['photo'].append({'value': url, 'alt': name} if name else url)

  # hashtags and person tags
  if obj_type == 'tag':
    ret['properties']['tag-of'] = util.get_urls(obj, 'target')

  tags = obj.get('tags', []) or get_first(obj, 'object', {}).get('tags', [])
  if not tags and obj_type == 'tag':
    tags = util.get_list(obj, 'object')
  ret['properties']['category'] = []
  for tag in tags:
    if tag.get('objectType') == 'person':
      ret['properties']['category'].append(
        object_to_json(tag, entry_class='u-category h-card'))
    elif tag.get('objectType') == 'hashtag' or obj_type == 'tag':
      name = tag.get('displayName')
      if name:
        ret['properties']['category'].append(name)

  # rsvp
  if is_rsvp:
    ret['properties']['rsvp'] = [obj_type[len('rsvp-'):]]
  elif obj_type == 'invite':
    invitee = object_to_json(obj.get('object'), trim_nulls=False,
                             default_object_type='person')
    ret['properties']['invitee'] = [invitee]

  # like and repost mentions
  for type, prop in (
      ('favorite', 'like'),
      ('follow', 'follow'),
      ('like', 'like'),
      ('share', 'repost'),
  ):
    if obj_type == type:
      # The ActivityStreams spec says the object property should always be a
      # single object, but it's useful to let it be a list, e.g. when a like has
      # multiple targets, e.g. a like of a post with original post URLs in it,
      # which brid.gy does.
      objs = get_list(obj, 'object')
      ret['properties'][prop + '-of'] = [
        # flatten contexts that are just a url
        o['url'] if 'url' in o and set(o.keys()) <= set(['url', 'objectType'])
        else object_to_json(o, trim_nulls=False, entry_class='h-cite')
        for o in objs]
    else:
      # received likes and reposts
      ret['properties'][prop] = [
        object_to_json(t, trim_nulls=False, entry_class='h-cite')
        for t in tags if source.object_type(t) == type]

  # latitude & longitude
  lat = long = None
  position = ISO_6709_RE.match(primary.get('position') or '')
  if position:
    lat, long = position.groups()
  if not lat:
    lat = primary.get('latitude')
  if not long:
    long = primary.get('longitude')

  if lat:
    ret['properties']['latitude'] = [str(lat)]
  if long:
    ret['properties']['longitude'] = [str(long)]

  if trim_nulls:
    ret = util.trim_nulls(ret)
  return ret
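
A minimal illustration (output abridged and approximate):

note = {
  'objectType': 'note',
  'content': 'A simple note',
  'url': 'http://example.com/note',
  'published': '2019-01-01T00:00:00Z',
}
# object_to_json(note) returns roughly:
# {'type': ['h-entry'],
#  'properties': {'content': ['A simple note'],
#                 'url': ['http://example.com/note'],
#                 'published': ['2019-01-01T00:00:00Z']}}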
Example 18
  def upload_video(self, url):
    """Uploads a video from web URLs using the chunked upload process.

    Chunked upload consists of multiple API calls:
    * command=INIT, which allocates the media id
    * command=APPEND for each 5MB block, up to 15MB total
    * command=FINALIZE

    https://dev.twitter.com/rest/reference/post/media/upload-chunked
    https://dev.twitter.com/rest/public/uploading-media#chunkedupload

    Args:
      url: string URL of the video

    Returns: string media id or CreationResult on error
    """
    video_resp = util.urlopen(url)

    # check format and size
    type = video_resp.headers.get('Content-Type')
    if not type:
      type, _ = mimetypes.guess_type(url)
    if type and type not in VIDEO_MIME_TYPES:
      msg = 'Twitter only supports MP4 videos; yours looks like a %s.' % type
      return source.creation_result(abort=True, error_plain=msg, error_html=msg)

    length = video_resp.headers.get('Content-Length')
    if not util.is_int(length):
      msg = "Couldn't determine your video's size."
      return source.creation_result(abort=True, error_plain=msg, error_html=msg)

    length = int(length)
    if length > MAX_VIDEO_SIZE:
      msg = "Your %sMB video is larger than Twitter's %dMB limit." % (
        length // MB, MAX_VIDEO_SIZE // MB)
      return source.creation_result(abort=True, error_plain=msg, error_html=msg)

    # INIT
    media_id = self.urlopen(API_UPLOAD_MEDIA, data=urllib.urlencode({
      'command': 'INIT',
      'media_type': 'video/mp4',
      'total_bytes': length,
    }))['media_id_string']

    # APPEND
    headers = twitter_auth.auth_header(
      API_UPLOAD_MEDIA, self.access_token_key, self.access_token_secret, 'POST')

    i = 0
    while True:
      chunk = util.FileLimiter(video_resp, UPLOAD_CHUNK_SIZE)
      data = {
        'command': 'APPEND',
        'media_id': media_id,
        'segment_index': i,
      }
      resp = util.requests_post(API_UPLOAD_MEDIA, data=data,
                                files={'media': chunk}, headers=headers)
      resp.raise_for_status()

      if chunk.ateof:
        break
      i += 1

    # FINALIZE
    self.urlopen(API_UPLOAD_MEDIA, data=urllib.urlencode({
      'command': 'FINALIZE',
      'media_id': media_id,
    }))

    return media_id
Example 19
def json_to_object(mf2, actor=None, fetch_mf2=False):
  """Converts a single microformats2 JSON item to an ActivityStreams object.

  Supports h-entry, h-event, h-card, and other single item types. Does *not* yet
  support h-feed.

  Args:
    mf2: dict, decoded JSON microformats2 object
    actor: optional author AS actor object. usually comes from a rel="author"
      link. if mf2 has its own author, that will override this.
    fetch_mf2: boolean, whether to fetch additional pages via HTTP if necessary,
      e.g. to determine authorship: https://indieweb.org/authorship

  Returns:
    dict, ActivityStreams object
  """
  if not mf2 or not isinstance(mf2, dict):
    return {}

  mf2 = copy.copy(mf2)
  props = mf2.setdefault('properties', {})
  prop = first_props(props)
  rsvp = prop.get('rsvp')

  # convert author
  mf2_author = prop.get('author')
  if mf2_author and isinstance(mf2_author, dict):
    author = json_to_object(mf2_author)
  else:
    # the author h-card may be on another page. run full authorship algorithm:
    # https://indieweb.org/authorship
    author = mf2util.find_author({'items': [mf2]}, hentry=mf2,
                                 fetch_mf2_func=util.fetch_mf2 if fetch_mf2 else None)
    if author:
      author = {
        'objectType': 'person',
        'url': author.get('url'),
        'displayName': author.get('name'),
        'image': [{'url': author.get('photo')}],
      }

  if not author:
    author = actor

  mf2_types = mf2.get('type') or []
  if 'h-geo' in mf2_types or 'p-location' in mf2_types:
    mf2_type = 'location'
  elif 'tag-of' in props:
    # TODO: remove once this is in mf2util
    # https://github.com/kylewm/mf2util/issues/18
    mf2_type = 'tag'
  elif 'follow-of' in props: # ditto
    mf2_type = 'follow'
  else:
    # mf2 'photo' type is a note or article *with* a photo, but AS 'photo' type
    # *is* a photo. so, special case photo type to fall through to underlying
    # mf2 type without photo.
    # https://github.com/snarfed/bridgy/issues/702
    without_photo = copy.deepcopy(mf2)
    without_photo.get('properties', {}).pop('photo', None)
    mf2_type = mf2util.post_type_discovery(without_photo)

  as_type, as_verb = MF2_TO_AS_TYPE_VERB.get(mf2_type, (None, None))
  if rsvp:
    as_verb = 'rsvp-%s' % rsvp

  # special case GitHub issues that are in-reply-to the repo or its issues URL
  in_reply_tos = get_string_urls(props.get('in-reply-to', []))
  for url in in_reply_tos:
    if re.match(r'^https?://github.com/[^/]+/[^/]+(/issues)?/?$', url):
      as_type = 'issue'

  def is_absolute(url):
    """Filter out relative and invalid URLs (mf2py gives absolute urls)."""
    return urllib.parse.urlparse(url).netloc

  urls = props.get('url') and get_string_urls(props.get('url'))

  # quotations: https://indieweb.org/quotation#How_to_markup
  attachments = [
    json_to_object(quote)
    for quote in mf2.get('children', []) + props.get('quotation-of', [])
    if isinstance(quote, dict) and 'h-cite' in set(quote.get('type', []))]

  # audio and video
  #
  # the duration mf2 property is still emerging. examples in the wild use both
  # integer seconds and ISO 8601 durations.
  # https://indieweb.org/duration
  # https://en.wikipedia.org/wiki/ISO_8601#Durations
  duration = prop.get('duration') or prop.get('length')
  if duration:
    if util.is_int(duration):
      duration = int(duration)
    else:
      parsed = util.parse_iso8601_duration(duration)
      if parsed:
        duration = int(parsed.total_seconds())
      else:
        logging.debug('Unknown format for length or duration %r', duration)
        duration = None


  stream = None
  bytes = size_to_bytes(prop.get('size'))
  for type in 'audio', 'video':
    atts = [{
      'objectType': type,
      'stream': {
        'url': url,
        # integer seconds: http://activitystrea.ms/specs/json/1.0/#media-link
        'duration': duration,
        # file size in bytes. nonstandard, not in AS1 or AS2
        'size': bytes,
      },
    } for url in get_string_urls(props.get(type, []))]
    attachments.extend(atts)
    if atts:
      stream = atts[0]['stream']

  obj = {
    'id': prop.get('uid'),
    'objectType': as_type,
    'verb': as_verb,
    'published': prop.get('published', ''),
    'updated': prop.get('updated', ''),
    'startTime': prop.get('start'),
    'endTime': prop.get('end'),
    'displayName': get_text(prop.get('name')),
    'summary': get_text(prop.get('summary')),
    'content': get_html(prop.get('content')),
    'url': urls[0] if urls else None,
    'urls': [{'value': u} for u in urls] if urls and len(urls) > 1 else None,
    # image is special cased below, to handle alt
    'stream': [stream],
    'location': json_to_object(prop.get('location')),
    'replies': {'items': [json_to_object(c) for c in props.get('comment', [])]},
    'tags': [{'objectType': 'hashtag', 'displayName': cat}
             if isinstance(cat, str)
             else json_to_object(cat)
             for cat in props.get('category', [])],
    'attachments': attachments,
  }

  # images, including alt text
  photo_urls = set()
  obj['image'] = []
  for photo in props.get('photo', []) + props.get('featured', []):
    url = photo
    alt = None
    if isinstance(photo, dict):
      photo = photo.get('properties') or photo
      url = get_first(photo, 'value') or get_first(photo, 'url')
      alt = get_first(photo, 'alt')
    if url and url not in photo_urls and is_absolute(url):
      photo_urls.add(url)
      obj['image'].append({'url': url, 'displayName': alt})

  # mf2util uses the indieweb/mf2 location algorithm to collect location properties.
  interpreted = mf2util.interpret({'items': [mf2]}, None)
  if interpreted:
    loc = interpreted.get('location')
    if loc:
      obj['location']['objectType'] = 'place'
      lat, lng = loc.get('latitude'), loc.get('longitude')
      if lat and lng:
        try:
          obj['location'].update({
            'latitude': float(lat),
            'longitude': float(lng),
          })
        except ValueError:
          logging.debug(
            'Could not convert latitude/longitude (%s, %s) to decimal', lat, lng)

  if as_type == 'activity':
    objects = []
    for target in itertools.chain.from_iterable(
        props.get(field, []) for field in (
          'follow-of', 'like', 'like-of', 'repost', 'repost-of', 'in-reply-to',
          'invitee')):
      t = json_to_object(target) if isinstance(target, dict) else {'url': target}
      # eliminate duplicates from redundant backcompat properties
      if t not in objects:
        objects.append(t)
    obj.update({
      'object': objects[0] if len(objects) == 1 else objects,
      'actor': author,
    })
    if as_verb == 'tag':
      obj['target'] = {'url': prop['tag-of']}
      if obj.get('object'):
        raise NotImplementedError(
          'Combined in-reply-to and tag-of is not yet supported.')
      obj['object'] = obj.pop('tags')
  else:
    obj.update({
      'inReplyTo': [{'url': url} for url in in_reply_tos],
      'author': author,
    })

  return source.Source.postprocess_object(obj)
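
And the reverse direction (output abridged and approximate):

mf2 = {
  'type': ['h-entry'],
  'properties': {
    'content': [{'html': 'A simple note', 'value': 'A simple note'}],
    'url': ['http://example.com/note'],
    'published': ['2019-01-01T00:00:00Z'],
  },
}
# json_to_object(mf2) returns roughly:
# {'objectType': 'note', 'content': 'A simple note',
#  'url': 'http://example.com/note', 'published': '2019-01-01T00:00:00Z'}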
Example 20
  def get_activities_response(self, user_id=None, group_id=None, app_id=None,
                              activity_id=None, start_index=0, count=0,
                              etag=None, min_id=None, cache=None,
                              fetch_replies=False, fetch_likes=False,
                              fetch_shares=False, fetch_events=False,
                              fetch_mentions=False, search_query=None,
                              public_only=True, **kwargs):
    """Fetches issues and comments and converts them to ActivityStreams activities.

    See :meth:`Source.get_activities_response` for details.

    *Not comprehensive!* Uses the notifications API (v3 REST).

    Also note that start_index and count are not currently supported.

    https://developer.github.com/v3/activity/notifications/
    https://developer.github.com/v3/issues/
    https://developer.github.com/v3/issues/comments/

    fetch_likes determines whether emoji reactions are fetched:
    https://help.github.com/articles/about-conversations-on-github#reacting-to-ideas-in-comments

    The notifications API call supports Last-Modified/If-Modified-Since headers
    and 304 Not Changed responses. If provided, etag should be an RFC2822
    timestamp, usually the exact value returned in a Last-Modified header. It
    will also be passed to the comments API endpoint as the since= value
    (converted to ISO 8601).
    """
    if fetch_shares or fetch_events or fetch_mentions or search_query:
      raise NotImplementedError()

    since = None
    etag_parsed = email.utils.parsedate(etag)
    if etag_parsed:
      since = datetime.datetime(*etag_parsed[:6])

    activities = []

    if activity_id:
      parts = tuple(activity_id.split(':'))
      if len(parts) != 3:
        raise ValueError('GitHub activity ids must be of the form USER:REPO:ISSUE_OR_PR')
      try:
        issue = self.rest(REST_API_ISSUE % parts)
        activities = [self.issue_to_object(issue)]
      except BaseException as e:
        code, body = util.interpret_http_exception(e)
        if util.is_int(code) and int(code) in HTTP_NON_FATAL_CODES:
          activities = []
        else:
          raise

    else:
      resp = self.rest(REST_API_NOTIFICATIONS, parse_json=False,
                       headers={'If-Modified-Since': etag} if etag else None)
      etag = resp.headers.get('Last-Modified')
      notifs = [] if resp.status_code == 304 else json_loads(resp.text)

      for notif in notifs:
        id = notif.get('id')
        subject_url = notif.get('subject').get('url')
        if not subject_url:
          logging.info('Skipping thread %s, missing subject!', id)
          continue
        split = subject_url.split('/')
        if len(split) <= 2 or split[-2] not in ('issues', 'pulls'):
          logging.info(
            'Skipping thread %s with subject %s, only issues and PRs right now',
            id, subject_url)
          continue

        try:
          issue = self.rest(subject_url)
        except requests.HTTPError as e:
          if e.response.status_code in HTTP_NON_FATAL_CODES:
            util.interpret_http_exception(e)
            continue
          raise

        obj = self.issue_to_object(issue)

        private = notif.get('repository', {}).get('private')
        if private is not None:
          obj['to'] = [{
            'objectType': 'group',
            'alias': '@private' if private else '@public',
          }]

        comments_url = issue.get('comments_url')
        if fetch_replies and comments_url:
          if since:
            comments_url += '?since=%s' % since.isoformat() + 'Z'
          comments = self.rest(comments_url)
          comment_objs = list(util.trim_nulls(
            self.comment_to_object(c) for c in comments))
          obj['replies'] = {
            'items': comment_objs,
            'totalItems': len(comment_objs),
          }

        if fetch_likes:
          issue_url = issue['url'].replace('pulls', 'issues')
          reactions = self.rest(issue_url + '/reactions')
          obj.setdefault('tags', []).extend(
            self.reaction_to_object(r, obj) for r in reactions)

        activities.append(obj)

    response = self.make_activities_base_response(util.trim_nulls(activities))
    response['etag'] = etag
    return response
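
How the etag round-trips through email.utils (standalone and runnable; the timestamp is illustrative):

import datetime
import email.utils

etag = 'Tue, 15 Nov 1994 12:45:26 GMT'
parsed = email.utils.parsedate(etag)
since = datetime.datetime(*parsed[:6])
print(since.isoformat() + 'Z')  # 1994-11-15T12:45:26Z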