Ejemplo n.º 1
0
    def post(self, namespace):
        """Reads body with items to upload and replies with URLs to upload to.

        Arguments:
          namespace: namespace name; it must fully match model.NAMESPACE_RE.

        The JSON reply is a list with one element per parsed entry, in request
        order: the value returned by generate_push_urls() for an entry that is
        missing, or None for an entry that already exists.
        """
        if not re.match(r"^%s$" % model.NAMESPACE_RE, namespace):
            # Return here, exactly like the parse-failure path below, so that an
            # invalid namespace is never processed further even if send_error()
            # returns instead of raising.
            return self.send_error('Invalid namespace; allowed keys must pass regexp "%s"' % model.NAMESPACE_RE)

        # Parse a body into list of EntryInfo objects.
        try:
            entries = self.parse_request(self.request.body)
        except ValueError as err:
            return self.send_error("Bad /pre-upload request.\n(%s)\n%s" % (err, self.request.body[:200]))

        # Generate push_urls for missing entries.
        push_urls = {}
        existing = []
        for entry_info, exists in self.check_entry_infos(entries, namespace):
            if exists:
                existing.append(entry_info)
            else:
                push_urls[entry_info.digest] = self.generate_push_urls(entry_info, namespace)

        # Send back the response. Entries that already exist have no push URLs
        # and therefore map to None.
        self.send_json([push_urls.get(entry_info.digest) for entry_info in entries])

        # Log stats, enqueue tagging task that updates last access time.
        stats.add_entry(stats.LOOKUP, len(entries), len(existing))
        if existing:
            # Ignore errors in a call below. They happen when task queue service has
            # a bad time and doesn't accept tagging tasks. We don't want isolate
            # server's reliability to depend on task queue service health. An ignored
            # error here means there's a chance some entry might be deleted sooner
            # than it should.
            self.tag_entries(existing, namespace)
Ejemplo n.º 2
0
    def retrieve(self, request):
        """Return an entry's content inline, or a signed URL to download it.

        The lookup tries memcache first and falls back to the stored entity.
        When content is available it is returned sliced from `request.offset`;
        when the stored entity carries no inline content a download URL is
        returned instead.

        Arguments:
          request: message exposing `digest`, `namespace.namespace` and
              `offset`.

        Returns:
          RetrievedContent with either `content` or `url` set.

        Raises:
          endpoints.BadRequestException: digest or namespace is missing, or the
              offset is negative or larger than the content size.
          endpoints.NotFoundException: no memcache hit and no stored entity.
        """
        offset = request.offset
        if not request.digest:
            raise endpoints.BadRequestException('digest is required.')
        if not request.namespace:
            raise endpoints.BadRequestException('namespace is required.')

        digest = request.digest
        namespace = request.namespace.namespace

        # Memcache first; only a miss touches the stored entity.
        entity_key = None
        stored = None
        cached = memcache.get(digest, namespace='table_%s' % namespace)
        if cached is None:
            entity_key = entry_key_or_error(namespace, digest)
            stored = entity_key.get()
            if stored is None:
                logging.debug('%s', digest)
                raise endpoints.NotFoundException(
                    'Unable to retrieve the entry.')
            # stored.content is None when the entity's data is in GCS.
            payload, source = stored.content, 'inline'
        else:
            payload, source = cached, 'memcache'

        if payload is None:
            # The data is in GS: validate the offset against the stored size,
            # log stats and hand back a signed URL.
            total = stored.compressed_size
            if offset < 0 or offset > total:
                logging.debug('%s', digest)
                raise endpoints.BadRequestException(
                    'Invalid offset %d. Offset must be between 0 and content length.'
                    % offset)
            remaining = total - offset
            metrics.file_size(remaining)
            stats.add_entry(
                stats.RETURN, remaining, 'GS; %s' % stored.key.id())
            signed_url = self.gs_url_signer.get_download_url(
                filename=entity_key.id(), expiration=DEFAULT_LINK_EXPIRATION)
            return RetrievedContent(url=signed_url)

        # Content is at hand: validate the offset, log stats and return it.
        logging.debug('%s', digest)
        available = len(payload)
        if offset < 0 or offset > available:
            raise endpoints.BadRequestException(
                'Invalid offset %d. Offset must be between 0 and content length.'
                % offset)
        stats.add_entry(stats.RETURN, available - offset, source)
        return RetrievedContent(content=payload[offset:])
  def retrieve(self, request):
    """Return an entry's content inline, or a signed URL to download it.

    The lookup tries memcache first and falls back to the stored entity. When
    content is available it is returned sliced from `request.offset`; when the
    stored entity carries no inline content a download URL is returned.

    Arguments:
      request: message exposing `digest`, `namespace.namespace` and `offset`.

    Returns:
      RetrievedContent with either `content` or `url` set.

    Raises:
      endpoints.NotFoundException: no memcache hit and no stored entity.
      endpoints.BadRequestException: the offset is negative or larger than the
          content size.
    """
    offset = request.offset
    digest = request.digest
    namespace = request.namespace.namespace

    # Memcache first; only a miss touches the stored entity.
    entity_key = None
    stored = None
    cached = memcache.get(digest, namespace='table_%s' % namespace)
    if cached is None:
      entity_key = entry_key_or_error(namespace, digest)
      stored = entity_key.get()
      if stored is None:
        logging.debug('%s', digest)
        raise endpoints.NotFoundException('Unable to retrieve the entry.')
      # stored.content is None when the entity's data is in GCS.
      payload, source = stored.content, 'inline'
    else:
      payload, source = cached, 'memcache'

    if payload is None:
      # The data is in GS: validate the offset against the stored size, log
      # stats and hand back a signed URL.
      total = stored.compressed_size
      if offset < 0 or offset > total:
        logging.debug('%s', digest)
        raise endpoints.BadRequestException(
            'Invalid offset %d. Offset must be between 0 and content length.' %
            offset)
      stats.add_entry(
          stats.RETURN, total - offset, 'GS; %s' % stored.key.id())
      signed_url = self.gs_url_signer.get_download_url(
          filename=entity_key.id(), expiration=DEFAULT_LINK_EXPIRATION)
      return RetrievedContent(url=signed_url)

    # Content is at hand: validate the offset, log stats and return it.
    logging.debug('%s', digest)
    available = len(payload)
    if offset < 0 or offset > available:
      raise endpoints.BadRequestException(
          'Invalid offset %d. Offset must be between 0 and content length.' %
          offset)
    stats.add_entry(stats.RETURN, available - offset, source)
    return RetrievedContent(content=payload[offset:])
Ejemplo n.º 4
0
    def post(self, namespace):
        """Reads body with items to upload and replies with URLs to upload to.

        Every call is logged as an error ('Unexpected old client') before any
        processing happens.

        Arguments:
          namespace: namespace name; it must fully match model.NAMESPACE_RE.

        The JSON reply is a list with one element per parsed entry, in request
        order: the value returned by generate_push_urls() for an entry that is
        missing, or None for an entry that already exists.
        """
        logging.error('Unexpected old client')
        if not re.match(r'^%s$' % model.NAMESPACE_RE, namespace):
            # Return here, exactly like the parse-failure path below, so that an
            # invalid namespace is never processed further even if send_error()
            # returns instead of raising.
            return self.send_error(
                'Invalid namespace; allowed keys must pass regexp "%s"' %
                model.NAMESPACE_RE)

        # Parse a body into list of EntryInfo objects.
        try:
            entries = self.parse_request(self.request.body)
        except ValueError as err:
            return self.send_error('Bad /pre-upload request.\n(%s)\n%s' %
                                   (err, self.request.body[:200]))

        # Generate push_urls for missing entries.
        push_urls = {}
        existing = []
        for entry_info, exists in self.check_entry_infos(entries, namespace):
            if exists:
                existing.append(entry_info)
            else:
                push_urls[entry_info.digest] = self.generate_push_urls(
                    entry_info, namespace)

        # Send back the response. Entries that already exist have no push URLs
        # and therefore map to None.
        self.send_json(
            [push_urls.get(entry_info.digest) for entry_info in entries])

        # Log stats, enqueue tagging task that updates last access time.
        stats.add_entry(stats.LOOKUP, len(entries), len(existing))
        if existing:
            # Ignore errors in a call below. They happen when task queue service has
            # a bad time and doesn't accept tagging tasks. We don't want isolate
            # server's reliability to depend on task queue service health. An ignored
            # error here means there's a chance some entry might be deleted sooner
            # than it should.
            self.tag_entries(existing, namespace)
Ejemplo n.º 5
0
    def get(self, namespace, hash_key):  #pylint: disable=W0221
        """Serve an entry's data, or redirect to a signed download URL.

        Every call is logged as an error ('Unexpected old client'). Data found
        in memcache or stored inline on the entity is sent directly; otherwise
        the client is redirected to a signed URL for the entry's key id.

        Arguments:
          namespace: namespace of the entry.
          hash_key: key of the entry; also used as the download filename.

        An optional 'Range' header of the form 'bytes=<offset>-' selects the
        starting offset; any other form is rejected with HTTP 416. A missing
        entry is reported with HTTP 404.
        """
        logging.error('Unexpected old client')

        # Only a single continuous range from some start offset to the end is
        # supported.
        start = 0
        requested_range = self.request.headers.get('range')
        if requested_range:
            parsed = re.match(r'bytes=(\d+)-', requested_range)
            if parsed is None:
                return self.send_error(
                    'Unsupported byte range.\n\'%s\'.' % requested_range,
                    http_code=416)
            start = int(parsed.group(1))

        # First choice: memcache.
        cached = memcache.get(hash_key, namespace='table_%s' % namespace)
        if cached is not None:
            self.send_data(cached, filename=hash_key, offset=start)
            stats.add_entry(stats.RETURN, len(cached) - start, 'memcache')
            return

        # Second choice: the stored entity.
        entry = model.get_entry_key(namespace, hash_key).get()
        if not entry:
            return self.send_error(
                'Unable to retrieve the entry.', http_code=404)

        inline_data = entry.content
        if inline_data is not None:
            self.send_data(inline_data, filename=hash_key, offset=start)
            stats.add_entry(
                stats.RETURN, len(inline_data) - start, 'inline')
            return

        # No inline data: generate a signed download URL.
        # TODO(maruel): The GS object may not exist anymore. Handle this.
        settings = config.settings()
        url_signer = gcs.URLSigner(
            settings.gs_bucket,
            settings.gs_client_id_email,
            settings.gs_private_key)
        # The entry key is the GS filepath.
        download_url = url_signer.get_download_url(entry.key.id())

        # Redirect client to this URL. If 'Range' header is used, client will
        # correctly pass it to Google Storage to fetch only subrange of file,
        # so update stats accordingly.
        self.redirect(download_url)
        stats.add_entry(
            stats.RETURN,
            entry.compressed_size - start,
            'GS; %s' % entry.key.id())
Ejemplo n.º 6
0
  def get(self, namespace, hash_key):  #pylint: disable=W0221
    """Serve an entry's data, or redirect to a signed download URL.

    Every call is logged as an error ('Unexpected old client'). Data found in
    memcache or stored inline on the entity is sent directly; otherwise the
    client is redirected to a signed URL for the entry's key id.

    Arguments:
      namespace: namespace of the entry.
      hash_key: key of the entry; also used as the download filename.

    An optional 'Range' header of the form 'bytes=<offset>-' selects the
    starting offset; any other form is rejected with HTTP 416. A missing entry
    is reported with HTTP 404.
    """
    logging.error('Unexpected old client')

    # Only a single continuous range from some start offset to the end is
    # supported.
    start = 0
    requested_range = self.request.headers.get('range')
    if requested_range:
      parsed = re.match(r'bytes=(\d+)-', requested_range)
      if parsed is None:
        return self.send_error(
            'Unsupported byte range.\n\'%s\'.' % requested_range,
            http_code=416)
      start = int(parsed.group(1))

    # First choice: memcache.
    cached = memcache.get(hash_key, namespace='table_%s' % namespace)
    if cached is not None:
      self.send_data(cached, filename=hash_key, offset=start)
      stats.add_entry(stats.RETURN, len(cached) - start, 'memcache')
      return

    # Second choice: the stored entity.
    entry = model.get_entry_key(namespace, hash_key).get()
    if not entry:
      return self.send_error('Unable to retrieve the entry.', http_code=404)

    inline_data = entry.content
    if inline_data is not None:
      self.send_data(inline_data, filename=hash_key, offset=start)
      stats.add_entry(stats.RETURN, len(inline_data) - start, 'inline')
      return

    # No inline data: generate a signed download URL.
    # TODO(maruel): The GS object may not exist anymore. Handle this.
    settings = config.settings()
    url_signer = gcs.URLSigner(
        settings.gs_bucket,
        settings.gs_client_id_email,
        settings.gs_private_key)
    # The entry key is the GS filepath.
    download_url = url_signer.get_download_url(entry.key.id())

    # Redirect client to this URL. If 'Range' header is used, client will
    # correctly pass it to Google Storage to fetch only subrange of file,
    # so update stats accordingly.
    self.redirect(download_url)
    stats.add_entry(
        stats.RETURN,
        entry.compressed_size - start,
        'GS; %s' % entry.key.id())
Ejemplo n.º 7
0
    def storage_helper(request, uploaded_to_gs):
        """Record an uploaded entry; shared by store_inline and finalize_gs.

        Arguments:
          request: either StorageRequest or FinalizeRequest.
          uploaded_to_gs: bool; True when the data is expected in Google
              Storage, False when it is carried in request.content.

        Returns:
          PushPing(ok=True) once the entity has been stored.

        Raises:
          endpoints.BadRequestException: the upload ticket is empty or fails
              validation, the file is missing from Google Storage, or the
              inline content does not match the digest and size embedded in
              the ticket.
          endpoints.InternalServerErrorException: storing the entity (or
              enqueueing its verification task) failed.
        """
        ticket = request.upload_ticket
        if not ticket:
            raise endpoints.BadRequestException(
                'Upload ticket was empty or not provided.')
        try:
            fields = TokenSigner.validate(
                ticket, UPLOAD_MESSAGES[uploaded_to_gs])
        except (auth.InvalidTokenError, ValueError) as error:
            # NOTE(review): `.message` is a Python 2 exception attribute.
            raise endpoints.BadRequestException(
                'Ticket validation failed: %s' % error.message)

        # Decode the values embedded in the ticket.
        hex_digest = fields['d'].encode('utf-8')
        isolated = bool(int(fields['i']))
        ns = fields['n']
        expanded = int(fields['s'])
        entry_key = entry_key_or_error(ns, hex_digest)

        # Work out the payload and its stored size.
        if not uploaded_to_gs:
            payload = request.content
            stored_size = len(payload)
        else:
            # The file must already be present in GS.
            gs_info = gcs.get_file_info(
                config.settings().gs_bucket, entry_key.id())
            if not gs_info:
                logging.debug('%s', hex_digest)
                raise endpoints.BadRequestException(
                    'File should be in Google Storage.\nFile: \'%s\' Size: %d.'
                    % (entry_key.id(), expanded))
            payload = None
            stored_size = gs_info.size

        # An already stored entity is only reported; it gets overwritten.
        if entry_key.get():
            # TODO(maruel): Handle these more gracefully.
            logging.warning('Overwritting ContentEntry\n%s', hex_digest)

        entry = model.new_content_entry(
            key=entry_key,
            is_isolated=isolated,
            compressed_size=stored_size,
            expanded_size=expanded,
            is_verified=not uploaded_to_gs,
            content=payload,
        )

        if uploaded_to_gs:
            # Store the entity and enqueue its verification task together.
            try:
                store_and_enqueue_verify_task(
                    entry, utils.get_task_queue_host())
            except (datastore_errors.Error,
                    runtime.apiproxy_errors.CancelledError,
                    runtime.apiproxy_errors.DeadlineExceededError,
                    runtime.apiproxy_errors.OverQuotaError,
                    runtime.DeadlineExceededError,
                    taskqueue.Error) as e:
                raise endpoints.InternalServerErrorException(
                    'Unable to store the entity: %s.' % e.__class__.__name__)
            where = 'GS; %s' % entry.key.id()
        else:
            # The inline data must hash to what the ticket promised.
            logging.debug('%s', hex_digest)
            if (hex_digest, expanded) != hash_content(payload, ns):
                raise endpoints.BadRequestException(
                    'Embedded digest does not match provided data: '
                    '(digest, size): (%r, %r); expected: %r' %
                    (hex_digest, expanded, hash_content(payload, ns)))
            try:
                entry.put()
            except datastore_errors.Error as e:
                raise endpoints.InternalServerErrorException(
                    'Unable to store the entity: %s.' % e.__class__.__name__)
            where = 'inline'

        stats.add_entry(stats.STORE, entry.compressed_size, where)
        return PushPing(ok=True)
Ejemplo n.º 8
0
 def get(self):
     """Log one fake DUPE stats entry (1024, 'inline') and reply 'Yay'."""
     action, amount, where = stats.DUPE, 1024, 'inline'
     stats.add_entry(action, amount, where)
     self.response.write('Yay')
Ejemplo n.º 9
0
 def get(self):
     """Log one fake LOOKUP stats entry (200, 103) and reply 'Yay'."""
     action, amount, where = stats.LOOKUP, 200, 103
     stats.add_entry(action, amount, where)
     self.response.write('Yay')
Ejemplo n.º 10
0
 def get(self):
     """Log one fake RETURN stats entry (4096, 'memcache') and reply 'Yay'."""
     action, amount, where = stats.RETURN, 4096, 'memcache'
     stats.add_entry(action, amount, where)
     self.response.write('Yay')
Ejemplo n.º 11
0
 def get(self):
     """Log one fake STORE stats entry (2048, 'GS; inline') and reply 'Yay'."""
     action, amount, where = stats.STORE, 2048, 'GS; inline'
     stats.add_entry(action, amount, where)
     self.response.write('Yay')
Ejemplo n.º 12
0
  def handle(self, namespace, hash_key):
    """Handles this request."""
    # Extract relevant request parameters.
    expiration_ts = self.request.get('x')
    item_size = self.request.get('s')
    is_isolated = self.request.get('i')
    uploaded_to_gs = self.request.get('g')
    signature = self.request.get('sig')

    # Build correct signature.
    expected_sig = self.generate_signature(
        config.settings().global_secret, self.request.method, expiration_ts,
        namespace, hash_key, item_size, is_isolated, uploaded_to_gs)

    # Verify signature is correct.
    if not utils.constant_time_equals(signature, expected_sig):
      return self.send_error('Incorrect signature.')

    # Convert parameters from strings back to something useful.
    # It can't fail since matching signature means it was us who generated
    # this strings in a first place.
    expiration_ts = int(expiration_ts)
    item_size = int(item_size)
    is_isolated = bool(int(is_isolated))
    uploaded_to_gs = bool(int(uploaded_to_gs))

    # Verify signature is not yet expired.
    if time.time() > expiration_ts:
      return self.send_error('Expired signature.')

    if uploaded_to_gs:
      # GS upload finalization uses empty POST body.
      assert self.request.method == 'POST'
      if self.request.headers.get('content-length'):
        return self.send_error('Expecting empty POST.')
      content = None
    else:
      # Datastore upload uses PUT.
      assert self.request.method == 'PUT'
      if self.request.headers.get('content-length'):
        content = self.request.body
      else:
        content = ''

    # Info about corresponding GS entry (if it exists).
    gs_bucket = config.settings().gs_bucket
    key = model.entry_key(namespace, hash_key)

    # Verify the data while at it since it's already in memory but before
    # storing it in memcache and datastore.
    if content is not None:
      # Verify advertised hash matches the data.
      try:
        hex_digest, expanded_size = hash_content(content, namespace)
        if hex_digest != hash_key:
          raise ValueError(
              'Hash and data do not match, '
              '%d bytes (%d bytes expanded)' % (len(content), expanded_size))
        if expanded_size != item_size:
          raise ValueError(
              'Advertised data length (%d) and actual data length (%d) '
              'do not match' % (item_size, expanded_size))
      except ValueError as err:
        return self.send_error('Inline verification failed.\n%s' % err)
      # Successfully verified!
      compressed_size = len(content)
      needs_verification = False
    else:
      # Fetch size of the stored file.
      file_info = gcs.get_file_info(gs_bucket, key.id())
      if not file_info:
        # TODO(maruel): Do not fail yet. If the request got up to here, the file
        # is likely there but the service may have trouble fetching the metadata
        # from GS.
        return self.send_error(
            'File should be in Google Storage.\nFile: \'%s\' Size: %d.' %
            (key.id(), item_size))
      compressed_size = file_info.size
      needs_verification = True

    # Data is here and it's too large for DS, so put it in GS. It is likely
    # between MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS
    if content is not None and len(content) >= MIN_SIZE_FOR_GS:
      if not gcs.write_file(gs_bucket, key.id(), [content]):
        # Returns 503 so the client automatically retries.
        return self.send_error(
            'Unable to save the content to GS.', http_code=503)
      # It's now in GS.
      uploaded_to_gs = True

    # Can create entity now, everything appears to be legit.
    entry = model.new_content_entry(
        key=key,
        is_isolated=is_isolated,
        compressed_size=compressed_size,
        expanded_size=-1 if needs_verification else item_size,
        is_verified = not needs_verification)

    # If it's not in GS then put it inline.
    if not uploaded_to_gs:
      assert content is not None and len(content) < MIN_SIZE_FOR_GS
      entry.content = content

    # Start saving *.isolated into memcache iff its content is available and
    # it's not in Datastore: there's no point in saving inline blobs in memcache
    # because ndb already memcaches them.
    memcache_store_future = None
    if (content is not None and
        entry.content is None and
        entry.is_isolated and
        entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
      memcache_store_future = model.save_in_memcache(
          namespace, hash_key, content, async=True)

    try:
      # If entry was already verified above (i.e. it is a small inline entry),
      # store it right away, possibly overriding existing entity. Most of
      # the time it is a new entry anyway (since clients try to upload only
      # new entries).
      if not needs_verification:
        entry.put()
      else:
        # For large entries (that require expensive verification) be more
        # careful and check that it is indeed a new entity. No need to do it in
        # transaction: a race condition would lead to redundant verification
        # task enqueued, no big deal.
        existing = entry.key.get()
        if existing:
          if existing.is_verified:
            logging.info('Entity exists and already verified')
          else:
            logging.info('Entity exists, but not yet verified')
        else:
          # New entity. Store it and enqueue verification task, transactionally.
          task_queue_host = utils.get_task_queue_host()
          def run():
            entry.put()
            taskqueue.add(
                url='/internal/taskqueue/verify/%s' % entry.key.id(),
                queue_name='verify',
                headers={'Host': task_queue_host},
                transactional=True)
          datastore_utils.transaction(run)

      # TODO(vadimsh): Fill in details about the entry, such as expiration time.
      self.send_json({'entry': {}})

      # Log stats.
      where = 'GS; ' + 'inline' if entry.content is not None else entry.key.id()
      stats.add_entry(stats.STORE, entry.compressed_size, where)

    finally:
      # Do not keep dangling futures. Note that error here is ignored,
      # memcache is just an optimization.
      if memcache_store_future:
        memcache_store_future.wait()
Ejemplo n.º 13
0
 def get(self):
   """Log one fake STORE stats entry (2048, 'GS; inline') and reply 'Yay'."""
   action, amount, where = stats.STORE, 2048, 'GS; inline'
   stats.add_entry(action, amount, where)
   self.response.write('Yay')
Ejemplo n.º 14
0
  def storage_helper(request, uploaded_to_gs):
    """Record an uploaded entry; shared by store_inline and finalize_gs.

    Arguments:
      request: either StorageRequest or FinalizeRequest.
      uploaded_to_gs: bool; True when the data is expected in Google Storage,
          False when it is carried in request.content.

    Returns:
      PushPing(ok=True) once the entity has been stored.

    Raises:
      endpoints.BadRequestException: the upload ticket is empty or fails
          validation, the file is missing from Google Storage, or the inline
          content does not match the digest and size embedded in the ticket.
      endpoints.InternalServerErrorException: storing the entity together with
          its verification task failed (GS path only; errors from the inline
          entry.put() are not translated here).
    """
    ticket = request.upload_ticket
    if not ticket:
      raise endpoints.BadRequestException(
          'Upload ticket was empty or not provided.')
    try:
      fields = TokenSigner.validate(ticket, UPLOAD_MESSAGES[uploaded_to_gs])
    except (auth.InvalidTokenError, ValueError) as error:
      # NOTE(review): `.message` is a Python 2 exception attribute.
      raise endpoints.BadRequestException(
          'Ticket validation failed: %s' % error.message)

    # Decode the values embedded in the ticket.
    hex_digest = fields['d'].encode('utf-8')
    isolated = bool(int(fields['i']))
    ns = fields['n']
    expanded = int(fields['s'])
    entry_key = entry_key_or_error(ns, hex_digest)

    # Work out the payload and its stored size.
    if not uploaded_to_gs:
      payload = request.content
      stored_size = len(payload)
    else:
      # The file must already be present in GS.
      gs_info = gcs.get_file_info(config.settings().gs_bucket, entry_key.id())
      if not gs_info:
        logging.debug('%s', hex_digest)
        raise endpoints.BadRequestException(
            'File should be in Google Storage.\nFile: \'%s\' Size: %d.' % (
                entry_key.id(), expanded))
      payload = None
      stored_size = gs_info.size

    # An already stored entity is only reported; it gets overwritten.
    if entry_key.get():
      # TODO(maruel): Handle these more gracefully.
      logging.warning('Overwritting ContentEntry\n%s', hex_digest)

    entry = model.new_content_entry(
        key=entry_key,
        is_isolated=isolated,
        compressed_size=stored_size,
        expanded_size=expanded,
        is_verified=not uploaded_to_gs,
        content=payload,
    )

    if uploaded_to_gs:
      # Store the entity and enqueue its verification task together.
      try:
        store_and_enqueue_verify_task(entry, utils.get_task_queue_host())
      except (
          datastore_errors.Error,
          runtime.apiproxy_errors.CancelledError,
          runtime.apiproxy_errors.DeadlineExceededError,
          runtime.apiproxy_errors.OverQuotaError,
          runtime.DeadlineExceededError,
          taskqueue.Error) as e:
        raise endpoints.InternalServerErrorException(
            'Unable to store the entity: %s.' % e.__class__.__name__)
      where = 'GS; %s' % entry.key.id()
    else:
      # The inline data must hash to what the ticket promised.
      logging.debug('%s', hex_digest)
      if (hex_digest, expanded) != hash_content(payload, ns):
        raise endpoints.BadRequestException(
            'Embedded digest does not match provided data: '
            '(digest, size): (%r, %r); expected: %r' % (
                hex_digest, expanded, hash_content(payload, ns)))
      entry.put()
      where = 'inline'

    stats.add_entry(stats.STORE, entry.compressed_size, where)
    return PushPing(ok=True)
Ejemplo n.º 15
0
 def get(self):
   """Log one fake DUPE stats entry (1024, 'inline') and reply 'Yay'."""
   action, amount, where = stats.DUPE, 1024, 'inline'
   stats.add_entry(action, amount, where)
   self.response.write('Yay')
Ejemplo n.º 16
0
 def get(self):
   """Log one fake LOOKUP stats entry (200, 103) and reply 'Yay'."""
   action, amount, where = stats.LOOKUP, 200, 103
   stats.add_entry(action, amount, where)
   self.response.write('Yay')
Ejemplo n.º 17
0
 def get(self):
   """Log one fake RETURN stats entry (4096, 'memcache') and reply 'Yay'."""
   action, amount, where = stats.RETURN, 4096, 'memcache'
   stats.add_entry(action, amount, where)
   self.response.write('Yay')
Ejemplo n.º 18
0
  def preupload(self, request):
    """Checks which digests are missing and issues upload tickets for them.

    Arguments:
      request: the DigestCollection to look up; exposes `items` and
          `namespace.namespace`.

    Returns:
      A UrlCollection whose `items` holds one PreuploadStatus per requested
      digest that is not stored yet. Each status carries the `index` of the
      digest in request.items and an `upload_ticket`; `gs_upload_url` is also
      set when should_push_to_gs() says the item goes to Google Storage.
      Digests that already exist get no status.

    Raises:
      endpoints.BadRequestException: the namespace does not match
          model.NAMESPACE_RE, more than 1000 items were sent, or a digest is
          not valid hex.
    """
    response = UrlCollection(items=[])

    # Reject malformed or oversized requests up front.
    namespace_name = request.namespace.namespace
    if not re.match(r'^%s$' % model.NAMESPACE_RE, namespace_name):
      raise endpoints.BadRequestException(
          'Invalid namespace; allowed keys must pass regexp "%s"' %
          model.NAMESPACE_RE)
    if len(request.items) > 1000:
      raise endpoints.BadRequestException(
          'Only up to 1000 items can be looked up at once')

    # Split the request into digests to upload and digests already stored.
    new_digests, existing_digests = self.partition_collection(request)

    for position, element in enumerate(request.items):
      if not model.is_valid_hex(element.digest):
        raise endpoints.BadRequestException(
            'Invalid hex code: %s' % (element.digest))

      # Only cache misses get an upload ticket.
      if element not in new_digests:
        continue

      ticket = self.generate_ticket(element, request.namespace)
      status = PreuploadStatus(index=position, upload_ticket=ticket)

      # Items destined for GS additionally get a signed upload URL.
      if self.should_push_to_gs(element):
        key = entry_key_or_error(
            request.namespace.namespace, element.digest)
        status.gs_upload_url = self.gs_url_signer.get_upload_url(
            filename=key.id(),
            content_type='application/octet-stream',
            expiration=DEFAULT_LINK_EXPIRATION)

      response.items.append(status)

    # Tag existing entities and collect stats.
    already_stored = DigestCollection(
        items=list(existing_digests), namespace=request.namespace)
    self.tag_existing(already_stored)
    stats.add_entry(stats.LOOKUP, len(request.items), len(existing_digests))
    return response
Ejemplo n.º 19
0
  def storage_helper(self, request, uploaded_to_gs):
    """Record an uploaded entry; shared by store_inline and finalize_gs.

    Arguments:
      request: message exposing `upload_ticket` and, for inline uploads,
          `content`.
      uploaded_to_gs: True when the data is expected in Google Storage, False
          when it is carried in request.content.

    Returns:
      PushPing(ok=True) once the entity has been stored.

    Raises:
      endpoints.BadRequestException: the upload ticket is empty or fails
          validation, the file is missing from Google Storage, or the inline
          content does not match the digest and size embedded in the ticket.
      endpoints.InternalServerErrorException: storing the entity together with
          its verification task failed (GS path only; errors from the inline
          entry.put() are not translated here).
    """
    # A ticket is mandatory and must validate.
    ticket = request.upload_ticket
    if not ticket:
      raise endpoints.BadRequestException(
          'Upload ticket was empty or not provided.')
    try:
      fields = TokenSigner.validate(ticket, UPLOAD_MESSAGES[uploaded_to_gs])
    except (auth.InvalidTokenError, ValueError) as error:
      # NOTE(review): `.message` is a Python 2 exception attribute.
      raise endpoints.BadRequestException(
          'Ticket validation failed: %s' % error.message)

    # Decode the values embedded in the ticket and build the entity key.
    hex_digest = fields['d'].encode('utf-8')
    isolated = bool(int(fields['i']))
    ns = fields['n']
    expanded = int(fields['s'])
    entry_key = entry_key_or_error(ns, hex_digest)

    # Work out the payload and its stored size.
    if not uploaded_to_gs:
      payload = request.content
      stored_size = len(payload)
    else:
      # The file must already be present in GS.
      # TODO(cmassaro): address analogous TODO from handlers_api
      gs_info = gcs.get_file_info(config.settings().gs_bucket, entry_key.id())
      if not gs_info:
        raise endpoints.BadRequestException(
            'File should be in Google Storage.\nFile: \'%s\' Size: %d.' % (
                entry_key.id(), expanded))
      payload = None
      stored_size = gs_info.size

    entry = model.new_content_entry(
        key=entry_key,
        is_isolated=isolated,
        compressed_size=stored_size,
        expanded_size=expanded,
        is_verified=not uploaded_to_gs,
        content=payload,
    )

    if uploaded_to_gs:
      # GCS: store the entity and enqueue its verification task.
      try:
        store_and_enqueue_verify_task(entry, utils.get_task_queue_host())
      except (
          datastore_errors.Error,
          runtime.apiproxy_errors.CancelledError,
          runtime.apiproxy_errors.DeadlineExceededError,
          runtime.apiproxy_errors.OverQuotaError,
          runtime.DeadlineExceededError,
          taskqueue.Error) as e:
        raise endpoints.InternalServerErrorException(
            'Unable to store the entity: %s.' % e.__class__.__name__)
      where = 'GS; %s' % entry.key.id()
    else:
      # DB: the inline data must hash to what the ticket promised.
      if (hex_digest, expanded) != hash_content(payload, ns):
        raise endpoints.BadRequestException(
            'Embedded digest does not match provided data: '
            '(digest, size): (%r, %r); expected: %r' % (
                hex_digest, expanded, hash_content(payload, ns)))
      entry.put()
      where = 'inline'

    stats.add_entry(stats.STORE, entry.compressed_size, where)
    return PushPing(ok=True)
Ejemplo n.º 20
0
    def handle(self, namespace, hash_key):
        """Finalizes the upload of a single entry and stores its entity.

        The request must carry the signed query parameters produced when the
        upload URL was generated:
          x:   expiration timestamp of the signature (seconds, compared against
               time.time()).
          s:   advertised (expanded) size of the item.
          i:   '1' if the item is an *.isolated file, '0' otherwise.
          g:   '1' if the content was uploaded directly to Google Storage (the
               request is then an empty POST), '0' if the content is carried
               inline in the body of a PUT.
          sig: signature over the request method and all of the above.

        Inline content is hashed and checked against hash_key and the advertised
        size before anything is stored. Content that went directly to Google
        Storage is not read here; its entity is stored unverified together with
        a transactionally enqueued verification task.

        Args:
          namespace: namespace the entry belongs to; part of the signed data.
          hash_key: hex digest the client advertised for the entry.

        Returns:
          The result of self.send_error() when the request is rejected, None
          otherwise (the reply is then sent through self.send_json()).
        """
        # Extract relevant request parameters.
        expiration_ts = self.request.get('x')
        item_size = self.request.get('s')
        is_isolated = self.request.get('i')
        uploaded_to_gs = self.request.get('g')
        signature = self.request.get('sig')

        # Build correct signature.
        expected_sig = self.generate_signature(config.settings().global_secret,
                                               self.request.method,
                                               expiration_ts, namespace,
                                               hash_key, item_size,
                                               is_isolated, uploaded_to_gs)

        # Verify signature is correct.
        if not utils.constant_time_equals(signature, expected_sig):
            return self.send_error('Incorrect signature.')

        # Convert parameters from strings back to something useful.
        # It can't fail since matching signature means it was us who generated
        # this strings in a first place.
        expiration_ts = int(expiration_ts)
        item_size = int(item_size)
        is_isolated = bool(int(is_isolated))
        uploaded_to_gs = bool(int(uploaded_to_gs))

        # Verify signature is not yet expired.
        if time.time() > expiration_ts:
            return self.send_error('Expired signature.')

        if uploaded_to_gs:
            # GS upload finalization uses empty POST body.
            assert self.request.method == 'POST'
            if self.request.headers.get('content-length'):
                return self.send_error('Expecting empty POST.')
            content = None
        else:
            # Datastore upload uses PUT.
            assert self.request.method == 'PUT'
            if self.request.headers.get('content-length'):
                content = self.request.body
            else:
                content = ''

        # Info about corresponding GS entry (if it exists).
        gs_bucket = config.settings().gs_bucket
        key = model.entry_key(namespace, hash_key)

        # Verify the data while at it since it's already in memory but before
        # storing it in memcache and datastore.
        if content is not None:
            # Verify advertised hash matches the data.
            try:
                hex_digest, expanded_size = hash_content(content, namespace)
                if hex_digest != hash_key:
                    raise ValueError('Hash and data do not match, '
                                     '%d bytes (%d bytes expanded)' %
                                     (len(content), expanded_size))
                if expanded_size != item_size:
                    raise ValueError(
                        'Advertised data length (%d) and actual data length (%d) '
                        'do not match' % (item_size, expanded_size))
            except ValueError as err:
                return self.send_error('Inline verification failed.\n%s' % err)
            # Successfully verified!
            compressed_size = len(content)
            needs_verification = False
        else:
            # Fetch size of the stored file.
            file_info = gcs.get_file_info(gs_bucket, key.id())
            if not file_info:
                # TODO(maruel): Do not fail yet. If the request got up to here, the file
                # is likely there but the service may have trouble fetching the metadata
                # from GS.
                return self.send_error(
                    'File should be in Google Storage.\nFile: \'%s\' Size: %d.'
                    % (key.id(), item_size))
            compressed_size = file_info.size
            needs_verification = True

        # Data is here and it's too large for DS, so put it in GS. It is likely
        # between MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS
        if content is not None and len(content) >= MIN_SIZE_FOR_GS:
            if not gcs.write_file(gs_bucket, key.id(), [content]):
                # Returns 503 so the client automatically retries.
                return self.send_error('Unable to save the content to GS.',
                                       http_code=503)
            # It's now in GS.
            uploaded_to_gs = True

        # Can create entity now, everything appears to be legit.
        entry = model.new_content_entry(
            key=key,
            is_isolated=is_isolated,
            compressed_size=compressed_size,
            expanded_size=-1 if needs_verification else item_size,
            is_verified=not needs_verification)

        # If it's not in GS then put it inline.
        if not uploaded_to_gs:
            assert content is not None and len(content) < MIN_SIZE_FOR_GS
            entry.content = content

        # Start saving *.isolated into memcache iff its content is available and
        # it's not in Datastore: there's no point in saving inline blobs in memcache
        # because ndb already memcaches them.
        memcache_store_future = None
        if (content is not None and entry.content is None and entry.is_isolated
                and entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
            # 'async' is a reserved word since Python 3.7, so the keyword is
            # passed through dict unpacking; the resulting call is the same as
            # writing async=True under Python 2.
            memcache_store_future = model.save_in_memcache(
                namespace, hash_key, content, **{'async': True})

        try:
            # If entry was already verified above (i.e. it is a small inline entry),
            # store it right away, possibly overriding existing entity. Most of
            # the time it is a new entry anyway (since clients try to upload only
            # new entries).
            if not needs_verification:
                entry.put()
            else:
                # For large entries (that require expensive verification) be more
                # careful and check that it is indeed a new entity. No need to do it in
                # transaction: a race condition would lead to redundant verification
                # task enqueued, no big deal.
                existing = entry.key.get()
                if existing:
                    if existing.is_verified:
                        logging.info('Entity exists and already verified')
                    else:
                        logging.info('Entity exists, but not yet verified')
                else:
                    # New entity. Store it and enqueue verification task, transactionally.
                    task_queue_host = utils.get_task_queue_host()

                    def run():
                        entry.put()
                        taskqueue.add(url='/internal/taskqueue/verify/%s' %
                                      entry.key.id(),
                                      queue_name='verify',
                                      headers={'Host': task_queue_host},
                                      transactional=True)

                    datastore_utils.transaction(run)

            # TODO(vadimsh): Fill in details about the entry, such as expiration time.
            self.send_json({'entry': {}})

            # Log stats. Entries stored in the entity itself are labelled
            # 'inline'; everything else is labelled 'GS; <entry id>'. The
            # branches are spelled out because a one-line conditional of the
            # form `'GS; ' + a if cond else b` binds as `('GS; ' + a) if ...`
            # and attaches the prefix to the wrong branch.
            if entry.content is not None:
                where = 'inline'
            else:
                where = 'GS; %s' % entry.key.id()
            stats.add_entry(stats.STORE, entry.compressed_size, where)

        finally:
            # Do not keep dangling futures. Note that error here is ignored,
            # memcache is just an optimization.
            if memcache_store_future:
                memcache_store_future.wait()