Example #1
0
    def generate_push_urls(self, entry_info, namespace):
        """Generates a pair of URLs to be used by clients to upload an item.

    The GS filename is exactly ContentEntry.key.id().

    The URLs being generated are an 'upload URL' and a 'finalize URL'. The
    client uploads an item to the upload URL (via a PUT request) and then
    POSTs the status of the upload to the finalize URL.

    The finalize URL is optional (it is None in that case).
    """
        if self.should_push_to_gs(entry_info):
            # Store larger stuff in Google Storage.
            key = model.entry_key(namespace, entry_info.digest)
            upload_url = self.gs_url_signer.get_upload_url(
                filename=key.id(),
                content_type='application/octet-stream',
                expiration=self.DEFAULT_LINK_EXPIRATION)
            finalize_url = self.generate_store_url(
                entry_info,
                namespace,
                http_verb='POST',
                uploaded_to_gs=True,
                expiration=self.DEFAULT_LINK_EXPIRATION)
        else:
            # Store smallish entries and *.isolated in Datastore directly.
            upload_url = self.generate_store_url(
                entry_info,
                namespace,
                http_verb='PUT',
                uploaded_to_gs=False,
                expiration=self.DEFAULT_LINK_EXPIRATION)
            finalize_url = None
        return upload_url, finalize_url
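As context for the docstring above, here is a minimal client-side sketch of the two-step flow, assuming the requests library; the URL values come from the server, and the exact finalize payload is an assumption, not part of the original code.

import requests

def push_item(upload_url, finalize_url, payload):
  # Step 1: PUT the raw bytes to the signed upload URL.
  resp = requests.put(
      upload_url,
      data=payload,
      headers={'Content-Type': 'application/octet-stream'})
  resp.raise_for_status()
  # Step 2: if a finalize URL was returned (GS uploads only), POST to it to
  # report that the upload completed; inline uploads have no finalize step.
  if finalize_url:
    requests.post(finalize_url).raise_for_status()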
Example #2
0
  def post(self, namespace, timestamp):
    digests = []
    now = utils.timestamp_to_datetime(long(timestamp))
    expiration = config.settings().default_expiration
    try:
      digests = payload_to_hashes(self, namespace)
      # Requests all the entities at once.
      futures = ndb.get_multi_async(
          model.entry_key(namespace, binascii.hexlify(d)) for d in digests)

      to_save = []
      while futures:
        # Process entities in whatever order their fetches complete.
        future = ndb.Future.wait_any(futures)
        futures.remove(future)
        item = future.get_result()
        if item and item.next_tag_ts < now:
          # Update the timestamp. Add a bit of pseudo randomness.
          item.expiration_ts, item.next_tag_ts = model.expiration_jitter(
              now, expiration)
          to_save.append(item)
      if to_save:
        ndb.put_multi(to_save)
      logging.info(
          'Timestamped %d entries out of %s', len(to_save), len(digests))
    except Exception as e:
      logging.error('Failed to stamp entries: %s\n%d entries', e, len(digests))
      raise
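model.expiration_jitter is referenced but not shown here. Purely to illustrate the comment about pseudo-randomness, a helper with that shape might look like the sketch below; the day units and the 10% jitter are assumptions, not the real implementation.

import datetime
import random

def expiration_jitter(now, expiration):
  # Hypothetical sketch: push the expiration out by the nominal lifetime plus
  # up to 10% extra, so entries touched together do not all expire at once.
  jittered = expiration * random.uniform(1.0, 1.1)
  expiration_ts = now + datetime.timedelta(days=jittered)
  # Do not re-tag the entry again until a tenth of the lifetime has passed.
  next_tag_ts = now + datetime.timedelta(days=expiration * 0.1)
  return expiration_ts, next_tag_ts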
Example #3
0
  def generate_push_urls(self, entry_info, namespace):
    """Generates a pair of URLs to be used by clients to upload an item.

    The GS filename is exactly ContentEntry.key.id().

    The URLs being generated are an 'upload URL' and a 'finalize URL'. The
    client uploads an item to the upload URL (via a PUT request) and then
    POSTs the status of the upload to the finalize URL.

    The finalize URL is optional (it is None in that case).
    """
    if self.should_push_to_gs(entry_info):
      # Store larger stuff in Google Storage.
      key = model.entry_key(namespace, entry_info.digest)
      upload_url = self.gs_url_signer.get_upload_url(
          filename=key.id(),
          content_type='application/octet-stream',
          expiration=self.DEFAULT_LINK_EXPIRATION)
      finalize_url = self.generate_store_url(
          entry_info, namespace,
          http_verb='POST',
          uploaded_to_gs=True,
          expiration=self.DEFAULT_LINK_EXPIRATION)
    else:
      # Store smallish entries and *.isolated in Datastore directly.
      upload_url = self.generate_store_url(
          entry_info, namespace,
          http_verb='PUT',
          uploaded_to_gs=False,
          expiration=self.DEFAULT_LINK_EXPIRATION)
      finalize_url = None
    return upload_url, finalize_url
Example #4
0
    def post(self, namespace, timestamp):
        digests = []
        now = utils.timestamp_to_datetime(long(timestamp))
        expiration = config.settings().default_expiration
        try:
            digests = payload_to_hashes(self, namespace)
            # Requests all the entities at once.
            futures = ndb.get_multi_async(
                model.entry_key(namespace, binascii.hexlify(d))
                for d in digests)

            to_save = []
            while futures:
                # Process entities in whatever order their fetches complete.
                future = ndb.Future.wait_any(futures)
                futures.remove(future)
                item = future.get_result()
                if item and item.next_tag_ts < now:
                    # Update the timestamp. Add a bit of pseudo randomness.
                    item.expiration_ts, item.next_tag_ts = model.expiration_jitter(
                        now, expiration)
                    to_save.append(item)
            if to_save:
                ndb.put_multi(to_save)
            logging.info('Timestamped %d entries out of %s', len(to_save),
                         len(digests))
        except Exception as e:
            logging.error('Failed to stamp entries: %s\n%d entries', e,
                          len(digests))
            raise
Example #5
0
  def test_finalize_gs_creates_content_entry(self):
    """Assert that finalize_gs_upload creates a content entry."""
    content = pad_string('empathy')
    request = self.store_request(content)
    embedded = validate(
        request.upload_ticket, handlers_endpoints.UPLOAD_MESSAGES[1])
    key = model.entry_key(embedded['n'], embedded['d'])

    # finalize_gs_upload should put a new ContentEntry into the database
    self.mock(gcs, 'get_file_info', get_file_info_factory(content))
    self.call_api(
        'finalize_gs_upload', self.message_to_dict(request), 200)
    stored = key.get()
    self.assertEqual(key, stored.key)

    # assert that expected attributes are present
    self.assertEqual(None, stored.content)
    self.assertEqual(int(embedded['s']), stored.expanded_size)

    # ensure that verification occurs
    self.mock(gcs, 'read_file', lambda _bucket, _key: content)

    # add a side effect in execute_tasks()
    # TODO(cmassaro): there must be a better way than this
    def set_verified():
      stored_entry = stored.key.get()
      if not stored_entry.is_verified:
        stored_entry.is_verified = True
    self.mock_side_effect(self._taskqueue_stub, 'DeleteTask', set_verified)

    # assert that verification occurs in the taskqueue
    self.assertFalse(stored.key.get().is_verified)
    self.assertEqual(1, self.execute_tasks())
    self.assertTrue(stored.key.get().is_verified)
Example #6
0
def gen_content(namespace='default', content='Foo'):
  h = model.get_hash_algo(namespace)
  h.update(content)
  hashhex = h.hexdigest()
  key = model.entry_key(namespace, hashhex)
  model.new_content_entry(
      key,
      is_isolated=False,
      content=content,
      compressed_size=len(content),
      expanded_size=len(content),
      is_verified=True).put()
  return hashhex
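A brief, hypothetical usage sketch for the helper above; the lookup and assertions are illustrative and not part of the original tests.

def seed_entry(namespace='default', content='Foo'):
  # Seed the datastore with a known entry, then fetch it back by its digest.
  hashhex = gen_content(namespace, content)
  entry = model.entry_key(namespace, hashhex).get()
  assert entry is not None and entry.is_verified
  return entry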
Example #7
0
  def test_check_existing_enqueues_tasks(self):
    """Assert that existent entities are enqueued."""
    collection = handlers_endpoints.DigestCollection(
        namespace=handlers_endpoints.Namespace())
    collection.items.append(
        generate_digest('some content', collection.namespace))
    key = model.entry_key(
        collection.namespace.namespace, collection.items[0].digest)

    # guarantee that one digest already exists in the datastore
    model.new_content_entry(key).put()
    self.call_api(
        'preupload', self.message_to_dict(collection), 200)

    # find enqueued tasks
    enqueued_tasks = self.execute_tasks()
    self.assertEqual(1, enqueued_tasks)
Example #8
0
  def test_store_inline_empty_content(self):
    """Assert that inline content storage works when content is empty."""
    request = self.store_request('')
    embedded = validate(
        request.upload_ticket, handlers_endpoints.UPLOAD_MESSAGES[0])
    key = model.entry_key(embedded['n'], embedded['d'])

    # assert that store_inline puts the correct entity into the datastore
    self.call_api(
        'store_inline', self.message_to_dict(request), 200)
    stored = key.get()
    self.assertEqual(key, stored.key)

    # assert that expected (digest, size) pair is generated by stored content
    self.assertEqual(
        (embedded['d'].encode('utf-8'), int(embedded['s'])),
        handlers_endpoints.hash_content(stored.content, embedded['n']))
Example #9
0
    def check_entry_infos(entries, namespace):
        """Generator that checks for EntryInfo entries existence.

    Yields pairs (EntryInfo object, True if such entry exists in Datastore).
    """
        # Kick off all queries in parallel. Build mapping Future -> digest.
        futures = {}
        for entry_info in entries:
            key = model.entry_key(namespace, entry_info.digest)
            futures[key.get_async(use_cache=False)] = entry_info

        # Pick first one that finishes and yield it, rinse, repeat.
        while futures:
            future = ndb.Future.wait_any(futures)
            # TODO(maruel): For items that were present, make sure
            # future.get_result().compressed_size == entry_info.size.
            yield futures.pop(future), bool(future.get_result())
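A hypothetical caller of this generator, partitioning entries by existence as the Datastore lookups complete; it is not part of the original module.

def split_by_existence(entries, namespace):
  # Consume results in completion order rather than submission order.
  missing, present = [], []
  for entry_info, exists in check_entry_infos(entries, namespace):
    (present if exists else missing).append(entry_info)
  return missing, present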
Example #10
0
    def get(self, namespace, hash_key):  #pylint: disable=W0221
        # Parse 'Range' header if it's present to extract initial offset.
        # Only support single continuous range from some |offset| to the end.
        offset = 0
        range_header = self.request.headers.get('range')
        if range_header:
            match = re.match(r'bytes=(\d+)-', range_header)
            if not match:
                return self.send_error('Unsupported byte range.\n\'%s\'.' %
                                       range_header,
                                       http_code=416)
            offset = int(match.group(1))

        memcache_entry = memcache.get(hash_key,
                                      namespace='table_%s' % namespace)
        if memcache_entry is not None:
            self.send_data(memcache_entry, filename=hash_key, offset=offset)
            stats.add_entry(stats.RETURN,
                            len(memcache_entry) - offset, 'memcache')
            return

        entry = model.entry_key(namespace, hash_key).get()
        if not entry:
            return self.send_error('Unable to retrieve the entry.',
                                   http_code=404)

        if entry.content is not None:
            self.send_data(entry.content, filename=hash_key, offset=offset)
            stats.add_entry(stats.RETURN,
                            len(entry.content) - offset, 'inline')
            return

        # Generate signed download URL.
        settings = config.settings()
        # TODO(maruel): The GS object may not exist anymore. Handle this.
        signer = gcs.URLSigner(settings.gs_bucket, settings.gs_client_id_email,
                               settings.gs_private_key)
        # The entry key is the GS filepath.
        signed_url = signer.get_download_url(entry.key.id())

        # Redirect the client to this URL. If a 'Range' header was used, the
        # client will pass it along to Google Storage and fetch only a
        # subrange of the file, so update the stats accordingly.
        self.redirect(signed_url)
        stats.add_entry(stats.RETURN, entry.compressed_size - offset,
                        'GS; %s' % entry.key.id())
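For reference, the handler above only accepts a single open-ended range of the form 'bytes=<offset>-'. A minimal client sketch using the requests library follows; the URL layout is a placeholder, not the real route.

import requests

def fetch_from_offset(base_url, namespace, hash_key, offset=0):
  # Anything other than a single 'bytes=<offset>-' range is answered with 416.
  url = '%s/content/%s/%s' % (base_url, namespace, hash_key)  # placeholder path
  headers = {'Range': 'bytes=%d-' % offset} if offset else {}
  resp = requests.get(url, headers=headers)
  resp.raise_for_status()
  return resp.content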
Example #11
0
  def check_entry_infos(entries, namespace):
    """Generator that checks for EntryInfo entries existence.

    Yields pairs (EntryInfo object, True if such entry exists in Datastore).
    """
    # Kick off all queries in parallel. Build mapping Future -> digest.
    futures = {}
    for entry_info in entries:
      key = model.entry_key(namespace, entry_info.digest)
      futures[key.get_async(use_cache=False)] = entry_info

    # Pick first one that finishes and yield it, rinse, repeat.
    while futures:
      future = ndb.Future.wait_any(futures)
      # TODO(maruel): For items that were present, make sure
      # future.get_result().compressed_size == entry_info.size.
      yield futures.pop(future), bool(future.get_result())
Example #12
0
  def get(self, namespace, hash_key):  #pylint: disable=W0221
    # Parse 'Range' header if it's present to extract initial offset.
    # Only support single continuous range from some |offset| to the end.
    offset = 0
    range_header = self.request.headers.get('range')
    if range_header:
      match = re.match(r'bytes=(\d+)-', range_header)
      if not match:
        return self.send_error(
            'Unsupported byte range.\n\'%s\'.' % range_header, http_code=416)
      offset = int(match.group(1))

    memcache_entry = memcache.get(hash_key, namespace='table_%s' % namespace)
    if memcache_entry is not None:
      self.send_data(memcache_entry, filename=hash_key, offset=offset)
      stats.add_entry(stats.RETURN, len(memcache_entry) - offset, 'memcache')
      return

    entry = model.entry_key(namespace, hash_key).get()
    if not entry:
      return self.send_error('Unable to retrieve the entry.', http_code=404)

    if entry.content is not None:
      self.send_data(entry.content, filename=hash_key, offset=offset)
      stats.add_entry(stats.RETURN, len(entry.content) - offset, 'inline')
      return

    # Generate signed download URL.
    settings = config.settings()
    # TODO(maruel): The GS object may not exist anymore. Handle this.
    signer = gcs.URLSigner(settings.gs_bucket,
        settings.gs_client_id_email, settings.gs_private_key)
    # The entry key is the GS filepath.
    signed_url = signer.get_download_url(entry.key.id())

    # Redirect the client to this URL. If a 'Range' header was used, the
    # client will pass it along to Google Storage and fetch only a subrange
    # of the file, so update the stats accordingly.
    self.redirect(signed_url)
    stats.add_entry(
        stats.RETURN, entry.compressed_size - offset, 'GS; %s' % entry.key.id())
Example #13
0
  def test_trim_missing(self):
    deleted = self.mock_delete_files()
    def gen_file(i, t=0):
      return (i, gcs.cloudstorage.GCSFileStat(i, 100, 'etag', t))
    mock_files = [
        # Was touched.
        gen_file('d/' + '0' * 40),
        # Is deleted.
        gen_file('d/' + '1' * 40),
        # Too recent.
        gen_file('d/' + '2' * 40, time.time() - 60),
    ]
    self.mock(gcs, 'list_files', lambda _: mock_files)

    model.ContentEntry(key=model.entry_key('d', '0' * 40)).put()
    headers = {'X-AppEngine-Cron': 'true'}
    resp = self.app_backend.get(
        '/internal/cron/cleanup/trigger/trim_lost', headers=headers)
    self.assertEqual(200, resp.status_code)
    self.assertEqual(1, self.execute_tasks())
    self.assertEqual(['d/' + '1' * 40], deleted)
Example #14
0
  def test_check_existing_finds_existing_entities(self):
    """Assert that existence check is working."""
    collection = generate_collection(
        ['small content', 'larger content', 'biggest content'])
    key = model.entry_key(
        collection.namespace.namespace, collection.items[0].digest)

    # guarantee that one digest already exists in the datastore
    model.new_content_entry(key).put()
    response = self.call_api(
        'preupload', self.message_to_dict(collection), 200)

    # we should see one enqueued task and two new URLs in the response
    items = response.json['items']
    self.assertEqual(2, len(items))
    self.assertEqual([1, 2], [int(item['index']) for item in items])
    for item in items:
      self.assertIsNotNone(item.get('upload_ticket'))

    # remove tasks so tearDown doesn't complain
    _ = self.execute_tasks()
Example #15
0
    def test_trim_missing(self):
        deleted = self.mock_delete_files()

        def gen_file(i, t=0):
            return (i, gcs.cloudstorage.GCSFileStat(i, 100, 'etag', t))

        mock_files = [
            # Was touched.
            gen_file('d/' + '0' * 40),
            # Is deleted.
            gen_file('d/' + '1' * 40),
            # Too recent.
            gen_file('d/' + '2' * 40,
                     time.time() - 60),
        ]
        self.mock(gcs, 'list_files', lambda _: mock_files)

        model.ContentEntry(key=model.entry_key('d', '0' * 40)).put()
        headers = {'X-AppEngine-Cron': 'true'}
        resp = self.app_backend.get('/internal/cron/cleanup/trigger/trim_lost',
                                    headers=headers)
        self.assertEqual(200, resp.status_code)
        self.assertEqual(1, self.execute_tasks())
        self.assertEqual(['d/' + '1' * 40], deleted)
Example #16
0
  def handle(self, namespace, hash_key):
    """Handles this request."""
    # Extract relevant request parameters.
    expiration_ts = self.request.get('x')
    item_size = self.request.get('s')
    is_isolated = self.request.get('i')
    uploaded_to_gs = self.request.get('g')
    signature = self.request.get('sig')

    # Build correct signature.
    expected_sig = self.generate_signature(
        config.settings().global_secret, self.request.method, expiration_ts,
        namespace, hash_key, item_size, is_isolated, uploaded_to_gs)

    # Verify signature is correct.
    if not utils.constant_time_equals(signature, expected_sig):
      return self.send_error('Incorrect signature.')

    # Convert parameters from strings back to something useful.
    # This can't fail since a matching signature means we generated these
    # strings in the first place.
    expiration_ts = int(expiration_ts)
    item_size = int(item_size)
    is_isolated = bool(int(is_isolated))
    uploaded_to_gs = bool(int(uploaded_to_gs))

    # Verify signature is not yet expired.
    if time.time() > expiration_ts:
      return self.send_error('Expired signature.')

    if uploaded_to_gs:
      # GS upload finalization uses empty POST body.
      assert self.request.method == 'POST'
      if self.request.headers.get('content-length'):
        return self.send_error('Expecting empty POST.')
      content = None
    else:
      # Datastore upload uses PUT.
      assert self.request.method == 'PUT'
      if self.request.headers.get('content-length'):
        content = self.request.body
      else:
        content = ''

    # Info about corresponding GS entry (if it exists).
    gs_bucket = config.settings().gs_bucket
    key = model.entry_key(namespace, hash_key)

    # Verify the data now, while it is already in memory, before storing it
    # in memcache and the datastore.
    if content is not None:
      # Verify advertised hash matches the data.
      try:
        hex_digest, expanded_size = hash_content(content, namespace)
        if hex_digest != hash_key:
          raise ValueError(
              'Hash and data do not match, '
              '%d bytes (%d bytes expanded)' % (len(content), expanded_size))
        if expanded_size != item_size:
          raise ValueError(
              'Advertised data length (%d) and actual data length (%d) '
              'do not match' % (item_size, expanded_size))
      except ValueError as err:
        return self.send_error('Inline verification failed.\n%s' % err)
      # Successfully verified!
      compressed_size = len(content)
      needs_verification = False
    else:
      # Fetch size of the stored file.
      file_info = gcs.get_file_info(gs_bucket, key.id())
      if not file_info:
        # TODO(maruel): Do not fail yet. If the request got up to here, the file
        # is likely there but the service may have trouble fetching the metadata
        # from GS.
        return self.send_error(
            'File should be in Google Storage.\nFile: \'%s\' Size: %d.' %
            (key.id(), item_size))
      compressed_size = file_info.size
      needs_verification = True

    # The data is here and it's too large for the datastore, so put it in GS.
    # Likely MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS.
    if content is not None and len(content) >= MIN_SIZE_FOR_GS:
      if not gcs.write_file(gs_bucket, key.id(), [content]):
        # Returns 503 so the client automatically retries.
        return self.send_error(
            'Unable to save the content to GS.', http_code=503)
      # It's now in GS.
      uploaded_to_gs = True

    # Can create entity now, everything appears to be legit.
    entry = model.new_content_entry(
        key=key,
        is_isolated=is_isolated,
        compressed_size=compressed_size,
        expanded_size=-1 if needs_verification else item_size,
        is_verified=not needs_verification)

    # If it's not in GS then put it inline.
    if not uploaded_to_gs:
      assert content is not None and len(content) < MIN_SIZE_FOR_GS
      entry.content = content

    # Start saving *.isolated into memcache iff its content is available and
    # it's not in Datastore: there's no point in saving inline blobs in memcache
    # because ndb already memcaches them.
    memcache_store_future = None
    if (content is not None and
        entry.content is None and
        entry.is_isolated and
        entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
      memcache_store_future = model.save_in_memcache(
          namespace, hash_key, content, async=True)

    try:
      # If entry was already verified above (i.e. it is a small inline entry),
      # store it right away, possibly overriding existing entity. Most of
      # the time it is a new entry anyway (since clients try to upload only
      # new entries).
      if not needs_verification:
        entry.put()
      else:
        # For large entries (which require expensive verification), be more
        # careful and check that it is indeed a new entity. There is no need
        # to do it in a transaction: a race condition would merely enqueue a
        # redundant verification task, which is no big deal.
        existing = entry.key.get()
        if existing:
          if existing.is_verified:
            logging.info('Entity exists and already verified')
          else:
            logging.info('Entity exists, but not yet verified')
        else:
          # New entity. Store it and enqueue verification task, transactionally.
          task_queue_host = utils.get_task_queue_host()
          def run():
            entry.put()
            taskqueue.add(
                url='/internal/taskqueue/verify/%s' % entry.key.id(),
                queue_name='verify',
                headers={'Host': task_queue_host},
                transactional=True)
          datastore_utils.transaction(run)

      # TODO(vadimsh): Fill in details about the entry, such as expiration time.
      self.send_json({'entry': {}})

      # Log stats.
      if entry.content is not None:
        where = 'inline'
      else:
        where = 'GS; %s' % entry.key.id()
      stats.add_entry(stats.STORE, entry.compressed_size, where)

    finally:
      # Do not keep dangling futures. Note that error here is ignored,
      # memcache is just an optimization.
      if memcache_store_future:
        memcache_store_future.wait()
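The signature check above relies on utils.constant_time_equals. As an illustration of the idea only (not the project's implementation), a constant-time comparison touches every character so timing does not leak how long the matching prefix is; hmac.compare_digest in Python 2.7.7+/3.3+ offers the same guarantee.

def constant_time_equals(a, b):
  # Compare every character even after a mismatch is found, so the time taken
  # does not depend on how much of the signature was correct.
  if len(a) != len(b):
    return False
  result = 0
  for x, y in zip(a, b):
    result |= ord(x) ^ ord(y)
  return result == 0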
Example #17
0
    def post(self, namespace, hash_key):
        entry = model.entry_key(namespace, hash_key).get()
        if not entry:
            logging.error('Failed to find entity')
            return
        if entry.is_verified:
            logging.warning('Was already verified')
            return
        if entry.content is not None:
            logging.error('Should not be called with inline content')
            return

        # Get GS file size.
        gs_bucket = config.settings().gs_bucket
        gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

        # It's None if file is missing.
        if not gs_file_info:
            # According to the docs, GS is read-after-write consistent, so a
            # file is missing only if it wasn't stored at all or it was
            # deleted; in either case it's not a valid ContentEntry.
            self.purge_entry(entry, 'No such GS file')
            return

        # Expected stored length and actual length should match.
        if gs_file_info.size != entry.compressed_size:
            self.purge_entry(
                entry, 'Bad GS file: expected size is %d, actual size is %d',
                entry.compressed_size, gs_file_info.size)
            return

        save_to_memcache = (
            entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED
            and entry.is_isolated)
        expanded_size = 0
        digest = model.get_hash_algo(namespace)
        data = None

        try:
            # Read and process the data in blocks.
            stream = gcs.read_file(gs_bucket, entry.key.id())
            if save_to_memcache:
                # Wraps stream with a generator that accumulates the data.
                stream = Accumulator(stream)

            for data in model.expand_content(namespace, stream):
                expanded_size += len(data)
                digest.update(data)
                # Make sure the data is GC'ed.
                del data

            # Hashes should match.
            if digest.hexdigest() != hash_key:
                self.purge_entry(
                    entry,
                    'SHA-1 does not match data (%d bytes, %d bytes expanded)',
                    entry.compressed_size, expanded_size)
                return

        except gcs.NotFoundError as e:
            # Somebody deleted a file between get_file_info and read_file calls.
            self.purge_entry(entry, 'File was unexpectedly deleted')
            return
        except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
            # Misconfiguration in Google Storage ACLs. Don't delete the entry;
            # it may be fine. The ACL problem may be fixed before the next
            # retry.
            logging.warning('CloudStorage auth issues (%s): %s',
                            e.__class__.__name__, e)
            # Abort so the job is retried automatically.
            return self.abort(500)
        except (gcs.FatalError, zlib.error, IOError) as e:
            # ForbiddenError and AuthorizationError inherit FatalError, so this except
            # block should be last.
            # It's broken or unreadable.
            self.purge_entry(entry, 'Failed to read the file (%s): %s',
                             e.__class__.__name__, e)
            return

        # Verified. Data matches the hash.
        entry.expanded_size = expanded_size
        entry.is_verified = True
        future = entry.put_async()
        logging.info('%d bytes (%d bytes expanded) verified',
                     entry.compressed_size, expanded_size)
        if save_to_memcache:
            model.save_in_memcache(namespace, hash_key,
                                   ''.join(stream.accumulated))
        future.wait()
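Accumulator is defined elsewhere in this module; conceptually it is a pass-through wrapper that records the chunks it yields so they can be re-joined for memcache. A plausible sketch follows; the real class may differ.

class Accumulator(object):
  """Wraps a chunk iterator and keeps a copy of every chunk it yields."""

  def __init__(self, source):
    self.accumulated = []
    self._source = source

  def __iter__(self):
    for chunk in self._source:
      self.accumulated.append(chunk)
      yield chunk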
Example #18
0
    def handle(self, namespace, hash_key):
        """Handles this request."""
        # Extract relevant request parameters.
        expiration_ts = self.request.get('x')
        item_size = self.request.get('s')
        is_isolated = self.request.get('i')
        uploaded_to_gs = self.request.get('g')
        signature = self.request.get('sig')

        # Build correct signature.
        expected_sig = self.generate_signature(config.settings().global_secret,
                                               self.request.method,
                                               expiration_ts, namespace,
                                               hash_key, item_size,
                                               is_isolated, uploaded_to_gs)

        # Verify signature is correct.
        if not utils.constant_time_equals(signature, expected_sig):
            return self.send_error('Incorrect signature.')

        # Convert parameters from strings back to something useful.
        # This can't fail since a matching signature means we generated these
        # strings in the first place.
        expiration_ts = int(expiration_ts)
        item_size = int(item_size)
        is_isolated = bool(int(is_isolated))
        uploaded_to_gs = bool(int(uploaded_to_gs))

        # Verify signature is not yet expired.
        if time.time() > expiration_ts:
            return self.send_error('Expired signature.')

        if uploaded_to_gs:
            # GS upload finalization uses empty POST body.
            assert self.request.method == 'POST'
            if self.request.headers.get('content-length'):
                return self.send_error('Expecting empty POST.')
            content = None
        else:
            # Datastore upload uses PUT.
            assert self.request.method == 'PUT'
            if self.request.headers.get('content-length'):
                content = self.request.body
            else:
                content = ''

        # Info about corresponding GS entry (if it exists).
        gs_bucket = config.settings().gs_bucket
        key = model.entry_key(namespace, hash_key)

        # Verify the data now, while it is already in memory, before storing
        # it in memcache and the datastore.
        if content is not None:
            # Verify advertised hash matches the data.
            try:
                hex_digest, expanded_size = hash_content(content, namespace)
                if hex_digest != hash_key:
                    raise ValueError('Hash and data do not match, '
                                     '%d bytes (%d bytes expanded)' %
                                     (len(content), expanded_size))
                if expanded_size != item_size:
                    raise ValueError(
                        'Advertised data length (%d) and actual data length (%d) '
                        'do not match' % (item_size, expanded_size))
            except ValueError as err:
                return self.send_error('Inline verification failed.\n%s' % err)
            # Successfully verified!
            compressed_size = len(content)
            needs_verification = False
        else:
            # Fetch size of the stored file.
            file_info = gcs.get_file_info(gs_bucket, key.id())
            if not file_info:
                # TODO(maruel): Do not fail yet. If the request got up to here, the file
                # is likely there but the service may have trouble fetching the metadata
                # from GS.
                return self.send_error(
                    'File should be in Google Storage.\nFile: \'%s\' Size: %d.'
                    % (key.id(), item_size))
            compressed_size = file_info.size
            needs_verification = True

        # The data is here and it's too large for the datastore, so put it in
        # GS. Likely MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS.
        if content is not None and len(content) >= MIN_SIZE_FOR_GS:
            if not gcs.write_file(gs_bucket, key.id(), [content]):
                # Returns 503 so the client automatically retries.
                return self.send_error('Unable to save the content to GS.',
                                       http_code=503)
            # It's now in GS.
            uploaded_to_gs = True

        # Can create entity now, everything appears to be legit.
        entry = model.new_content_entry(
            key=key,
            is_isolated=is_isolated,
            compressed_size=compressed_size,
            expanded_size=-1 if needs_verification else item_size,
            is_verified=not needs_verification)

        # If it's not in GS then put it inline.
        if not uploaded_to_gs:
            assert content is not None and len(content) < MIN_SIZE_FOR_GS
            entry.content = content

        # Start saving *.isolated into memcache iff its content is available and
        # it's not in Datastore: there's no point in saving inline blobs in memcache
        # because ndb already memcaches them.
        memcache_store_future = None
        if (content is not None and entry.content is None and entry.is_isolated
                and entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
            memcache_store_future = model.save_in_memcache(namespace,
                                                           hash_key,
                                                           content,
                                                           async=True)

        try:
            # If entry was already verified above (i.e. it is a small inline entry),
            # store it right away, possibly overriding existing entity. Most of
            # the time it is a new entry anyway (since clients try to upload only
            # new entries).
            if not needs_verification:
                entry.put()
            else:
                # For large entries (which require expensive verification), be
                # more careful and check that it is indeed a new entity. There
                # is no need to do it in a transaction: a race condition would
                # merely enqueue a redundant verification task, no big deal.
                existing = entry.key.get()
                if existing:
                    if existing.is_verified:
                        logging.info('Entity exists and already verified')
                    else:
                        logging.info('Entity exists, but not yet verified')
                else:
                    # New entity. Store it and enqueue verification task, transactionally.
                    task_queue_host = utils.get_task_queue_host()

                    def run():
                        entry.put()
                        taskqueue.add(url='/internal/taskqueue/verify/%s' %
                                      entry.key.id(),
                                      queue_name='verify',
                                      headers={'Host': task_queue_host},
                                      transactional=True)

                    datastore_utils.transaction(run)

            # TODO(vadimsh): Fill in details about the entry, such as expiration time.
            self.send_json({'entry': {}})

            # Log stats.
            if entry.content is not None:
                where = 'inline'
            else:
                where = 'GS; %s' % entry.key.id()
            stats.add_entry(stats.STORE, entry.compressed_size, where)

        finally:
            # Do not keep dangling futures. Note that error here is ignored,
            # memcache is just an optimization.
            if memcache_store_future:
                memcache_store_future.wait()
Example #19
0
def entry_key_or_error(namespace, digest):
  try:
    return model.entry_key(namespace, digest)
  except ValueError as error:
    raise endpoints.BadRequestException(error.message)
Example #20
0
  def post(self, namespace, hash_key):
    entry = model.entry_key(namespace, hash_key).get()
    if not entry:
      logging.error('Failed to find entity')
      return
    if entry.is_verified:
      logging.warning('Was already verified')
      return
    if entry.content is not None:
      logging.error('Should not be called with inline content')
      return

    # Get GS file size.
    gs_bucket = config.settings().gs_bucket
    gs_file_info = gcs.get_file_info(gs_bucket, entry.key.id())

    # It's None if file is missing.
    if not gs_file_info:
      # According to the docs, GS is read-after-write consistent, so a file
      # is missing only if it wasn't stored at all or it was deleted; in
      # either case it's not a valid ContentEntry.
      self.purge_entry(entry, 'No such GS file')
      return

    # Expected stored length and actual length should match.
    if gs_file_info.size != entry.compressed_size:
      self.purge_entry(entry,
          'Bad GS file: expected size is %d, actual size is %d',
          entry.compressed_size, gs_file_info.size)
      return

    save_to_memcache = (
        entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED and
        entry.is_isolated)
    expanded_size = 0
    digest = model.get_hash_algo(namespace)
    data = None

    try:
      # Read and process the data in blocks.
      stream = gcs.read_file(gs_bucket, entry.key.id())
      if save_to_memcache:
        # Wraps stream with a generator that accumulates the data.
        stream = Accumulator(stream)

      for data in model.expand_content(namespace, stream):
        expanded_size += len(data)
        digest.update(data)
        # Make sure the data is GC'ed.
        del data

      # Hashes should match.
      if digest.hexdigest() != hash_key:
        self.purge_entry(entry,
            'SHA-1 does not match data (%d bytes, %d bytes expanded)',
            entry.compressed_size, expanded_size)
        return

    except gcs.NotFoundError as e:
      # Somebody deleted a file between get_file_info and read_file calls.
      self.purge_entry(entry, 'File was unexpectedly deleted')
      return
    except (gcs.ForbiddenError, gcs.AuthorizationError) as e:
      # Misconfiguration in Google Storage ACLs. Don't delete the entry; it
      # may be fine. The ACL problem may be fixed before the next retry.
      logging.warning(
          'CloudStorage auth issues (%s): %s', e.__class__.__name__, e)
      # Abort so the job is retried automatically.
      return self.abort(500)
    except (gcs.FatalError, zlib.error, IOError) as e:
      # ForbiddenError and AuthorizationError inherit FatalError, so this except
      # block should be last.
      # It's broken or unreadable.
      self.purge_entry(entry,
          'Failed to read the file (%s): %s', e.__class__.__name__, e)
      return

    # Verified. Data matches the hash.
    entry.expanded_size = expanded_size
    entry.is_verified = True
    future = entry.put_async()
    logging.info(
        '%d bytes (%d bytes expanded) verified',
        entry.compressed_size, expanded_size)
    if save_to_memcache:
      model.save_in_memcache(namespace, hash_key, ''.join(stream.accumulated))
    future.wait()
Example #21
0
def entry_key_or_error(namespace, digest):
    try:
        return model.entry_key(namespace, digest)
    except ValueError as error:
        raise endpoints.BadRequestException(error.message)