def post(self, namespace): """Reads body with items to upload and replies with URLs to upload to.""" if not re.match(r"^%s$" % model.NAMESPACE_RE, namespace): self.send_error('Invalid namespace; allowed keys must pass regexp "%s"' % model.NAMESPACE_RE) # Parse a body into list of EntryInfo objects. try: entries = self.parse_request(self.request.body) except ValueError as err: return self.send_error("Bad /pre-upload request.\n(%s)\n%s" % (err, self.request.body[:200])) # Generate push_urls for missing entries. push_urls = {} existing = [] for entry_info, exists in self.check_entry_infos(entries, namespace): if exists: existing.append(entry_info) else: push_urls[entry_info.digest] = self.generate_push_urls(entry_info, namespace) # Send back the response. self.send_json([push_urls.get(entry_info.digest) for entry_info in entries]) # Log stats, enqueue tagging task that updates last access time. stats.add_entry(stats.LOOKUP, len(entries), len(existing)) if existing: # Ignore errors in a call below. They happen when task queue service has # a bad time and doesn't accept tagging tasks. We don't want isolate # server's reliability to depend on task queue service health. An ignored # error here means there's a chance some entry might be deleted sooner # than it should. self.tag_entries(existing, namespace)
def retrieve(self, request): """Retrieves content from a storage location.""" content = None key = None offset = request.offset if not request.digest: raise endpoints.BadRequestException('digest is required.') if not request.namespace: raise endpoints.BadRequestException('namespace is required.') # Try memcache. memcache_entry = memcache.get(request.digest, namespace='table_%s' % request.namespace.namespace) if memcache_entry is not None: content = memcache_entry found = 'memcache' else: key = entry_key_or_error(request.namespace.namespace, request.digest) stored = key.get() if stored is None: logging.debug('%s', request.digest) raise endpoints.NotFoundException( 'Unable to retrieve the entry.') content = stored.content # will be None if entity is in GCS found = 'inline' # Return and log stats here if something has been found. if content is not None: # make sure that offset is acceptable logging.debug('%s', request.digest) if offset < 0 or offset > len(content): raise endpoints.BadRequestException( 'Invalid offset %d. Offset must be between 0 and content length.' % offset) stats.add_entry(stats.RETURN, len(content) - offset, found) return RetrievedContent(content=content[offset:]) # The data is in GS; log stats and return the URL. if offset < 0 or offset > stored.compressed_size: logging.debug('%s', request.digest) raise endpoints.BadRequestException( 'Invalid offset %d. Offset must be between 0 and content length.' % offset) metrics.file_size(stored.compressed_size - offset) stats.add_entry(stats.RETURN, stored.compressed_size - offset, 'GS; %s' % stored.key.id()) return RetrievedContent(url=self.gs_url_signer.get_download_url( filename=key.id(), expiration=DEFAULT_LINK_EXPIRATION))
def retrieve(self, request): """Retrieves content from a storage location.""" content = None key = None offset = request.offset # try the memcache memcache_entry = memcache.get( request.digest, namespace='table_%s' % request.namespace.namespace) if memcache_entry is not None: content = memcache_entry found = 'memcache' else: key = entry_key_or_error(request.namespace.namespace, request.digest) stored = key.get() if stored is None: logging.debug('%s', request.digest) raise endpoints.NotFoundException('Unable to retrieve the entry.') content = stored.content # will be None if entity is in GCS found = 'inline' # Return and log stats here if something has been found. if content is not None: # make sure that offset is acceptable logging.debug('%s', request.digest) if offset < 0 or offset > len(content): raise endpoints.BadRequestException( 'Invalid offset %d. Offset must be between 0 and content length.' % offset) stats.add_entry(stats.RETURN, len(content) - offset, found) return RetrievedContent(content=content[offset:]) # The data is in GS; log stats and return the URL. if offset < 0 or offset > stored.compressed_size: logging.debug('%s', request.digest) raise endpoints.BadRequestException( 'Invalid offset %d. Offset must be between 0 and content length.' % offset) stats.add_entry( stats.RETURN, stored.compressed_size - offset, 'GS; %s' % stored.key.id()) return RetrievedContent(url=self.gs_url_signer.get_download_url( filename=key.id(), expiration=DEFAULT_LINK_EXPIRATION))
def post(self, namespace): """Reads body with items to upload and replies with URLs to upload to.""" logging.error('Unexpected old client') if not re.match(r'^%s$' % model.NAMESPACE_RE, namespace): self.send_error( 'Invalid namespace; allowed keys must pass regexp "%s"' % model.NAMESPACE_RE) # Parse a body into list of EntryInfo objects. try: entries = self.parse_request(self.request.body) except ValueError as err: return self.send_error('Bad /pre-upload request.\n(%s)\n%s' % (err, self.request.body[:200])) # Generate push_urls for missing entries. push_urls = {} existing = [] for entry_info, exists in self.check_entry_infos(entries, namespace): if exists: existing.append(entry_info) else: push_urls[entry_info.digest] = self.generate_push_urls( entry_info, namespace) # Send back the response. self.send_json( [push_urls.get(entry_info.digest) for entry_info in entries]) # Log stats, enqueue tagging task that updates last access time. stats.add_entry(stats.LOOKUP, len(entries), len(existing)) if existing: # Ignore errors in a call below. They happen when task queue service has # a bad time and doesn't accept tagging tasks. We don't want isolate # server's reliability to depend on task queue service health. An ignored # error here means there's a chance some entry might be deleted sooner # than it should. self.tag_entries(existing, namespace)
def get(self, namespace, hash_key):  # pylint: disable=W0221
  logging.error('Unexpected old client')
  # Parse 'Range' header if it's present to extract initial offset.
  # Only support single continuous range from some |offset| to the end.
  offset = 0
  range_header = self.request.headers.get('range')
  if range_header:
    match = re.match(r'bytes=(\d+)-', range_header)
    if not match:
      return self.send_error(
          'Unsupported byte range.\n\'%s\'.' % range_header, http_code=416)
    offset = int(match.group(1))

  memcache_entry = memcache.get(hash_key, namespace='table_%s' % namespace)
  if memcache_entry is not None:
    self.send_data(memcache_entry, filename=hash_key, offset=offset)
    stats.add_entry(stats.RETURN, len(memcache_entry) - offset, 'memcache')
    return

  entry = model.get_entry_key(namespace, hash_key).get()
  if not entry:
    return self.send_error('Unable to retrieve the entry.', http_code=404)

  if entry.content is not None:
    self.send_data(entry.content, filename=hash_key, offset=offset)
    stats.add_entry(stats.RETURN, len(entry.content) - offset, 'inline')
    return

  # Generate signed download URL.
  settings = config.settings()
  # TODO(maruel): The GS object may not exist anymore. Handle this.
  signer = gcs.URLSigner(settings.gs_bucket, settings.gs_client_id_email,
                         settings.gs_private_key)
  # The entry key is the GS filepath.
  signed_url = signer.get_download_url(entry.key.id())

  # Redirect client to this URL. If 'Range' header is used, client will
  # correctly pass it to Google Storage to fetch only subrange of file,
  # so update stats accordingly.
  self.redirect(signed_url)
  stats.add_entry(
      stats.RETURN, entry.compressed_size - offset, 'GS; %s' % entry.key.id())

def storage_helper(request, uploaded_to_gs):
  """Implement shared logic between store_inline and finalize_gs.

  Arguments:
    request: either StorageRequest or FinalizeRequest.
    uploaded_to_gs: bool.
  """
  if not request.upload_ticket:
    raise endpoints.BadRequestException(
        'Upload ticket was empty or not provided.')
  try:
    embedded = TokenSigner.validate(
        request.upload_ticket, UPLOAD_MESSAGES[uploaded_to_gs])
  except (auth.InvalidTokenError, ValueError) as error:
    raise endpoints.BadRequestException(
        'Ticket validation failed: %s' % error.message)

  digest = embedded['d'].encode('utf-8')
  is_isolated = bool(int(embedded['i']))
  namespace = embedded['n']
  size = int(embedded['s'])

  key = entry_key_or_error(namespace, digest)

  if uploaded_to_gs:
    # Ensure that file info is uploaded to GS first.
    file_info = gcs.get_file_info(config.settings().gs_bucket, key.id())
    if not file_info:
      logging.debug('%s', digest)
      raise endpoints.BadRequestException(
          'File should be in Google Storage.\nFile: \'%s\' Size: %d.' %
          (key.id(), size))
    content = None
    compressed_size = file_info.size
  else:
    content = request.content
    compressed_size = len(content)

  # Look if the entity was already stored. Alert in that case but ignore it.
  if key.get():
    # TODO(maruel): Handle these more gracefully.
    logging.warning('Overwriting ContentEntry\n%s', digest)

  entry = model.new_content_entry(
      key=key,
      is_isolated=is_isolated,
      compressed_size=compressed_size,
      expanded_size=size,
      is_verified=not uploaded_to_gs,
      content=content,
  )

  if not uploaded_to_gs:
    # Assert that embedded content is the data sent by the request.
    logging.debug('%s', digest)
    if (digest, size) != hash_content(content, namespace):
      raise endpoints.BadRequestException(
          'Embedded digest does not match provided data: '
          '(digest, size): (%r, %r); expected: %r' %
          (digest, size, hash_content(content, namespace)))
    try:
      entry.put()
    except datastore_errors.Error as e:
      raise endpoints.InternalServerErrorException(
          'Unable to store the entity: %s.' % e.__class__.__name__)
  else:
    # Enqueue verification task transactionally as the entity is stored.
    try:
      store_and_enqueue_verify_task(entry, utils.get_task_queue_host())
    except (datastore_errors.Error,
            runtime.apiproxy_errors.CancelledError,
            runtime.apiproxy_errors.DeadlineExceededError,
            runtime.apiproxy_errors.OverQuotaError,
            runtime.DeadlineExceededError,
            taskqueue.Error) as e:
      raise endpoints.InternalServerErrorException(
          'Unable to store the entity: %s.' % e.__class__.__name__)

  stats.add_entry(
      stats.STORE, entry.compressed_size,
      'GS; %s' % entry.key.id() if uploaded_to_gs else 'inline')
  return PushPing(ok=True)

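# NOTE: hash_content(), used for inline verification in storage_helper() above
# and in the handlers below, is not defined in this section. The sketch that
# follows is an illustrative guess rather than the project's actual helper: it
# assumes namespaces ending in '-gzip' or '-deflate' carry zlib-compressed
# payloads and that digests are SHA-1 hex strings of the expanded data.
import hashlib
import zlib


def hash_content_sketch(content, namespace):
  """Returns (hex_digest, expanded_size) for the raw request body.

  Hypothetical stand-in for hash_content(); the name, the namespace-suffix
  convention and the hash algorithm are all assumptions.
  """
  expanded = content
  if namespace.endswith(('-gzip', '-deflate')):
    # Assumed: compressed namespaces store zlib-format data.
    expanded = zlib.decompress(content)
  return hashlib.sha1(expanded).hexdigest(), len(expanded)
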
def get(self): """Generates fake stats.""" stats.add_entry(stats.DUPE, 1024, 'inline') self.response.write('Yay')
def get(self): """Generates fake stats.""" stats.add_entry(stats.LOOKUP, 200, 103) self.response.write('Yay')
def get(self): """Generates fake stats.""" stats.add_entry(stats.RETURN, 4096, 'memcache') self.response.write('Yay')
def get(self): """Generates fake stats.""" stats.add_entry(stats.STORE, 2048, 'GS; inline') self.response.write('Yay')
def handle(self, namespace, hash_key):
  """Handles this request."""
  # Extract relevant request parameters.
  expiration_ts = self.request.get('x')
  item_size = self.request.get('s')
  is_isolated = self.request.get('i')
  uploaded_to_gs = self.request.get('g')
  signature = self.request.get('sig')

  # Build correct signature.
  expected_sig = self.generate_signature(
      config.settings().global_secret, self.request.method, expiration_ts,
      namespace, hash_key, item_size, is_isolated, uploaded_to_gs)

  # Verify signature is correct.
  if not utils.constant_time_equals(signature, expected_sig):
    return self.send_error('Incorrect signature.')

  # Convert parameters from strings back to something useful.
  # It can't fail since a matching signature means it was us who generated
  # these strings in the first place.
  expiration_ts = int(expiration_ts)
  item_size = int(item_size)
  is_isolated = bool(int(is_isolated))
  uploaded_to_gs = bool(int(uploaded_to_gs))

  # Verify signature is not yet expired.
  if time.time() > expiration_ts:
    return self.send_error('Expired signature.')

  if uploaded_to_gs:
    # GS upload finalization uses empty POST body.
    assert self.request.method == 'POST'
    if self.request.headers.get('content-length'):
      return self.send_error('Expecting empty POST.')
    content = None
  else:
    # Datastore upload uses PUT.
    assert self.request.method == 'PUT'
    if self.request.headers.get('content-length'):
      content = self.request.body
    else:
      content = ''

  # Info about corresponding GS entry (if it exists).
  gs_bucket = config.settings().gs_bucket
  key = model.entry_key(namespace, hash_key)

  # Verify the data while at it since it's already in memory but before
  # storing it in memcache and datastore.
  if content is not None:
    # Verify advertised hash matches the data.
    try:
      hex_digest, expanded_size = hash_content(content, namespace)
      if hex_digest != hash_key:
        raise ValueError(
            'Hash and data do not match, '
            '%d bytes (%d bytes expanded)' % (len(content), expanded_size))
      if expanded_size != item_size:
        raise ValueError(
            'Advertised data length (%d) and actual data length (%d) '
            'do not match' % (item_size, expanded_size))
    except ValueError as err:
      return self.send_error('Inline verification failed.\n%s' % err)
    # Successfully verified!
    compressed_size = len(content)
    needs_verification = False
  else:
    # Fetch size of the stored file.
    file_info = gcs.get_file_info(gs_bucket, key.id())
    if not file_info:
      # TODO(maruel): Do not fail yet. If the request got up to here, the file
      # is likely there but the service may have trouble fetching the metadata
      # from GS.
      return self.send_error(
          'File should be in Google Storage.\nFile: \'%s\' Size: %d.' %
          (key.id(), item_size))
    compressed_size = file_info.size
    needs_verification = True

  # Data is here and it's too large for DS, so put it in GS. It is likely
  # between MIN_SIZE_FOR_GS <= len(content) < MIN_SIZE_FOR_DIRECT_GS.
  if content is not None and len(content) >= MIN_SIZE_FOR_GS:
    if not gcs.write_file(gs_bucket, key.id(), [content]):
      # Returns 503 so the client automatically retries.
      return self.send_error(
          'Unable to save the content to GS.', http_code=503)
    # It's now in GS.
    uploaded_to_gs = True

  # Can create entity now, everything appears to be legit.
  entry = model.new_content_entry(
      key=key,
      is_isolated=is_isolated,
      compressed_size=compressed_size,
      expanded_size=-1 if needs_verification else item_size,
      is_verified=not needs_verification)

  # If it's not in GS then put it inline.
  if not uploaded_to_gs:
    assert content is not None and len(content) < MIN_SIZE_FOR_GS
    entry.content = content

  # Start saving *.isolated into memcache iff its content is available and
  # it's not in Datastore: there's no point in saving inline blobs in memcache
  # because ndb already memcaches them.
  memcache_store_future = None
  if (content is not None and
      entry.content is None and
      entry.is_isolated and
      entry.compressed_size <= model.MAX_MEMCACHE_ISOLATED):
    memcache_store_future = model.save_in_memcache(
        namespace, hash_key, content, async=True)

  try:
    # If entry was already verified above (i.e. it is a small inline entry),
    # store it right away, possibly overriding existing entity. Most of
    # the time it is a new entry anyway (since clients try to upload only
    # new entries).
    if not needs_verification:
      entry.put()
    else:
      # For large entries (that require expensive verification) be more
      # careful and check that it is indeed a new entity. No need to do it in
      # a transaction: a race condition would only lead to a redundant
      # verification task being enqueued, no big deal.
      existing = entry.key.get()
      if existing:
        if existing.is_verified:
          logging.info('Entity exists and already verified')
        else:
          logging.info('Entity exists, but not yet verified')
      else:
        # New entity. Store it and enqueue verification task, transactionally.
        task_queue_host = utils.get_task_queue_host()
        def run():
          entry.put()
          taskqueue.add(
              url='/internal/taskqueue/verify/%s' % entry.key.id(),
              queue_name='verify',
              headers={'Host': task_queue_host},
              transactional=True)
        datastore_utils.transaction(run)

    # TODO(vadimsh): Fill in details about the entry, such as expiration time.
    self.send_json({'entry': {}})

    # Log stats.
    where = (
        'inline' if entry.content is not None else 'GS; %s' % entry.key.id())
    stats.add_entry(stats.STORE, entry.compressed_size, where)
  finally:
    # Do not keep dangling futures. Note that error here is ignored,
    # memcache is just an optimization.
    if memcache_store_future:
      memcache_store_future.wait()

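# NOTE: store_and_enqueue_verify_task(), called by the storage_helper variants
# in this section, is not shown here. A minimal sketch, assuming it wraps the
# same transactional put-plus-taskqueue pattern used inline in handle() above
# (the queue name and URL are taken from that code, not from the helper's real
# source), and that taskqueue and datastore_utils are imported as elsewhere in
# this section:
def store_and_enqueue_verify_task_sketch(entry, task_queue_host):
  """Stores `entry` and enqueues its verification task in one transaction."""
  def run():
    entry.put()
    taskqueue.add(
        url='/internal/taskqueue/verify/%s' % entry.key.id(),
        queue_name='verify',
        headers={'Host': task_queue_host},
        transactional=True)
  datastore_utils.transaction(run)
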
def storage_helper(request, uploaded_to_gs):
  """Implement shared logic between store_inline and finalize_gs.

  Arguments:
    request: either StorageRequest or FinalizeRequest.
    uploaded_to_gs: bool.
  """
  if not request.upload_ticket:
    raise endpoints.BadRequestException(
        'Upload ticket was empty or not provided.')
  try:
    embedded = TokenSigner.validate(
        request.upload_ticket, UPLOAD_MESSAGES[uploaded_to_gs])
  except (auth.InvalidTokenError, ValueError) as error:
    raise endpoints.BadRequestException(
        'Ticket validation failed: %s' % error.message)

  digest = embedded['d'].encode('utf-8')
  is_isolated = bool(int(embedded['i']))
  namespace = embedded['n']
  size = int(embedded['s'])

  key = entry_key_or_error(namespace, digest)

  if uploaded_to_gs:
    # Ensure that file info is uploaded to GS first.
    file_info = gcs.get_file_info(config.settings().gs_bucket, key.id())
    if not file_info:
      logging.debug('%s', digest)
      raise endpoints.BadRequestException(
          'File should be in Google Storage.\nFile: \'%s\' Size: %d.' % (
              key.id(), size))
    content = None
    compressed_size = file_info.size
  else:
    content = request.content
    compressed_size = len(content)

  # Look if the entity was already stored. Alert in that case but ignore it.
  if key.get():
    # TODO(maruel): Handle these more gracefully.
    logging.warning('Overwriting ContentEntry\n%s', digest)

  entry = model.new_content_entry(
      key=key,
      is_isolated=is_isolated,
      compressed_size=compressed_size,
      expanded_size=size,
      is_verified=not uploaded_to_gs,
      content=content,
  )

  if not uploaded_to_gs:
    # Assert that embedded content is the data sent by the request.
    logging.debug('%s', digest)
    if (digest, size) != hash_content(content, namespace):
      raise endpoints.BadRequestException(
          'Embedded digest does not match provided data: '
          '(digest, size): (%r, %r); expected: %r' % (
              digest, size, hash_content(content, namespace)))
    entry.put()
  else:
    # Enqueue verification task transactionally as the entity is stored.
    try:
      store_and_enqueue_verify_task(entry, utils.get_task_queue_host())
    except (datastore_errors.Error,
            runtime.apiproxy_errors.CancelledError,
            runtime.apiproxy_errors.DeadlineExceededError,
            runtime.apiproxy_errors.OverQuotaError,
            runtime.DeadlineExceededError,
            taskqueue.Error) as e:
      raise endpoints.InternalServerErrorException(
          'Unable to store the entity: %s.' % e.__class__.__name__)

  stats.add_entry(
      stats.STORE, entry.compressed_size,
      'GS; %s' % entry.key.id() if uploaded_to_gs else 'inline')
  return PushPing(ok=True)

def preupload(self, request):
  """Checks for entry's existence and generates upload URLs.

  Arguments:
    request: the DigestCollection to be posted

  Returns:
    the UrlCollection corresponding to the uploaded digests

  The response list is commensurate to the request's; each UrlMessage has
    * if an entry is missing: two URLs: the URL to upload a file to and the
      URL to call when the upload is done (can be null).
    * if the entry is already present: null URLs ('').

  UrlCollection([
      UrlMessage(
        upload_url = "<upload url>"
        finalize_url = "<finalize url>"
      )
      UrlMessage(
        upload_url = '')
      ...
      ])
  """
  response = UrlCollection(items=[])

  # check for namespace error
  if not re.match(r'^%s$' % model.NAMESPACE_RE, request.namespace.namespace):
    raise endpoints.BadRequestException(
        'Invalid namespace; allowed keys must pass regexp "%s"' %
        model.NAMESPACE_RE)

  if len(request.items) > 1000:
    raise endpoints.BadRequestException(
        'Only up to 1000 items can be looked up at once')

  # check for existing elements
  new_digests, existing_digests = self.partition_collection(request)

  # process all elements; add an upload ticket for cache misses
  for index, digest_element in enumerate(request.items):
    # check for error conditions
    if not model.is_valid_hex(digest_element.digest):
      raise endpoints.BadRequestException(
          'Invalid hex code: %s' % (digest_element.digest))

    if digest_element in new_digests:
      # generate preupload ticket
      status = PreuploadStatus(
          index=index,
          upload_ticket=self.generate_ticket(
              digest_element, request.namespace))

      # generate GS upload URL if necessary
      if self.should_push_to_gs(digest_element):
        key = entry_key_or_error(
            request.namespace.namespace, digest_element.digest)
        status.gs_upload_url = self.gs_url_signer.get_upload_url(
            filename=key.id(),
            content_type='application/octet-stream',
            expiration=DEFAULT_LINK_EXPIRATION)

      response.items.append(status)

  # Tag existing entities and collect stats.
  self.tag_existing(DigestCollection(
      items=list(existing_digests), namespace=request.namespace))
  stats.add_entry(stats.LOOKUP, len(request.items), len(existing_digests))
  return response

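# NOTE: an illustrative client-side sketch of the preupload flow implied by the
# handler above; it is not part of the server code. `store_inline` and
# `finalize_gs` are caller-supplied callables standing in for the two endpoints
# that consume the upload_ticket (their wire format is not shown here), and the
# GS upload uses a plain HTTP PUT via the requests library.
import requests


def upload_missing_sketch(preupload_response, blobs_by_index,
                          store_inline, finalize_gs):
  """Pushes every item the server reported as missing.

  blobs_by_index: dict mapping each request index to its compressed payload.
  """
  for status in preupload_response.items:
    payload = blobs_by_index[status.index]
    if status.gs_upload_url:
      # Large item: PUT the bytes straight to the signed GS URL, then finalize.
      requests.put(
          status.gs_upload_url, data=payload,
          headers={'Content-Type': 'application/octet-stream'})
      finalize_gs(upload_ticket=status.upload_ticket)
    else:
      # Small item: send the bytes inline together with the ticket.
      store_inline(upload_ticket=status.upload_ticket, content=payload)
  # Items absent from preupload_response.items were already on the server.
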
def storage_helper(self, request, uploaded_to_gs):
  """Implement shared logic between store_inline and finalize_gs."""
  # validate token or error out
  if not request.upload_ticket:
    raise endpoints.BadRequestException(
        'Upload ticket was empty or not provided.')
  try:
    embedded = TokenSigner.validate(
        request.upload_ticket, UPLOAD_MESSAGES[uploaded_to_gs])
  except (auth.InvalidTokenError, ValueError) as error:
    raise endpoints.BadRequestException(
        'Ticket validation failed: %s' % error.message)

  # read data and convert types
  digest = embedded['d'].encode('utf-8')
  is_isolated = bool(int(embedded['i']))
  namespace = embedded['n']
  size = int(embedded['s'])

  # create a key
  key = entry_key_or_error(namespace, digest)

  # get content and compressed size
  if uploaded_to_gs:
    # ensure that file info is uploaded to GS first
    # TODO(cmassaro): address analogous TODO from handlers_api
    file_info = gcs.get_file_info(config.settings().gs_bucket, key.id())
    if not file_info:
      raise endpoints.BadRequestException(
          'File should be in Google Storage.\nFile: \'%s\' Size: %d.' % (
              key.id(), size))
    content = None
    compressed_size = file_info.size
  else:
    content = request.content
    compressed_size = len(content)

  # all is well; create an entry
  entry = model.new_content_entry(
      key=key,
      is_isolated=is_isolated,
      compressed_size=compressed_size,
      expanded_size=size,
      is_verified=not uploaded_to_gs,
      content=content,
  )

  # DB: assert that embedded content is the data sent by the request
  if not uploaded_to_gs:
    if (digest, size) != hash_content(content, namespace):
      raise endpoints.BadRequestException(
          'Embedded digest does not match provided data: '
          '(digest, size): (%r, %r); expected: %r' % (
              digest, size, hash_content(content, namespace)))
    entry.put()

  # GCS: enqueue verification task
  else:
    try:
      store_and_enqueue_verify_task(entry, utils.get_task_queue_host())
    except (datastore_errors.Error,
            runtime.apiproxy_errors.CancelledError,
            runtime.apiproxy_errors.DeadlineExceededError,
            runtime.apiproxy_errors.OverQuotaError,
            runtime.DeadlineExceededError,
            taskqueue.Error) as e:
      raise endpoints.InternalServerErrorException(
          'Unable to store the entity: %s.' % e.__class__.__name__)

  stats.add_entry(
      stats.STORE, entry.compressed_size,
      'GS; %s' % entry.key.id() if uploaded_to_gs else 'inline')
  return PushPing(ok=True)

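# NOTE: entry_key_or_error(), used throughout the endpoints code above, is not
# defined in this section. A minimal sketch under the assumption that it simply
# validates the digest with model.is_valid_hex() and then builds the Datastore
# key with model.get_entry_key(), mirroring the checks done in preupload():
def entry_key_or_error_sketch(namespace, digest):
  """Returns the ContentEntry key for (namespace, digest) or raises a 400."""
  if not model.is_valid_hex(digest):
    raise endpoints.BadRequestException('Invalid hex code: %s' % digest)
  return model.get_entry_key(namespace, digest)
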