def process_keys(self):
    """Remove keys from the checkout bucket."""
    for _key in self.keys:
        if DSS_BUNDLE_KEY_REGEX.match(_key):
            for key in self.handle.list(self.checkout_bucket, _key):  # handles checkout/bundle/*
                self._verify_delete(self.handle, self.checkout_bucket, key)
            uuid, version = self._parse_key(_key)
            manifest = get_bundle_manifest(replica=self.replica, uuid=uuid, version=version)
            if manifest is None:
                sys.stderr.write(f"Unable to locate manifest for: {self.checkout_bucket}/{_key}")
                continue
            for _files in manifest['files']:
                key = compose_blob_key(_files)
                self._verify_delete(self.handle, self.checkout_bucket, key)
        elif _key.startswith(FILE_PREFIX):
            # should handle other keys, files/blobs
            file_metadata = self._get_metadata(self.handle, self.replica.bucket, _key)
            self._verify_delete(self.handle, self.checkout_bucket, key=compose_blob_key(file_metadata))
        else:
            sys.stderr.write(f'Invalid key regex: {_key}')
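
Every snippet in this section builds blob keys with compose_blob_key, whose definition is not included here. For reference, below is a minimal sketch of what it presumably does, assuming a blob key is the four file checksums joined under a blobs/ prefix (the field names mirror the FileMetadata.SHA256/SHA1/S3_ETAG/CRC32C values used in the test data further down); treat the exact layout as an assumption, not a confirmed implementation.

import typing

def compose_blob_key_sketch(file_info: typing.Mapping[str, str]) -> str:
    # Assumed layout: "blobs/<sha256>.<sha1>.<s3-etag>.<crc32c>"; illustrative only.
    return "blobs/" + ".".join((file_info["sha256"],
                                file_info["sha1"],
                                file_info["s3-etag"],
                                file_info["crc32c"]))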
def resolve_content_item(replica: Replica, blobstore_handle: BlobStore, item: dict):
    try:
        if item["type"] in {"file", "bundle", "collection"}:
            item_metadata = get_json_metadata(item["type"], item["uuid"], item["version"], replica,
                                              blobstore_handle)
        else:
            item_metadata = get_json_metadata("file", item["uuid"], item["version"], replica, blobstore_handle)
            if "fragment" not in item:
                raise Exception('The "fragment" field is required in collection elements '
                                'other than files, bundles, and collections')
            blob_path = compose_blob_key(item_metadata)
            # check that item is marked as metadata, is json, and is less than max size
            item_doc = json.loads(blobstore_handle.get(replica.bucket, blob_path))
            item_content = jsonpointer.resolve_pointer(item_doc, item["fragment"])
            return item_content
    except DSSException:
        raise
    except Exception as e:
        raise DSSException(requests.codes.unprocessable_entity,
                           "invalid_link",
                           'Error while parsing the link "{}": {}: {}'.format(item, type(e).__name__, e))
def verify_file_replication(src_handle, dst_handle, src_bucket, dst_bucket, key):
    """
    Return list of ReplicationAnomaly for files+blobs
    """
    anomalies = list()
    try:
        file_metadata = json.loads(src_handle.get(src_bucket, key))
    except BlobNotFoundError:
        anomalies.append(ReplicationAnomaly(key=key, anomaly="missing on source replica"))
    else:
        try:
            target_file_metadata = json.loads(dst_handle.get(dst_bucket, key))
        except BlobNotFoundError:
            anomalies.append(ReplicationAnomaly(key=key, anomaly="missing on target replica"))
        else:
            if file_metadata != target_file_metadata:
                anomalies.append(ReplicationAnomaly(key=key, anomaly="file metadata mismatch"))
        blob_key = compose_blob_key(file_metadata)
        anomalies.extend(verify_blob_replication(src_handle, dst_handle, src_bucket, dst_bucket, blob_key))
    return anomalies
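
verify_file_replication defers blob-level checks to verify_blob_replication, which is not shown in this section. A minimal sketch follows, assuming it only compares existence and size between the two buckets and reusing ReplicationAnomaly and BlobNotFoundError from the snippet above; the anomaly wording and the use of the blobstore get_size call are assumptions.

def verify_blob_replication_sketch(src_handle, dst_handle, src_bucket, dst_bucket, key):
    """Return a list of ReplicationAnomaly for a single blob key (illustrative sketch)."""
    anomalies = list()
    size = src_handle.get_size(src_bucket, key)
    try:
        target_size = dst_handle.get_size(dst_bucket, key)
    except BlobNotFoundError:
        anomalies.append(ReplicationAnomaly(key=key, anomaly="missing on target replica"))
    else:
        if size != target_size:
            anomalies.append(ReplicationAnomaly(key=key, anomaly=f"blob size mismatch: {size} != {target_size}"))
    return anomalies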
def process_keys(self):
    for _key in self.keys:
        if DSS_BUNDLE_KEY_REGEX.match(_key):
            uuid, version = self._parse_key(_key)
            bundle_manifest = get_bundle_manifest(uuid=uuid, replica=self.replica, version=version)
            self._sleepy_checkout(bundle_checkout, bundle_uuid=uuid, bundle_version=version)
            for _files in bundle_manifest['files']:
                blob_path = compose_blob_key(_files)
                self._sleepy_checkout(file_checkout, file_metadata=_files, blob_path=blob_path)
        elif _key.startswith(FILE_PREFIX):
            # the file manifest is stored as JSON; parse it before composing the blob key
            file_metadata = json.loads(self.handle.get(self.replica.bucket, _key))
            blob_path = compose_blob_key(file_metadata)
            self._sleepy_checkout(file_checkout, file_metadata=file_metadata, blob_path=blob_path)
        else:
            sys.stderr.write(f'Invalid key regex: {_key}')
def get_manifest_files(replica: Replica, src_bucket: str, bundle_uuid: str, bundle_version: str):
    bundle_manifest = get_bundle_manifest(bundle_uuid, replica, bundle_version, bucket=src_bucket)
    files = bundle_manifest[BundleMetadata.FILES]
    dst_bundle_prefix = get_dst_bundle_prefix(bundle_uuid, bundle_manifest[BundleMetadata.VERSION])
    for file_metadata in files:
        dst_key = "{}/{}".format(dst_bundle_prefix, file_metadata.get(BundleFileMetadata.NAME))
        src_key = compose_blob_key(file_metadata)
        yield src_key, dst_key
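
get_dst_bundle_prefix is also external to this section. Judging from the bundles/{uuid}.{version} prefix listed by the verification snippet further down, it presumably amounts to something like the sketch below; the format is inferred from that usage, not from its real definition.

def get_dst_bundle_prefix_sketch(bundle_uuid: str, bundle_version: str) -> str:
    # Assumed checkout layout, matching the f'bundles/{uuid}.{version}' prefix used elsewhere in this section.
    return f"bundles/{bundle_uuid}.{bundle_version}"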
def process_key(self, _key):
    if self._is_file_tombstoned(_key):
        return  # skip if tombstoned
    file_metadata = self._get_metadata(self.handle, self.replica.bucket, _key)
    if not file_metadata:
        return  # skip if missing metadata (edge case where the file was deleted before we got here)
    # check if file meets cache criteria
    if cache_flow.should_cache_file(file_metadata['content-type'], file_metadata['size']):
        blob_key = compose_blob_key(file_metadata)
        checked_out = self._verify_blob_existance(self.handle, self.replica.checkout_bucket, blob_key)
        if not checked_out:
            print(f'Checking out: {_key}')
            start_file_checkout(replica=self.replica, blob_key=blob_key)
            assert self._verify_blob_existance(self.handle, self.replica.checkout_bucket, blob_key)
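
The cache gate above goes through cache_flow.should_cache_file, whose body is not part of this section. A heavily hedged sketch, assuming it only checks the content type against a set of cacheable types and enforces a maximum size; CACHED_CONTENT_TYPES and MAX_CACHED_FILE_SIZE are hypothetical names introduced here for illustration.

# Hypothetical configuration; the real module presumably reads these from deployment settings.
CACHED_CONTENT_TYPES = ("application/json", "text/plain")
MAX_CACHED_FILE_SIZE = 64 * 1024 * 1024  # 64 MiB, assumed

def should_cache_file_sketch(content_type: str, size: int) -> bool:
    # Cache only files whose content type matches a cacheable prefix and whose size is under the limit.
    return size <= MAX_CACHED_FILE_SIZE and content_type.startswith(CACHED_CONTENT_TYPES)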
def _read_file_infos(cls, replica: Replica, fqid: BundleFQID, manifest: JSON) -> List[Tuple[str, JSON]]:
    handle = Config.get_blobstore_handle(replica)
    index_files: List[Tuple[str, JSON]] = list()
    file_infos = manifest[BundleMetadata.FILES]
    assert isinstance(file_infos, list)
    for file_info in file_infos:
        if file_info[BundleFileMetadata.INDEXED]:
            file_name = file_info[BundleFileMetadata.NAME]
            content_type = file_info[BundleFileMetadata.CONTENT_TYPE]
            if content_type.startswith('application/json'):
                file_blob_key = compose_blob_key(file_info)
                try:
                    file_string = handle.get(replica.bucket, file_blob_key).decode("utf-8")
                except BlobStoreError as ex:
                    raise RuntimeError(f"{ex} This bundle will not be indexed. Bundle: {fqid}, File Blob Key: "
                                       f"{file_blob_key}, File Name: '{file_name}'") from ex
                try:
                    file_json = json.loads(file_string)
                    # TODO (mbaumann) Are there other JSON-related exceptions that should be checked below?
                except json.decoder.JSONDecodeError as ex:
                    logger.warning(f"In bundle {fqid} the file '{file_name}' is marked for indexing yet could "
                                   f"not be parsed. This file will not be indexed. Exception: {ex}")
                else:
                    logger.debug(f"Loaded file: {file_name}")
                    index_files.append((file_name, file_json))
            else:
                logger.warning(f"In bundle {fqid} the file '{file_name}' is marked for indexing yet has "
                               f"content type '{content_type}' instead of the required content type "
                               f"'application/json'. This file will not be indexed.")
    return index_files
def _test_file_get_checkout(self, replica: Replica, scheme: str, test_bucket: str, uploader: Uploader):
    handle = Config.get_blobstore_handle(replica)
    src_key = generate_test_key()
    src_data = os.urandom(1024)
    source_url = f"{scheme}://{test_bucket}/{src_key}"
    file_uuid = str(uuid.uuid4())
    bundle_uuid = str(uuid.uuid4())
    version = datetime_to_version_format(datetime.datetime.utcnow())

    # write dummy file and upload to upload area
    with tempfile.NamedTemporaryFile(delete=True) as fh:
        fh.write(src_data)
        fh.flush()
        uploader.checksum_and_upload_file(fh.name, src_key, "text/plain")

    # upload file to DSS
    self.upload_file(source_url, file_uuid, bundle_uuid=bundle_uuid, version=version)
    url = str(UrlBuilder().set(path="/v1/files/" + file_uuid)
              .add_query("replica", replica.name)
              .add_query("version", version))

    # get uploaded blob key
    file_metadata = json.loads(handle.get(test_bucket, f"files/{file_uuid}.{version}").decode("utf-8"))
    file_key = compose_blob_key(file_metadata)

    @eventually(20, 1)
    def test_checkout():
        # assert 302 and verify checksum on checkout completion
        api_get = self.assertGetResponse(url, requests.codes.found, headers=get_auth_header(),
                                         redirect_follow_retries=0)
        file_get = requests.get(api_get.response.headers['Location'])
        self.assertTrue(file_get.ok)
        self.assertEquals(file_get.content, src_data)

    with self.subTest(f"{replica}: Initiates checkout and returns 301 for GET on 'uncheckedout' file."):
        # assert 301 redirect on first GET
        self.assertGetResponse(url, requests.codes.moved, headers=get_auth_header(), redirect_follow_retries=0)
        test_checkout()

    with self.subTest(f"{replica}: Initiates checkout and returns 301 for GET on nearly expired checkout file."):
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date_fn = ("cloud_blobstore.s3.S3BlobStore.get_creation_date"
                            if replica.name == "aws"
                            else "cloud_blobstore.gs.GSBlobStore.get_creation_date")
        with mock.patch(creation_date_fn) as mock_creation_date:
            blob_ttl_days = int(os.environ['DSS_BLOB_TTL_DAYS'])
            mock_creation_date.return_value = now - datetime.timedelta(days=blob_ttl_days, hours=1, minutes=5)
            self.assertGetResponse(url, requests.codes.moved, headers=get_auth_header(),
                                   redirect_follow_retries=0)
        test_checkout()

    with self.subTest(f"{replica}: Initiates checkout and returns 302 immediately for GET on stale checkout file."):
        now = datetime.datetime.now(datetime.timezone.utc)
        creation_date = handle.get_creation_date(replica.checkout_bucket, file_key)
        creation_date_fn = ("cloud_blobstore.s3.S3BlobStore.get_creation_date"
                            if replica.name == "aws"
                            else "cloud_blobstore.gs.GSBlobStore.get_creation_date")
        with mock.patch(creation_date_fn) as mock_creation_date:
            # assert 302 found on stale file and that last modified refreshes
            blob_ttl_days = int(os.environ['DSS_BLOB_PUBLIC_TTL_DAYS'])
            mock_creation_date.return_value = now - datetime.timedelta(days=blob_ttl_days + 1)
            self.assertGetResponse(url, requests.codes.found, headers=get_auth_header(),
                                   redirect_follow_retries=0)
        self.assertTrue(creation_date > handle.get_creation_date(replica.checkout_bucket, file_key),
                        f'\ncurr_creation_date: {creation_date}'
                        f'\nprev_creation_date: {handle.get_creation_date(replica.checkout_bucket, file_key)}')

    handle.delete(test_bucket, f"files/{file_uuid}.{version}")
    handle.delete(replica.checkout_bucket, file_key)
def dependencies_exist(source_replica: Replica, dest_replica: Replica, key: str):
    """
    Given a source replica and manifest key, checks if all dependencies of the corresponding
    DSS object are present in dest_replica:
     - Given a file manifest key, checks if blobs exist in dest_replica.
     - Given a bundle manifest key, checks if file manifests exist in dest_replica.
     - Given a collection key, checks if all collection contents exist in dest_replica.
    Returns true if all dependencies exist in dest_replica, false otherwise.
    """
    source_handle = Config.get_blobstore_handle(source_replica)
    dest_handle = Config.get_blobstore_handle(dest_replica)
    if key.endswith(TOMBSTONE_SUFFIX):
        return True
    elif key.startswith(FILE_PREFIX):
        file_id = FileFQID.from_key(key)
        file_manifest = get_json_metadata(entity_type="file",
                                          uuid=file_id.uuid,
                                          version=file_id.version,
                                          replica=source_replica,
                                          blobstore_handle=source_handle,
                                          max_metadata_size=max_syncable_metadata_size)
        blob_path = compose_blob_key(file_manifest)
        if exists(dest_replica, blob_path):
            return True
    elif key.startswith(BUNDLE_PREFIX):
        # head all file manifests
        bundle_id = BundleFQID.from_key(key)
        bundle_manifest = get_json_metadata(entity_type="bundle",
                                            uuid=bundle_id.uuid,
                                            version=bundle_id.version,
                                            replica=source_replica,
                                            blobstore_handle=source_handle,
                                            max_metadata_size=max_syncable_metadata_size)
        try:
            with ThreadPoolExecutor(max_workers=20) as e:
                futures = list()
                for file in bundle_manifest[BundleMetadata.FILES]:
                    file_uuid = file[BundleFileMetadata.UUID]
                    file_version = file[BundleFileMetadata.VERSION]
                    # look up each file manifest in the destination replica, using the destination
                    # blobstore handle to match dest_replica
                    futures.append(e.submit(get_json_metadata,
                                            entity_type="file",
                                            uuid=file_uuid,
                                            version=file_version,
                                            replica=dest_replica,
                                            blobstore_handle=dest_handle,
                                            max_metadata_size=max_syncable_metadata_size))
                for future in as_completed(futures):
                    future.result()
            return True
        except Exception:
            pass
    elif key.startswith(COLLECTION_PREFIX):
        collection_id = CollectionFQID.from_key(key)
        collection_manifest = get_json_metadata(entity_type="collection",
                                                uuid=collection_id.uuid,
                                                version=collection_id.version,
                                                replica=source_replica,
                                                blobstore_handle=source_handle,
                                                max_metadata_size=max_syncable_metadata_size)
        try:
            verify_collection(contents=collection_manifest["contents"],
                              replica=dest_replica,
                              blobstore_handle=dest_handle)
            return True
        except Exception:
            pass
    else:
        raise NotImplementedError("Unknown prefix for key {}".format(key))
    return False
def test_dependencies_exist(self):
    file_uuid, file_version = str(uuid.uuid4()), get_version()
    bundle_uuid, bundle_version = str(uuid.uuid4()), get_version()
    collection_data = {
        "contents": [
            {"type": "bundle", "uuid": bundle_uuid, "version": bundle_version},
            {"type": "file", "uuid": file_uuid, "version": file_version},
        ]
    }
    bundle_data = {
        BundleMetadata.FILES: [
            {BundleFileMetadata.UUID: file_uuid, BundleFileMetadata.VERSION: file_version},
        ]
    }
    file_data = {
        FileMetadata.SHA256: "sync_test",
        FileMetadata.SHA1: "sync_test",
        FileMetadata.S3_ETAG: "sync_test",
        FileMetadata.CRC32C: str(uuid.uuid4()),
    }
    with self.subTest("collection without deps"):
        collection_key = "{}/{}".format(COLLECTION_PREFIX, get_collection_fqid())
        collection_blob = self.s3_bucket.Object(collection_key)
        collection_blob.put(Body=json.dumps(collection_data).encode())
        self.assertFalse(sync.dependencies_exist(Replica.aws, Replica.aws, collection_key))
    with self.subTest("bundle without deps"):
        bundle_key = "{}/{}".format(BUNDLE_PREFIX, BundleFQID(uuid=bundle_uuid, version=bundle_version))
        bundle_blob = self.s3_bucket.Object(bundle_key)
        bundle_blob.put(Body=json.dumps(bundle_data).encode())
        self.assertFalse(sync.dependencies_exist(Replica.aws, Replica.aws, collection_key))
        self.assertFalse(sync.dependencies_exist(Replica.aws, Replica.aws, bundle_key))
    with self.subTest("file without deps"):
        file_key = "{}/{}".format(FILE_PREFIX, FileFQID(uuid=file_uuid, version=file_version))
        file_blob = self.s3_bucket.Object(file_key)
        file_blob.put(Body=json.dumps(file_data).encode())

        @eventually(timeout=8, interval=1, errors={Exception})
        def check_file_revdeps():
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, collection_key))
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, bundle_key))
            self.assertFalse(sync.dependencies_exist(Replica.aws, Replica.aws, file_key))

        check_file_revdeps()
    with self.subTest("blob presence causes all dependencies to be resolved"):
        blob_key = compose_blob_key(file_data)
        blob_blob = self.s3_bucket.Object(blob_key)
        blob_blob.put(Body=b"sync_test")

        @eventually(timeout=8, interval=1, errors={Exception})
        def check_blob_revdeps():
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, collection_key))
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, bundle_key))
            self.assertTrue(sync.dependencies_exist(Replica.aws, Replica.aws, file_key))

        check_blob_revdeps()
def process_keys(self):
    """Verify that keys are in the checkout bucket."""
    checkout_status = dict(replica=self.replica.name)
    for _key in self.keys:
        if DSS_BUNDLE_KEY_REGEX.match(_key):  # handles bundles/fqid keys or fqid
            uuid, version = self._parse_key(_key)
            bundle_manifest = get_bundle_manifest(replica=self.replica, uuid=uuid, version=version)
            checkout_bundle_contents = [x[0] for x in self.handle.list_v2(bucket=self.checkout_bucket,
                                                                          prefix=f'bundles/{uuid}.{version}')]
            bundle_internal_status = list()
            for _file in bundle_manifest['files']:
                temp = collections.defaultdict(blob_checkout=False, bundle_checkout=False, should_be_cached=False)
                bundle_key = f'bundles/{uuid}.{version}/{_file["name"]}'
                blob_key = compose_blob_key(_file)
                blob_status = self._verify_blob_existance(self.handle, self.checkout_bucket, blob_key)
                if blob_status:
                    temp['blob_checkout'] = True
                if bundle_key in checkout_bundle_contents:
                    temp['bundle_checkout'] = True
                if cache_flow.should_cache_file(_file['content-type'], _file['size']):
                    temp['should_be_cached'] = True
                for x in ['name', 'uuid', 'version']:
                    temp.update({x: _file[x]})
                bundle_internal_status.append(temp)
            checkout_status[_key] = bundle_internal_status
        elif _key.startswith(FILE_PREFIX):
            temp = collections.defaultdict(blob_checkout=False, should_be_cached=False)
            file_metadata = self._get_metadata(self.handle, self.replica.bucket, _key)
            if not file_metadata:
                sys.stderr.write(f'Key not in either main bucket or checkout bucket: {_key}')
                continue
            blob_key = compose_blob_key(file_metadata)
            blob_status = self._verify_blob_existance(self.handle, self.checkout_bucket, blob_key)
            if blob_status:
                temp['blob_checkout'] = True
            if cache_flow.should_cache_file(file_metadata['content-type'], file_metadata['size']):
                temp['should_be_cached'] = True
            for x in ['name', 'uuid', 'version']:
                temp.update({x: file_metadata[x]})
            checkout_status[_key] = collections.defaultdict(uuid=temp)
        else:
            sys.stderr.write(f'Invalid key regex: {_key}')
    print(json.dumps(checkout_status, sort_keys=True, indent=2))
    return checkout_status  # action_handler does not really use this, it's just for testing
def get_helper(uuid: str, replica: Replica, version: str = None, token: str = None,
               directurl: bool = False, content_disposition: str = None):
    with tracing.Subsegment('parameterization'):
        handle = Config.get_blobstore_handle(replica)
        bucket = replica.bucket

    if version is None:
        with tracing.Subsegment('find_latest_version'):
            # list the files and find the one that is the most recent.
            prefix = "files/{}.".format(uuid)
            for matching_file in handle.list(bucket, prefix):
                matching_file = matching_file[len(prefix):]
                if version is None or matching_file > version:
                    version = matching_file

    if version is None:
        # no matches!
        raise DSSException(404, "not_found", "Cannot find file!")

    # retrieve the file metadata.
    try:
        with tracing.Subsegment('load_file'):
            file_metadata = json.loads(handle.get(bucket, f"files/{uuid}.{version}").decode("utf-8"))
    except BlobNotFoundError:
        key = f"files/{uuid}.{version}"
        item = AsyncStateItem.get(key)
        if isinstance(item, S3CopyEtagError):
            raise DSSException(requests.codes.unprocessable, "missing_checksum", "Incorrect s3-etag")
        elif isinstance(item, AsyncStateError):
            raise item
        else:
            raise DSSException(404, "not_found", "Cannot find file!")

    with tracing.Subsegment('make_path'):
        blob_path = compose_blob_key(file_metadata)

    if request.method == "GET":
        token, ready = _verify_checkout(replica, token, file_metadata, blob_path)
        if ready:
            if directurl:
                response = redirect(str(UrlBuilder().set(scheme=replica.storage_schema,
                                                         netloc=replica.checkout_bucket,
                                                         path=get_dst_key(blob_path))))
            else:
                if content_disposition:
                    # can tell a browser to treat the response link as a download rather than open a new tab
                    response = redirect(handle.generate_presigned_GET_url(
                        replica.checkout_bucket,
                        get_dst_key(blob_path),
                        response_content_disposition=content_disposition))
                else:
                    response = redirect(handle.generate_presigned_GET_url(
                        replica.checkout_bucket,
                        get_dst_key(blob_path)))
        else:
            with tracing.Subsegment('make_retry'):
                builder = UrlBuilder(request.url)
                builder.replace_query("token", token)
                response = redirect(str(builder), code=301)
                headers = response.headers
                headers['Retry-After'] = RETRY_AFTER_INTERVAL
        return response
    else:
        response = make_response('', 200)

    with tracing.Subsegment('set_headers'):
        headers = response.headers
        headers['X-DSS-CREATOR-UID'] = file_metadata[FileMetadata.CREATOR_UID]
        headers['X-DSS-VERSION'] = version
        headers['X-DSS-CONTENT-TYPE'] = file_metadata[FileMetadata.CONTENT_TYPE]
        headers['X-DSS-SIZE'] = file_metadata[FileMetadata.SIZE]
        headers['X-DSS-CRC32C'] = file_metadata[FileMetadata.CRC32C]
        headers['X-DSS-S3-ETAG'] = file_metadata[FileMetadata.S3_ETAG]
        headers['X-DSS-SHA1'] = file_metadata[FileMetadata.SHA1]
        headers['X-DSS-SHA256'] = file_metadata[FileMetadata.SHA256]
    return response