def testAbsoluteLocations(self):
    """Using a pathInStore that refers to an absolute URI."""
    loc = Location(None, "file:///something.txt")
    self.assertEqual(loc.pathInStore.path, "/something.txt")
    self.assertEqual(str(loc.uri), "file:///something.txt")

    with self.assertRaises(ValueError):
        Location(None, "relative.txt")

def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                               ref: DatasetRef, isComponent: bool = False) -> Any:
    location = getInfo.location

    log.debug("Downloading data from %s", location.uri)
    serializedDataset = location.uri.read()

    storedFileInfo = getInfo.info
    if len(serializedDataset) != storedFileInfo.file_size:
        raise RuntimeError(
            "Integrity failure in Datastore. "
            f"Size of file {location.path} ({len(serializedDataset)}) "
            f"does not match recorded size of {storedFileInfo.file_size}")

    # Format the downloaded bytes into the appropriate object directly, or
    # via a temporary file (when the formatter does not support
    # to/fromBytes). This is equivalent to the PosixDatastore
    # formatter.read try-except block.
    formatter = getInfo.formatter
    try:
        result = formatter.fromBytes(
            serializedDataset,
            component=getInfo.component if isComponent else None)
    except NotImplementedError:
        # The formatter might not always have an extension so mypy
        # complains. We can either ignore the complaint or use a
        # temporary location.
        tmpLoc = Location(".", "temp")
        tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
        with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
            tmpFile.write(serializedDataset)
            # Flush the write. Do not close the file because that
            # will delete it.
            tmpFile.flush()
            formatter._fileDescriptor.location = Location(
                *os.path.split(tmpFile.name))
            result = formatter.read(
                component=getInfo.component if isComponent else None)
    except Exception as e:
        raise ValueError(
            f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
            f" ({ref.datasetType.name} from {location.uri}): {e}") from e

    return self._post_process_get(result, getInfo.readStorageClass,
                                  getInfo.assemblerParams,
                                  isComponent=isComponent)

def testFileExists(self):
    self.assertTrue(s3CheckFileExists(client=self.client,
                                      bucket=self.bucketName,
                                      path=self.fileName)[0])
    self.assertFalse(s3CheckFileExists(client=self.client,
                                       bucket=self.bucketName,
                                       path=self.fileName + "_NO_EXIST")[0])

    datastoreRootUri = f"s3://{self.bucketName}/"
    uri = f"s3://{self.bucketName}/{self.fileName}"

    buri = ButlerURI(uri)
    location = Location(datastoreRootUri, self.fileName)

    self.assertTrue(s3CheckFileExists(client=self.client, path=buri)[0])
    # Just to make sure the overloaded keyword works correctly.
    self.assertTrue(s3CheckFileExists(buri, client=self.client)[0])
    self.assertTrue(s3CheckFileExists(client=self.client, path=location)[0])

    # Make sure supplying strings resolves correctly too. Index the returned
    # tuple so that the assertion actually tests the existence flag.
    self.assertTrue(s3CheckFileExists(uri, client=self.client)[0])
    self.assertTrue(s3CheckFileExists(uri)[0])

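# The testFileExists checks above rely on self.client, self.bucketName and
# self.fileName being prepared elsewhere in the test class. A minimal
# setUp/tearDown sketch is given below; it is not part of the original
# module, assumes the moto library (pre-5.0 "mock_s3" API) is available for
# mocking S3, and uses hypothetical bucket and file names.
import unittest

import boto3
from moto import mock_s3


class S3UtilsTestCaseSketch(unittest.TestCase):
    """Hypothetical fixture providing the attributes used by testFileExists."""

    def setUp(self):
        self.bucketName = "test-bucket"    # assumed name
        self.fileName = "test-file.txt"    # assumed name
        self.mock_s3 = mock_s3()
        self.mock_s3.start()
        # Create the mocked bucket and a placeholder object so that the
        # positive existence checks have something to find.
        self.client = boto3.client("s3", region_name="us-east-1")
        self.client.create_bucket(Bucket=self.bucketName)
        self.client.put_object(Bucket=self.bucketName, Key=self.fileName,
                               Body=b"placeholder contents")

    def tearDown(self):
        self.mock_s3.stop()
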
def setUp(self):
    self.id = 0
    self.factory = FormatterFactory()

    # Dummy FileDescriptor for testing getFormatter.
    self.fileDescriptor = FileDescriptor(Location("/a/b/c", "d"),
                                         StorageClass("DummyStorageClass",
                                                      dict, None))

def testFormatter(self):
    """Check basic parameter exceptions."""
    f = DoNothingFormatter(self.fileDescriptor, self.dataId)
    self.assertEqual(f.writeRecipes, {})
    self.assertEqual(f.writeParameters, {})
    self.assertIn("DoNothingFormatter", repr(f))

    with self.assertRaises(TypeError):
        DoNothingFormatter()

    with self.assertRaises(ValueError):
        DoNothingFormatter(self.fileDescriptor, self.dataId,
                           writeParameters={"param1": 0})

    with self.assertRaises(RuntimeError):
        DoNothingFormatter(self.fileDescriptor, self.dataId,
                           writeRecipes={"label": "value"})

    with self.assertRaises(NotImplementedError):
        f.makeUpdatedLocation(Location("a", "b"))

    with self.assertRaises(NotImplementedError):
        f.write("str")

def put(self, inMemoryDataset, ref):
    """Write an InMemoryDataset with a given `DatasetRef` to the store.

    Parameters
    ----------
    inMemoryDataset : `object`
        The Dataset to store.
    ref : `DatasetRef`
        Reference to the associated Dataset.

    Raises
    ------
    TypeError
        Supplied object and storage class are inconsistent.
    DatasetTypeNotSupportedError
        The associated `DatasetType` is not handled by this datastore.

    Notes
    -----
    If the datastore is configured to reject certain dataset types it
    is possible that the put will fail and raise a
    `DatasetTypeNotSupportedError`. The main use case for this is to
    allow `ChainedDatastore` to put to multiple datastores without
    requiring that every datastore accepts the dataset.
    """
    location, formatter = self._prepare_for_put(inMemoryDataset, ref)

    # In PosixDatastore a directory can be created by `safeMakeDir`. In S3
    # `Keys` instead only look like directories, but are not. We check if
    # an *exact* full key already exists before writing instead. The
    # insert key operation is equivalent to creating the dir and the file.
    location.updateExtension(formatter.extension)
    if s3CheckFileExists(location, client=self.client)[0]:
        raise FileExistsError(f"Cannot write file for ref {ref} as "
                              f"output file {location.uri} exists.")

    # Upload the file directly from bytes or by using a temporary file
    # if _toBytes is not implemented.
    try:
        serializedDataset = formatter.toBytes(inMemoryDataset)
        self.client.put_object(Bucket=location.netloc,
                               Key=location.relativeToPathRoot,
                               Body=serializedDataset)
        log.debug("Wrote file directly to %s", location.uri)
    except NotImplementedError:
        with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
            formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
            formatter.write(inMemoryDataset)
            self.client.upload_file(Bucket=location.netloc,
                                    Key=location.relativeToPathRoot,
                                    Filename=tmpFile.name)
        log.debug("Wrote file to %s via a temporary file.", location.uri)

    # Register a callback to try to delete the uploaded data if
    # the ingest fails below.
    self._transaction.registerUndo("write", self.client.delete_object,
                                   Bucket=location.netloc,
                                   Key=location.relativeToPathRoot)

    # URI is needed to resolve what ingest case we are dealing with.
    info = self._extractIngestInfo(location.uri, ref, formatter=formatter)
    self._register_datasets([(ref, info)])

def setUp(self):
    self.id = 0
    self.factory = FormatterFactory()
    self.universe = DimensionUniverse()
    self.dataId = DataCoordinate.makeEmpty(self.universe)

    # Dummy FileDescriptor for testing getFormatter.
    self.fileDescriptor = FileDescriptor(Location("/a/b/c", "d"),
                                         StorageClass("DummyStorageClass",
                                                      dict, None))

def _write_in_memory_to_artifact(self, inMemoryDataset: Any,
                                 ref: DatasetRef) -> StoredFileInfo:
    location, formatter = self._prepare_for_put(inMemoryDataset, ref)

    # In PosixDatastore a directory can be created by `safeMakeDir`. In S3
    # `Keys` instead only look like directories, but are not. We check if
    # an *exact* full key already exists before writing instead. The
    # insert key operation is equivalent to creating the dir and the file.
    if s3CheckFileExists(location, client=self.client)[0]:
        raise FileExistsError(f"Cannot write file for ref {ref} as "
                              f"output file {location.uri} exists.")

    # Upload the file directly from bytes or by using a temporary file
    # if _toBytes is not implemented.
    try:
        serializedDataset = formatter.toBytes(inMemoryDataset)
        log.debug("Writing file directly to %s", location.uri)
        self.client.put_object(Bucket=location.netloc,
                               Key=location.relativeToPathRoot,
                               Body=serializedDataset)
        log.debug("Successfully wrote file directly to %s", location.uri)
    except NotImplementedError:
        with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
            formatter._fileDescriptor.location = Location(
                *os.path.split(tmpFile.name))
            formatter.write(inMemoryDataset)
            with open(tmpFile.name, 'rb') as f:
                log.debug("Writing file to %s via a temporary file.",
                          location.uri)
                self.client.put_object(Bucket=location.netloc,
                                       Key=location.relativeToPathRoot,
                                       Body=f)
            log.debug("Successfully wrote file to %s via a temporary file.",
                      location.uri)

    if self._transaction is None:
        raise RuntimeError(
            "Attempting to write artifact without transaction enabled")

    # Register a callback to try to delete the uploaded data if
    # the ingest fails below.
    self._transaction.registerUndo("write", self.client.delete_object,
                                   Bucket=location.netloc,
                                   Key=location.relativeToPathRoot)

    # URI is needed to resolve what ingest case we are dealing with.
    return self._extractIngestInfo(location.uri, ref, formatter=formatter)

def testGetFileURL(self):
    s = f"https://{self.serverRoot}/{self.existingfolderName}/{self.existingfileName}"
    buri = ButlerURI(s)
    loc = Location(f"https://{self.serverRoot}/",
                   f"{self.existingfolderName}/{self.existingfileName}")

    self.assertEqual(_getFileURL(s), s)
    self.assertEqual(_getFileURL(buri), s)
    self.assertEqual(_getFileURL(loc), s)

def _write_in_memory_to_artifact(self, inMemoryDataset: Any,
                                 ref: DatasetRef) -> StoredFileInfo:
    location, formatter = self._prepare_for_put(inMemoryDataset, ref)

    if location.uri.exists():
        # Assume that by this point if registry thinks the file should
        # not exist then the file should not exist and therefore we can
        # overwrite it. This can happen if a put was interrupted by an
        # external interrupt. The only time this could be problematic is
        # if the file template is incomplete and multiple dataset refs
        # result in identical filenames.
        # Eventually we should remove the check completely (it takes
        # non-zero time for network).
        log.warning("Object %s exists in datastore for ref %s",
                    location.uri, ref)

    if not location.uri.dirname().exists():
        log.debug("Folder %s does not exist yet.", location.uri.dirname())
        location.uri.dirname().mkdir()

    if self._transaction is None:
        raise RuntimeError(
            "Attempting to write artifact without transaction enabled")

    # Upload the file directly from bytes or by using a temporary file
    # if _toBytes is not implemented.
    try:
        serializedDataset = formatter.toBytes(inMemoryDataset)
        log.debug("Writing bytes directly to %s", location.uri)
        location.uri.write(serializedDataset, overwrite=True)
        log.debug("Successfully wrote bytes directly to %s", location.uri)
    except NotImplementedError:
        with tempfile.NamedTemporaryFile(suffix=location.getExtension()) as tmpFile:
            tmpLocation = Location(*os.path.split(tmpFile.name))
            formatter._fileDescriptor.location = tmpLocation
            log.debug("Writing dataset to temporary location at %s",
                      tmpLocation.uri)
            formatter.write(inMemoryDataset)
            location.uri.transfer_from(tmpLocation.uri, transfer="copy",
                                       overwrite=True)
        log.debug("Successfully wrote dataset to %s via a temporary file.",
                  location.uri)

    # Register a callback to try to delete the uploaded data if
    # the ingest fails below.
    self._transaction.registerUndo("remoteWrite", location.uri.remove)

    # URI is needed to resolve what ingest case we are dealing with.
    return self._extractIngestInfo(location.uri, ref, formatter=formatter)

def testExtensionValidation(self):
    """Test extension validation."""
    for file, single_ok, multi_ok in (("e.fits", True, True),
                                      ("e.fit", False, True),
                                      ("e.fits.fz", False, True),
                                      ("e.txt", False, False),
                                      ("e.1.4.fits", True, True),
                                      ("e.3.fit", False, True),
                                      ("e.1.4.fits.gz", False, True),
                                      ):
        loc = Location("/a/b/c", file)
        for formatter, passes in ((SingleExtensionFormatter, single_ok),
                                  (MultipleExtensionsFormatter, multi_ok)):
            if passes:
                formatter.validateExtension(loc)
            else:
                with self.assertRaises(ValueError):
                    formatter.validateExtension(loc)

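# testExtensionValidation refers to SingleExtensionFormatter and
# MultipleExtensionsFormatter, which are defined elsewhere in the test
# module. A hedged sketch of what such helpers could look like is shown
# below; it assumes the standard Formatter class attributes `extension`
# and `supportedExtensions`, and the real definitions may differ.
from lsst.daf.butler import Formatter


class SingleExtensionFormatter(Formatter):
    """Hypothetical formatter accepting only the default .fits extension."""
    extension = ".fits"


class MultipleExtensionsFormatter(SingleExtensionFormatter):
    """Hypothetical formatter also accepting .fit and compressed variants."""
    supportedExtensions = frozenset({".fits", ".fit", ".fits.fz", ".fits.gz"})
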
def get(self, ref, parameters=None):
    """Load an InMemoryDataset from the store.

    Parameters
    ----------
    ref : `DatasetRef`
        Reference to the required Dataset.
    parameters : `dict`
        `StorageClass`-specific parameters that specify, for example,
        a slice of the Dataset to be loaded.

    Returns
    -------
    inMemoryDataset : `object`
        Requested Dataset or slice thereof as an InMemoryDataset.

    Raises
    ------
    FileNotFoundError
        Requested dataset cannot be retrieved.
    TypeError
        Return value from formatter has unexpected type.
    ValueError
        Formatter failed to process the dataset.
    """
    getInfo = self._prepare_for_get(ref, parameters)
    location = getInfo.location

    # Since we have to make a GET request to S3 anyhow (for download) we
    # might as well use the HEADER metadata for size comparison instead.
    # s3CheckFileExists would just duplicate GET/LIST charges in this case.
    try:
        response = self.client.get_object(Bucket=location.netloc,
                                          Key=location.relativeToPathRoot)
    except self.client.exceptions.ClientError as err:
        errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
        # head_object returns 404 when the object does not exist only when
        # the user has s3:ListBucket permission. If list permission does
        # not exist a 403 is returned. In practical terms this usually
        # means that the file does not exist, but it could also mean the
        # user lacks GetObject permission. It's hard to tell which case
        # it is.
        # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
        # Unit tests right now demand FileExistsError is raised, but this
        # should be updated to PermissionError like in s3CheckFileExists.
        if errorcode == 403:
            raise FileNotFoundError(
                f"Dataset with Id {ref.id} not accessible at "
                f"expected location {location}. Forbidden HEAD "
                "operation error occurred. Verify s3:ListBucket "
                "and s3:GetObject permissions are granted for "
                "your IAM user and that the file exists.") from err
        if errorcode == 404:
            errmsg = (f"Dataset with Id {ref.id} does not exist at "
                      f"expected location {location}.")
            raise FileNotFoundError(errmsg) from err
        # Other errors are reraised also, but less descriptively.
        raise err

    storedFileInfo = getInfo.info
    if response["ContentLength"] != storedFileInfo.file_size:
        raise RuntimeError(
            "Integrity failure in Datastore. Size of file {} ({}) does not"
            " match recorded size of {}".format(location.path,
                                                response["ContentLength"],
                                                storedFileInfo.file_size))

    # Download the data as bytes.
    serializedDataset = response["Body"].read()

    # Format the downloaded bytes into the appropriate object directly, or
    # via a temporary file (when the formatter does not support
    # to/fromBytes). This is the S3 equivalent of the PosixDatastore
    # formatter.read try-except block.
    formatter = getInfo.formatter
    try:
        result = formatter.fromBytes(serializedDataset,
                                     component=getInfo.component)
    except NotImplementedError:
        with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
            tmpFile.file.write(serializedDataset)
            formatter._fileDescriptor.location = Location(
                *os.path.split(tmpFile.name))
            result = formatter.read(component=getInfo.component)
    except Exception as e:
        raise ValueError(
            f"Failure from formatter for Dataset {ref.id}: {e}") from e

    return self._post_process_get(result, getInfo.readStorageClass,
                                  getInfo.assemblerParams)

def _read_artifact_into_memory(self, getInfo: DatastoreFileGetInformation,
                               ref: DatasetRef, isComponent: bool = False) -> Any:
    location = getInfo.location

    # Since we have to make a GET request to S3 anyhow (for download) we
    # might as well use the HEADER metadata for size comparison instead.
    # s3CheckFileExists would just duplicate GET/LIST charges in this case.
    try:
        log.debug("Reading file: %s", location.uri)
        response = self.client.get_object(Bucket=location.netloc,
                                          Key=location.relativeToPathRoot)
        log.debug("Successfully read file: %s", location.uri)
    except self.client.exceptions.ClientError as err:
        errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
        # head_object returns 404 when the object does not exist only when
        # the user has s3:ListBucket permission. If list permission does
        # not exist a 403 is returned. In practical terms this usually
        # means that the file does not exist, but it could also mean the
        # user lacks GetObject permission. It's hard to tell which case
        # it is.
        # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
        # Unit tests right now demand FileExistsError is raised, but this
        # should be updated to PermissionError like in s3CheckFileExists.
        if errorcode == 403:
            raise FileNotFoundError(
                f"Dataset with Id {ref.id} not accessible at "
                f"expected location {location}. Forbidden HEAD "
                "operation error occurred. Verify s3:ListBucket "
                "and s3:GetObject permissions are granted for "
                "your IAM user and that the file exists.") from err
        if errorcode == 404:
            errmsg = (f"Dataset with Id {ref.id} does not exist at "
                      f"expected location {location}.")
            raise FileNotFoundError(errmsg) from err
        # Other errors are reraised also, but less descriptively.
        raise err

    storedFileInfo = getInfo.info
    if response["ContentLength"] != storedFileInfo.file_size:
        raise RuntimeError(
            "Integrity failure in Datastore. Size of file {} ({}) does not"
            " match recorded size of {}".format(location.path,
                                                response["ContentLength"],
                                                storedFileInfo.file_size))

    # Download the data as bytes.
    serializedDataset = response["Body"].read()

    # Format the downloaded bytes into the appropriate object directly, or
    # via a temporary file (when the formatter does not support
    # to/fromBytes). This is the S3 equivalent of the PosixDatastore
    # formatter.read try-except block.
    formatter = getInfo.formatter
    try:
        result = formatter.fromBytes(
            serializedDataset,
            component=getInfo.component if isComponent else None)
    except NotImplementedError:
        # The formatter might not always have an extension so mypy
        # complains. We can either ignore the complaint or use a
        # temporary location.
        tmpLoc = Location(".", "temp")
        tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
        with tempfile.NamedTemporaryFile(suffix=tmpLoc.getExtension()) as tmpFile:
            tmpFile.write(serializedDataset)
            # Flush the write. Do not close the file because that
            # will delete it.
            tmpFile.flush()
            formatter._fileDescriptor.location = Location(
                *os.path.split(tmpFile.name))
            result = formatter.read(
                component=getInfo.component if isComponent else None)
    except Exception as e:
        raise ValueError(
            f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
            f" ({ref.datasetType.name} from {location.uri}): {e}") from e

    return self._post_process_get(result, getInfo.readStorageClass,
                                  getInfo.assemblerParams,
                                  isComponent=isComponent)

def checkInstrumentWithRegistry(self, cls, testRaw):

    Butler.makeRepo(self.root)
    butler = Butler(self.root, run="tests")
    instrument = cls()
    scFactory = StorageClassFactory()

    # Check instrument class and metadata translator agree on
    # instrument name -- use the raw formatter to do the file reading.
    rawFormatterClass = instrument.getRawFormatter({})
    formatter = rawFormatterClass(FileDescriptor(Location(DATAROOT, testRaw),
                                                 StorageClass("x")))
    obsInfo = formatter.observationInfo
    self.assertEqual(instrument.getName(), obsInfo.instrument)

    # Add Instrument, Detector, and PhysicalFilter entries to the
    # Butler Registry.
    instrument.register(butler.registry)

    # Define a DatasetType for the cameraGeom.Camera, which can be
    # accessed just by identifying its Instrument.
    # A real-world Camera DatasetType should be identified by a
    # validity range as well.
    cameraDatasetType = DatasetType(
        "camera", dimensions=["instrument"],
        storageClass=scFactory.getStorageClass("Camera"),
        universe=butler.registry.dimensions)
    butler.registry.registerDatasetType(cameraDatasetType)

    # Define a DatasetType for cameraGeom.Detectors, which can be
    # accessed by identifying its Instrument and (Butler) Detector.
    # A real-world Detector DatasetType probably doesn't need to exist,
    # as it would just duplicate information in the Camera, and
    # reading a full Camera just to get a single Detector should be
    # plenty efficient.
    detectorDatasetType = DatasetType(
        "detector", dimensions=["instrument", "detector"],
        storageClass=scFactory.getStorageClass("Detector"),
        universe=butler.registry.dimensions)
    butler.registry.registerDatasetType(detectorDatasetType)

    # Put and get the Camera.
    dataId = dict(instrument=instrument.instrument)
    butler.put(instrument.getCamera(), "camera", dataId=dataId)
    camera = butler.get("camera", dataId)
    # Full camera comparisons are *slow*; just compare names.
    self.assertEqual(instrument.getCamera().getName(), camera.getName())

    # Put and get a random subset of the Detectors.
    allDetectors = list(instrument.getCamera())
    numDetectors = min(3, len(allDetectors))
    someDetectors = [allDetectors[i] for i in
                     self.rng.choice(len(allDetectors),
                                     size=numDetectors, replace=False)]
    for cameraGeomDetector in someDetectors:
        # Right now we only support integer detector IDs in data IDs;
        # support for detector names and groups (i.e. rafts) is
        # definitely planned but not yet implemented.
        dataId = dict(instrument=instrument.instrument,
                      detector=cameraGeomDetector.getId())
        butler.put(cameraGeomDetector, "detector", dataId=dataId)
        cameraGeomDetector2 = butler.get("detector", dataId=dataId)
        # Full detector comparisons are *slow*; just compare names and
        # serials.
        self.assertEqual(cameraGeomDetector.getName(),
                         cameraGeomDetector2.getName())
        self.assertEqual(cameraGeomDetector.getSerial(),
                         cameraGeomDetector2.getSerial())