Example 1
    def testAbsoluteLocations(self):
        """Using a pathInStore that refers to absolute URI."""
        loc = Location(None, "file:///something.txt")
        self.assertEqual(loc.pathInStore.path, "/something.txt")
        self.assertEqual(str(loc.uri), "file:///something.txt")

        with self.assertRaises(ValueError):
            Location(None, "relative.txt")
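
For contrast, a minimal sketch of the usual relative construction (assuming the Location semantics exercised above, where a root plus a relative path yields a URI under that root):

    loc = Location("s3://my-bucket/repo", "data/file.txt")
    # pathInStore stays relative ("data/file.txt"), while the full URI
    # is rooted at the store: "s3://my-bucket/repo/data/file.txt"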
Example 2
    def _read_artifact_into_memory(self,
                                   getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef,
                                   isComponent: bool = False) -> Any:
        location = getInfo.location

        log.debug("Downloading data from %s", location.uri)
        serializedDataset = location.uri.read()

        storedFileInfo = getInfo.info
        if len(serializedDataset) != storedFileInfo.file_size:
            raise RuntimeError(
                "Integrity failure in Datastore. "
                f"Size of file {location.path} ({len(serializedDataset)}) "
                f"does not match recorded size of {storedFileInfo.file_size}")

        # Format the downloaded bytes into the appropriate object directly, or
        # via a temporary file (when the formatter does not support
        # to/fromBytes). This is the equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(
                serializedDataset,
                component=getInfo.component if isComponent else None)
        except NotImplementedError:
            # The formatter might not always have an extension, so mypy
            # complains; use a temporary location to derive the extension.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(
                    suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(
                    *os.path.split(tmpFile.name))
                result = formatter.read(
                    component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(
                f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result,
                                      getInfo.readStorageClass,
                                      getInfo.assemblerParams,
                                      isComponent=isComponent)
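
The try/except around fromBytes above is a reusable pattern: parse in memory when the formatter supports it, otherwise spill the bytes to a temporary file and read from disk. A generic, self-contained sketch of the same idea (reader.from_bytes and reader.read are hypothetical stand-ins, not butler API):

    import tempfile

    def materialize(data, reader, suffix=".tmp"):
        """Bytes-first read with a temp-file fallback (sketch)."""
        try:
            return reader.from_bytes(data)  # fast path: parse in memory
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=suffix) as tmp:
                tmp.write(data)
                tmp.flush()  # flush but do not close; closing deletes the file
                return reader.read(tmp.name)  # slow path: parse from disk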
Example 3
    def testFileExists(self):
        self.assertTrue(
            s3CheckFileExists(client=self.client,
                              bucket=self.bucketName,
                              path=self.fileName)[0])
        self.assertFalse(
            s3CheckFileExists(client=self.client,
                              bucket=self.bucketName,
                              path=self.fileName + "_NO_EXIST")[0])

        datastoreRootUri = f"s3://{self.bucketName}/"
        uri = f"s3://{self.bucketName}/{self.fileName}"

        buri = ButlerURI(uri)
        location = Location(datastoreRootUri, self.fileName)

        self.assertTrue(s3CheckFileExists(client=self.client, path=buri)[0])
        # just to make sure the overloaded keyword works correctly
        self.assertTrue(s3CheckFileExists(buri, client=self.client)[0])
        self.assertTrue(
            s3CheckFileExists(client=self.client, path=location)[0])

        # make sure supplying strings resolves correctly too
        self.assertTrue(s3CheckFileExists(uri, client=self.client))
        self.assertTrue(s3CheckFileExists(uri))
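
A hypothetical setUp for a test like this, assuming the moto library is used to mock S3 (moto < 5 exposes mock_s3; newer releases use mock_aws); it would populate the self.client, self.bucketName, and self.fileName used above:

    import unittest

    import boto3
    from moto import mock_s3  # assumption: moto < 5 API

    @mock_s3
    class S3ExistsTestSketch(unittest.TestCase):
        def setUp(self):
            self.bucketName = "test-bucket"
            self.fileName = "file.txt"
            self.client = boto3.client("s3", region_name="us-east-1")
            self.client.create_bucket(Bucket=self.bucketName)
            self.client.put_object(Bucket=self.bucketName,
                                   Key=self.fileName, Body=b"data")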
Example 4
    def setUp(self):
        self.id = 0
        self.factory = FormatterFactory()

        # Dummy FileDescriptor for testing getFormatter
        self.fileDescriptor = FileDescriptor(Location("/a/b/c", "d"),
                                             StorageClass("DummyStorageClass", dict, None))
Example 5
    def testFormatter(self):
        """Check basic parameter exceptions"""
        f = DoNothingFormatter(self.fileDescriptor, self.dataId)
        self.assertEqual(f.writeRecipes, {})
        self.assertEqual(f.writeParameters, {})
        self.assertIn("DoNothingFormatter", repr(f))

        with self.assertRaises(TypeError):
            DoNothingFormatter()

        with self.assertRaises(ValueError):
            DoNothingFormatter(self.fileDescriptor,
                               self.dataId,
                               writeParameters={"param1": 0})

        with self.assertRaises(RuntimeError):
            DoNothingFormatter(self.fileDescriptor,
                               self.dataId,
                               writeRecipes={"label": "value"})

        with self.assertRaises(NotImplementedError):
            f.makeUpdatedLocation(Location("a", "b"))

        with self.assertRaises(NotImplementedError):
            f.write("str")
Example 6
    def put(self, inMemoryDataset, ref):
        """Write a InMemoryDataset with a given `DatasetRef` to the store.

        Parameters
        ----------
        inMemoryDataset : `object`
            The Dataset to store.
        ref : `DatasetRef`
            Reference to the associated Dataset.

        Raises
        ------
        TypeError
            Supplied object and storage class are inconsistent.
        DatasetTypeNotSupportedError
            The associated `DatasetType` is not handled by this datastore.

        Notes
        -----
        If the datastore is configured to reject certain dataset types it
        is possible that the put will fail and raise a
        `DatasetTypeNotSupportedError`.  The main use case for this is to
        allow `ChainedDatastore` to put to multiple datastores without
        requiring that every datastore accepts the dataset.
        """
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In S3,
        # keys only look like directories but are not; instead we check whether
        # an *exact* full key already exists before writing. Inserting the key
        # is equivalent to creating both the directory and the file.
        location.updateExtension(formatter.extension)
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or by using a temporary file
        # if _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            self.client.put_object(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(suffix=formatter.extension) as tmpFile:
                formatter._fileDescriptor.location = Location(*os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                self.client.upload_file(Bucket=location.netloc, Key=location.relativeToPathRoot,
                                        Filename=tmpFile.name)
                log.debug("Wrote file to %s via a temporary directory.", location.uri)

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below
        self._transaction.registerUndo("write", self.client.delete_object,
                                       Bucket=location.netloc, Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with
        info = self._extractIngestInfo(location.uri, ref, formatter=formatter)
        self._register_datasets([(ref, info)])
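
The registerUndo call above illustrates a write-then-register-rollback pattern. A toy transaction object showing the idea (hypothetical; the real butler transaction machinery is more involved):

    class ToyTransaction:
        """Collect undo callbacks and replay them in reverse on rollback."""

        def __init__(self):
            self._undo = []

        def registerUndo(self, name, func, **kwargs):
            self._undo.append((name, func, kwargs))

        def rollback(self):
            for _name, func, kwargs in reversed(self._undo):
                func(**kwargs)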
Example 7
    def setUp(self):
        self.id = 0
        self.factory = FormatterFactory()
        self.universe = DimensionUniverse()
        self.dataId = DataCoordinate.makeEmpty(self.universe)

        # Dummy FileDescriptor for testing getFormatter
        self.fileDescriptor = FileDescriptor(
            Location("/a/b/c", "d"),
            StorageClass("DummyStorageClass", dict, None))
Example 8
    def _write_in_memory_to_artifact(self, inMemoryDataset: Any,
                                     ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        # In PosixDatastore a directory can be created by `safeMakeDir`. In S3,
        # keys only look like directories but are not; instead we check whether
        # an *exact* full key already exists before writing. Inserting the key
        # is equivalent to creating both the directory and the file.
        if s3CheckFileExists(location, client=self.client)[0]:
            raise FileExistsError(f"Cannot write file for ref {ref} as "
                                  f"output file {location.uri} exists.")

        # Upload the file directly from bytes, or by using a temporary file
        # if _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing file directly to %s", location.uri)
            self.client.put_object(Bucket=location.netloc,
                                   Key=location.relativeToPathRoot,
                                   Body=serializedDataset)
            log.debug("Successfully wrote file directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(
                    suffix=location.getExtension()) as tmpFile:
                formatter._fileDescriptor.location = Location(
                    *os.path.split(tmpFile.name))
                formatter.write(inMemoryDataset)
                with open(tmpFile.name, 'rb') as f:
                    log.debug("Writing file to %s via a temporary directory.",
                              location.uri)
                    self.client.put_object(Bucket=location.netloc,
                                           Key=location.relativeToPathRoot,
                                           Body=f)
                log.debug(
                    "Successfully wrote file to %s via a temporary file.",
                    location.uri)

        if self._transaction is None:
            raise RuntimeError(
                "Attempting to write artifact without transaction enabled")

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below
        self._transaction.registerUndo("write",
                                       self.client.delete_object,
                                       Bucket=location.netloc,
                                       Key=location.relativeToPathRoot)

        # The URI is needed to resolve which ingest case we are dealing with
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)
Example 9
    def testGetFileURL(self):

        s = f"https://{self.serverRoot}/{self.existingfolderName}/{self.existingfileName}"
        buri = ButlerURI(s)
        loc = Location(f"https://{self.serverRoot}/",
                       f"{self.existingfolderName}/{self.existingfileName}")

        self.assertEqual(_getFileURL(s), s)
        self.assertEqual(_getFileURL(buri), s)
        self.assertEqual(_getFileURL(loc), s)
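
The three assertions pin down _getFileURL's contract: a plain string, a ButlerURI, and a Location must all normalize to the same URL string. A hypothetical reimplementation consistent with that contract:

    def _getFileURL(path):
        # Sketch only: accept str, ButlerURI, or Location; return a str.
        if isinstance(path, Location):
            return str(path.uri)
        return str(ButlerURI(path))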
Example 10
    def _write_in_memory_to_artifact(self, inMemoryDataset: Any,
                                     ref: DatasetRef) -> StoredFileInfo:
        location, formatter = self._prepare_for_put(inMemoryDataset, ref)

        if location.uri.exists():
            # Assume that by this point if registry thinks the file should
            # not exist then the file should not exist and therefore we can
            # overwrite it. This can happen if a put was interrupted by
            # an external interrupt. The only time this could be problematic is
            # if the file template is incomplete and multiple dataset refs
            # result in identical filenames.
            # Eventually we should remove the check completely (it takes
            # non-zero time for network).
            log.warning("Object %s exists in datastore for ref %s",
                        location.uri, ref)

        if not location.uri.dirname().exists():
            log.debug("Folder %s does not exist yet.", location.uri.dirname())
            location.uri.dirname().mkdir()

        if self._transaction is None:
            raise RuntimeError(
                "Attempting to write artifact without transaction enabled")

        # Upload the file directly from bytes, or by using a temporary file
        # if _toBytes is not implemented.
        try:
            serializedDataset = formatter.toBytes(inMemoryDataset)
            log.debug("Writing bytes directly to %s", location.uri)
            location.uri.write(serializedDataset, overwrite=True)
            log.debug("Successfully wrote bytes directly to %s", location.uri)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(
                    suffix=location.getExtension()) as tmpFile:
                tmpLocation = Location(*os.path.split(tmpFile.name))
                formatter._fileDescriptor.location = tmpLocation
                log.debug("Writing dataset to temporary directory at ",
                          tmpLocation.uri)
                formatter.write(inMemoryDataset)
                location.uri.transfer_from(tmpLocation.uri,
                                           transfer="copy",
                                           overwrite=True)
            log.debug("Successfully wrote dataset to %s via a temporary file.",
                      location.uri)

        # Register a callback to try to delete the uploaded data if
        # the ingest fails below
        self._transaction.registerUndo("remoteWrite", location.uri.remove)

        # The URI is needed to resolve which ingest case we are dealing with
        return self._extractIngestInfo(location.uri, ref, formatter=formatter)
Example 11
    def testExtensionValidation(self):
        """Test extension validation"""

        for file, single_ok, multi_ok in (("e.fits", True, True),
                                          ("e.fit", False, True),
                                          ("e.fits.fz", False, True),
                                          ("e.txt", False, False),
                                          ("e.1.4.fits", True, True),
                                          ("e.3.fit", False, True),
                                          ("e.1.4.fits.gz", False, True),
                                          ):
            loc = Location("/a/b/c", file)

            for formatter, passes in ((SingleExtensionFormatter, single_ok),
                                      (MultipleExtensionsFormatter, multi_ok)):
                if passes:
                    formatter.validateExtension(loc)
                else:
                    with self.assertRaises(ValueError):
                        formatter.validateExtension(loc)
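
A formatter pair consistent with the table above might be defined like this (hypothetical; the actual test helpers may use a different extension set):

    class SingleExtensionFormatter(DoNothingFormatter):
        """Accepts only files ending in .fits."""
        extension = ".fits"

    class MultipleExtensionsFormatter(SingleExtensionFormatter):
        """Additionally accepts .fit and compressed FITS variants."""
        supportedExtensions = frozenset({".fits", ".fit", ".fz", ".gz"})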
Example 12
    def get(self, ref, parameters=None):
        """Load an InMemoryDataset from the store.

        Parameters
        ----------
        ref : `DatasetRef`
            Reference to the required Dataset.
        parameters : `dict`
            `StorageClass`-specific parameters that specify, for example,
            a slice of the Dataset to be loaded.

        Returns
        -------
        inMemoryDataset : `object`
            Requested Dataset or slice thereof as an InMemoryDataset.

        Raises
        ------
        FileNotFoundError
            Requested dataset cannot be retrieved.
        TypeError
            Return value from formatter has unexpected type.
        ValueError
            Formatter failed to process the dataset.
        """
        getInfo = self._prepare_for_get(ref, parameters)
        location = getInfo.location

        # Since we have to make a GET request to S3 anyway (for the download),
        # we might as well use the response metadata for the size comparison;
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A 404 is returned for a missing object only when the user has
            # s3:ListBucket permission; without list permission a 403 is
            # returned instead. In practice this usually means the file does
            # not exist, but it could also mean the user lacks GetObject
            # permission. It is hard to tell which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests currently demand that FileNotFoundError is raised,
            # but this should be updated to PermissionError as in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(
                    f"Dataset with Id {ref.id} not accessible at "
                    f"expected location {location}. A forbidden "
                    "operation error occurred. Verify that s3:ListBucket "
                    "and s3:GetObject permissions are granted for "
                    "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are also re-raised, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError(
                "Integrity failure in Datastore. Size of file {} ({}) does not"
                " match recorded size of {}".format(location.path,
                                                    response["ContentLength"],
                                                    storedFileInfo.file_size))

        # download the data as bytes
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a temporary file (when the formatter does not support
        # to/fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(serializedDataset,
                                         component=getInfo.component)
        except NotImplementedError:
            with tempfile.NamedTemporaryFile(
                    suffix=formatter.extension) as tmpFile:
                tmpFile.file.write(serializedDataset)
                formatter._fileDescriptor.location = Location(
                    *os.path.split(tmpFile.name))
                result = formatter.read(component=getInfo.component)
        except Exception as e:
            raise ValueError(
                f"Failure from formatter for Dataset {ref.id}: {e}") from e

        return self._post_process_get(result, getInfo.readStorageClass,
                                      getInfo.assemblerParams)
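
The 403/404 branches recur in several datastore methods; a small helper sketch that centralizes the translation from botocore's ClientError (a hypothetical refactoring, not existing butler API):

    def _translateClientError(err, ref, location):
        """Map an S3 ClientError to a more descriptive exception (sketch)."""
        errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
        if errorcode == 403:
            return FileNotFoundError(
                f"Dataset with Id {ref.id} not accessible at {location}; "
                "verify s3:ListBucket and s3:GetObject permissions.")
        if errorcode == 404:
            return FileNotFoundError(
                f"Dataset with Id {ref.id} does not exist at {location}.")
        return err  # anything else is re-raised as-is

The call site would then read `raise _translateClientError(err, ref, location) from err`.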
Example 13
    def _read_artifact_into_memory(self,
                                   getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef,
                                   isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyway (for the download),
        # we might as well use the response metadata for the size comparison;
        # s3CheckFileExists would just duplicate GET/LIST charges in this case.
        try:
            log.debug("Reading file: %s", location.uri)
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
            log.debug("Successfully read file: %s", location.uri)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A 404 is returned for a missing object only when the user has
            # s3:ListBucket permission; without list permission a 403 is
            # returned instead. In practice this usually means the file does
            # not exist, but it could also mean the user lacks GetObject
            # permission. It is hard to tell which case it is.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests currently demand that FileNotFoundError is raised,
            # but this should be updated to PermissionError as in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(
                    f"Dataset with Id {ref.id} not accessible at "
                    f"expected location {location}. A forbidden "
                    "operation error occurred. Verify that s3:ListBucket "
                    "and s3:GetObject permissions are granted for "
                    "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exist at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are also re-raised, but less descriptively.
            raise err

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError(
                "Integrity failure in Datastore. Size of file {} ({}) does not"
                " match recorded size of {}".format(location.path,
                                                    response["ContentLength"],
                                                    storedFileInfo.file_size))

        # download the data as bytes
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a temporary file (when the formatter does not support
        # to/fromBytes). This is the S3 equivalent of the PosixDatastore
        # formatter.read try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(
                serializedDataset,
                component=getInfo.component if isComponent else None)
        except NotImplementedError:
            # The formatter might not always have an extension, so mypy
            # complains; use a temporary location to derive the extension.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(
                    suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(
                    *os.path.split(tmpFile.name))
                result = formatter.read(
                    component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(
                f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result,
                                      getInfo.readStorageClass,
                                      getInfo.assemblerParams,
                                      isComponent=isComponent)
Example 14
    def checkInstrumentWithRegistry(self, cls, testRaw):

        Butler.makeRepo(self.root)
        butler = Butler(self.root, run="tests")
        instrument = cls()
        scFactory = StorageClassFactory()

        # Check instrument class and metadata translator agree on
        # instrument name -- use the raw formatter to do the file reading
        rawFormatterClass = instrument.getRawFormatter({})
        formatter = rawFormatterClass(
            FileDescriptor(Location(DATAROOT, testRaw), StorageClass("x")))
        obsInfo = formatter.observationInfo
        self.assertEqual(instrument.getName(), obsInfo.instrument)

        # Add Instrument, Detector, and PhysicalFilter entries to the
        # Butler Registry.
        instrument.register(butler.registry)

        # Define a DatasetType for the cameraGeom.Camera, which can be
        # accessed just by identifying its Instrument.
        # A real-world Camera DatasetType should be identified by a
        # validity range as well.
        cameraDatasetType = DatasetType(
            "camera",
            dimensions=["instrument"],
            storageClass=scFactory.getStorageClass("Camera"),
            universe=butler.registry.dimensions)
        butler.registry.registerDatasetType(cameraDatasetType)

        # Define a DatasetType for cameraGeom.Detectors, which can be
        # accessed by identifying its Instrument and (Butler) Detector.
        # A real-world Detector DatasetType probably doesn't need to exist,
        # as it would just duplicate information in the Camera, and
        # reading a full Camera just to get a single Detector should be
        # plenty efficient.
        detectorDatasetType = DatasetType(
            "detector",
            dimensions=["instrument", "detector"],
            storageClass=scFactory.getStorageClass("Detector"),
            universe=butler.registry.dimensions)
        butler.registry.registerDatasetType(detectorDatasetType)

        # Put and get the Camera.
        dataId = dict(instrument=instrument.instrument)
        butler.put(instrument.getCamera(), "camera", dataId=dataId)
        camera = butler.get("camera", dataId)
        # Full camera comparisons are *slow*; just compare names.
        self.assertEqual(instrument.getCamera().getName(), camera.getName())

        # Put and get a random subset of the Detectors.
        allDetectors = list(instrument.getCamera())
        numDetectors = min(3, len(allDetectors))
        someDetectors = [
            allDetectors[i] for i in self.rng.choice(
                len(allDetectors), size=numDetectors, replace=False)
        ]
        for cameraGeomDetector in someDetectors:
            # Right now we only support integer detector IDs in data IDs;
            # support for detector names and groups (i.e. rafts) is
            # definitely planned but not yet implemented.
            dataId = dict(instrument=instrument.instrument,
                          detector=cameraGeomDetector.getId())
            butler.put(cameraGeomDetector, "detector", dataId=dataId)
            cameraGeomDetector2 = butler.get("detector", dataId=dataId)
            # Full detector comparisons are *slow*; just compare names and
            # serials.
            self.assertEqual(cameraGeomDetector.getName(),
                             cameraGeomDetector2.getName())
            self.assertEqual(cameraGeomDetector.getSerial(),
                             cameraGeomDetector2.getSerial())
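
self.root and self.rng are not defined in this snippet; a hedged setUp sketch that would make the test self-contained (a seeded numpy Generator is assumed for reproducible detector sampling):

    import tempfile

    import numpy as np

    def setUp(self):  # hypothetical fixture
        self.root = tempfile.mkdtemp()  # fresh repo root per test
        self.rng = np.random.default_rng(seed=42)  # assumed numpy RNG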