Example no. 1
    def _read_artifact_into_memory(self,
                                   getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef,
                                   isComponent: bool = False) -> Any:
        location = getInfo.location

        log.debug("Downloading data from %s", location.uri)
        serializedDataset = location.uri.read()

        storedFileInfo = getInfo.info
        if len(serializedDataset) != storedFileInfo.file_size:
            raise RuntimeError(
                "Integrity failure in Datastore. "
                f"Size of file {location.path} ({len(serializedDataset)}) "
                f"does not match recorded size of {storedFileInfo.file_size}")

        # Format the downloaded bytes into the appropriate object directly, or
        # via a tempfile (when the formatter does not support toBytes/fromBytes).
        # This is the equivalent of the PosixDatastore formatter.read
        # try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(
                serializedDataset,
                component=getInfo.component if isComponent else None)
        except NotImplementedError:
            # The formatter might not always have an extension, so mypy
            # complains; we can either ignore the complaint or, as done here,
            # use a temporary Location to derive the extension.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(
                    suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(
                    *os.path.split(tmpFile.name))
                result = formatter.read(
                    component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(
                f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result,
                                      getInfo.readStorageClass,
                                      getInfo.assemblerParams,
                                      isComponent=isComponent)
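
The tempfile fallback in the try/except NotImplementedError block above can be illustrated in isolation. The sketch below is not part of the datastore source: BytesCapableFormatter, FileOnlyFormatter, and deserialize are hypothetical stand-ins that show how a formatter lacking bytes support gets routed through a temporary file, while a bytes-capable formatter is deserialized in memory.

import tempfile


class BytesCapableFormatter:
    """Hypothetical formatter that can deserialize directly from bytes."""

    def fromBytes(self, data, component=None):
        return data.decode("utf-8")


class FileOnlyFormatter:
    """Hypothetical formatter that can only read from a file on disk."""

    def fromBytes(self, data, component=None):
        raise NotImplementedError("this formatter needs a real file")

    def read(self, path, component=None):
        with open(path, "r") as f:
            return f.read()


def deserialize(formatter, payload):
    # Mirror the try/except NotImplementedError block above: prefer the
    # in-memory path, fall back to writing a temporary file.
    try:
        return formatter.fromBytes(payload)
    except NotImplementedError:
        with tempfile.NamedTemporaryFile(suffix=".txt") as tmpFile:
            tmpFile.write(payload)
            # Flush but keep the file open; closing would delete it.
            tmpFile.flush()
            return formatter.read(tmpFile.name)


print(deserialize(BytesCapableFormatter(), b"hello"))  # in-memory path
print(deserialize(FileOnlyFormatter(), b"hello"))      # tempfile fallback
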
Example no. 2
    def _read_artifact_into_memory(self,
                                   getInfo: DatastoreFileGetInformation,
                                   ref: DatasetRef,
                                   isComponent: bool = False) -> Any:
        location = getInfo.location

        # Since we have to make a GET request to S3 anyhow (for the download),
        # we might as well use the response header metadata for the size
        # comparison instead. s3CheckFileExists would just duplicate GET/LIST
        # charges in this case.
        try:
            log.debug("Reading file: %s", location.uri)
            response = self.client.get_object(Bucket=location.netloc,
                                              Key=location.relativeToPathRoot)
            log.debug("Successfully read file: %s", location.uri)
        except self.client.exceptions.ClientError as err:
            errorcode = err.response["ResponseMetadata"]["HTTPStatusCode"]
            # A GET returns 404 when the object does not exist only if the
            # user has s3:ListBucket permission. If that list permission is
            # missing, a 403 is returned instead. In practical terms this
            # usually means the file does not exist, but it could also mean
            # the user lacks s3:GetObject permission; it is hard to tell
            # which case applies.
            # docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectHEAD.html
            # Unit tests currently demand that FileNotFoundError is raised,
            # but this should be updated to PermissionError as in
            # s3CheckFileExists.
            if errorcode == 403:
                raise FileNotFoundError(
                    f"Dataset with Id {ref.id} not accessible at "
                    f"expected location {location}. Forbidden GET "
                    "operation error occurred. Verify s3:ListBucket "
                    "and s3:GetObject permissions are granted for "
                    "your IAM user and that the file exists.") from err
            if errorcode == 404:
                errmsg = f"Dataset with Id {ref.id} does not exists at expected location {location}."
                raise FileNotFoundError(errmsg) from err
            # Other errors are re-raised as well, but less descriptively.
            raise

        storedFileInfo = getInfo.info
        if response["ContentLength"] != storedFileInfo.file_size:
            raise RuntimeError(
                "Integrity failure in Datastore. "
                f"Size of file {location.path} ({response['ContentLength']}) "
                f"does not match recorded size of {storedFileInfo.file_size}")

        # download the data as bytes
        serializedDataset = response["Body"].read()

        # Format the downloaded bytes into the appropriate object directly, or
        # via a tempfile (when the formatter does not support toBytes/fromBytes).
        # This is the S3 equivalent of the PosixDatastore formatter.read
        # try-except block.
        formatter = getInfo.formatter
        try:
            result = formatter.fromBytes(
                serializedDataset,
                component=getInfo.component if isComponent else None)
        except NotImplementedError:
            # The formatter might not always have an extension, so mypy
            # complains; we can either ignore the complaint or, as done here,
            # use a temporary Location to derive the extension.
            tmpLoc = Location(".", "temp")
            tmpLoc = formatter.makeUpdatedLocation(tmpLoc)
            with tempfile.NamedTemporaryFile(
                    suffix=tmpLoc.getExtension()) as tmpFile:
                tmpFile.write(serializedDataset)
                # Flush the write. Do not close the file because that
                # will delete it.
                tmpFile.flush()
                formatter._fileDescriptor.location = Location(
                    *os.path.split(tmpFile.name))
                result = formatter.read(
                    component=getInfo.component if isComponent else None)
        except Exception as e:
            raise ValueError(
                f"Failure from formatter '{formatter.name()}' for dataset {ref.id}"
                f" ({ref.datasetType.name} from {location.uri}): {e}") from e

        return self._post_process_get(result,
                                      getInfo.readStorageClass,
                                      getInfo.assemblerParams,
                                      isComponent=isComponent)
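
For context on the GET-based error handling and integrity check in Example no. 2, here is a minimal standalone sketch. It assumes boto3 is installed and AWS credentials are configured; read_s3_object and its arguments are hypothetical and not part of the datastore API. A single get_object call supplies both the payload and the ContentLength header, and ClientError status codes 403/404 are mapped to FileNotFoundError in the same spirit as the code above.

import boto3
from botocore.exceptions import ClientError


def read_s3_object(bucket, key, expected_size):
    client = boto3.client("s3")
    try:
        response = client.get_object(Bucket=bucket, Key=key)
    except ClientError as err:
        code = err.response["ResponseMetadata"]["HTTPStatusCode"]
        if code == 403:
            # Without s3:ListBucket permission a missing key surfaces as 403,
            # so this may mean "not found" or "not permitted".
            raise FileNotFoundError(
                f"s3://{bucket}/{key} not accessible (HTTP 403)") from err
        if code == 404:
            raise FileNotFoundError(
                f"s3://{bucket}/{key} does not exist (HTTP 404)") from err
        raise
    # The GET response already carries the object size, so no extra
    # HEAD or LIST request is needed for the integrity check.
    if response["ContentLength"] != expected_size:
        raise RuntimeError(
            f"Size mismatch for s3://{bucket}/{key}: got "
            f"{response['ContentLength']}, expected {expected_size}")
    return response["Body"].read()
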