def gcs_bucket(self) -> "google.cloud.storage.bucket.Bucket":
    if not hasattr(self, "_gcs_bucket"):
        from prefect.utilities.gcp import get_storage_client

        client = get_storage_client()
        # Cache the bucket on the private attribute; assigning to
        # `self.gcs_bucket` here would never populate `_gcs_bucket`,
        # so the hasattr check above would fail on every call.
        self._gcs_bucket = client.bucket(self.bucket)
    return self._gcs_bucket
def run(
    self,
    bucket_name: str = None,
    blob: str = None,
    project: str = None,
    wait_seconds: int = 0,
    fail_if_not_found: bool = True,
    credentials: dict = None,
    request_timeout: Union[float, Tuple[float, float]] = 60,
) -> bool:
    """
    Run method for this Task. Invoked by _calling_ this Task after initialization
    within a Flow context.

    Note that some arguments are required for the task to run, and must be
    provided _either_ at initialization _or_ as arguments.

    Args:
        - bucket_name (str, optional): the bucket to check
        - blob (str, optional): object for which to search within the bucket
        - project (str, optional): default Google Cloud project to work within.
            If not provided, will be inferred from your Google Cloud credentials
        - wait_seconds (int, optional): retry until the blob is found or until
            `wait_seconds` have elapsed, whichever comes first. Defaults to 0
        - fail_if_not_found (bool, optional): whether to raise a `FAIL` signal
            if the blob is not found. Defaults to True
        - credentials (dict, optional): a JSON document containing Google Cloud
            credentials. You should provide these at runtime with an upstream
            Secret task. If not provided, Prefect will first check `context` for
            `GCP_CREDENTIALS` and lastly will use default Google client logic.
        - request_timeout (Union[float, Tuple[float, float]], optional): the number
            of seconds the transport should wait for the server response. Can also
            be passed as a tuple (connect_timeout, read_timeout).

    Returns:
        - bool: whether the object exists

    Raises:
        - ValueError: if `bucket_name` or `blob` are missing
        - FAIL: if the object is not found and `fail_if_not_found` is True
    """
    if None in [bucket_name, blob]:
        raise ValueError("Missing bucket_name or blob")

    # create client
    client = get_storage_client(project=project, credentials=credentials)
    bucket = client.bucket(bucket_name)

    blob_exists = None

    wait, n = 0, 1
    while wait <= wait_seconds and not blob_exists:
        sleep(n)
        wait += n
        n *= 2
        blob_exists = storage.Blob(bucket=bucket, name=blob).exists(client)

    if fail_if_not_found and not blob_exists:
        raise FAIL(message="Blob not found")

    return blob_exists
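# Example usage (a minimal sketch, assuming this `run` belongs to Prefect 1.x's
# `GCSBlobExists` task; the bucket and blob names below are placeholders):
from prefect import Flow
from prefect.tasks.gcp.storage import GCSBlobExists

check_blob = GCSBlobExists()

with Flow("wait-for-blob") as flow:
    exists = check_blob(
        bucket_name="my-bucket",
        blob="exports/data.csv",
        wait_seconds=30,
    )

# flow.run()  # FAILs if the blob has not appeared within ~30 seconds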
def read_bytes_from_path(path: str) -> bytes:
    """Read bytes from a given path.

    Paths may be local files, or remote files (given a supported file scheme).

    Args:
        - path (str): The file path

    Returns:
        - bytes: The file contents
    """
    parsed = parse_path(path)
    if not parsed.scheme or parsed.scheme in ("file", "agent"):
        with open(parsed.path, "rb") as f:
            return f.read()
    elif parsed.scheme == "gcs":
        from prefect.utilities.gcp import get_storage_client

        client = get_storage_client()
        bucket = client.bucket(parsed.netloc)
        blob = bucket.get_blob(parsed.path.lstrip("/"))
        if blob is None:
            raise ValueError(f"Job template doesn't exist at {path}")
        return blob.download_as_bytes()
    elif parsed.scheme == "s3":
        from prefect.utilities.aws import get_boto_client

        client = get_boto_client(resource="s3")
        stream = io.BytesIO()
        client.download_fileobj(Bucket=parsed.netloc, Key=parsed.path, Fileobj=stream)
        return stream.getvalue()
    else:
        raise ValueError(f"Unsupported file scheme {path}")
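# Example calls (a sketch; the paths are placeholders, and the remote schemes
# assume GCP/AWS credentials are already configured in the environment):
local_bytes = read_bytes_from_path("/tmp/job_template.yaml")
gcs_bytes = read_bytes_from_path("gcs://my-bucket/templates/job.yaml")
s3_bytes = read_bytes_from_path("s3://my-bucket/templates/job.yaml")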
def run(
    self,
    bucket: str = None,
    blob: str = None,
    project: str = None,
    credentials: dict = None,
    encryption_key: str = None,
    encryption_key_secret: str = None,
    request_timeout: Union[float, Tuple[float, float]] = 60,
) -> str:
    """
    Run method for this Task. Invoked by _calling_ this Task after initialization
    within a Flow context.

    Note that some arguments are required for the task to run, and must be
    provided _either_ at initialization _or_ as arguments.

    Args:
        - bucket (str, optional): the bucket name to download from
        - blob (str, optional): the blob name to download
        - project (str, optional): Google Cloud project to work within. If not
            provided here or at initialization, will be inferred from your
            Google Cloud credentials
        - credentials (dict, optional): a JSON document containing Google Cloud
            credentials. You should provide these at runtime with an upstream
            Secret task. If not provided, Prefect will first check `context` for
            `GCP_CREDENTIALS` and lastly will use default Google client logic.
        - encryption_key (str, optional): an encryption key
        - encryption_key_secret (str, optional, DEPRECATED): the name of the
            Prefect Secret storing an optional `encryption_key` to be used when
            downloading the Blob
        - request_timeout (Union[float, Tuple[float, float]], optional): the number
            of seconds the transport should wait for the server response. Can also
            be passed as a tuple (connect_timeout, read_timeout).

    Returns:
        - str: the data from the blob, as a string

    Raises:
        - google.cloud.exception.NotFound: if `create_bucket=False` and the
            bucket name is not found
        - ValueError: if `blob` name hasn't been provided
    """
    # create client
    client = get_storage_client(project=project, credentials=credentials)

    # retrieve bucket
    bucket = self._retrieve_bucket(client=client, bucket=bucket, create_bucket=False)

    # identify blob name
    gcs_blob = self._get_blob(
        bucket,
        blob,
        encryption_key=encryption_key,
        encryption_key_secret=encryption_key_secret,
    )
    data = gcs_blob.download_as_string(timeout=request_timeout)
    return data
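# Example usage (a sketch, assuming this is Prefect 1.x's `GCSDownload` task;
# the bucket and blob names are placeholders):
from prefect import Flow
from prefect.tasks.gcp.storage import GCSDownload

download = GCSDownload()

with Flow("gcs-download") as flow:
    contents = download(bucket="my-bucket", blob="exports/data.csv")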
def run(
    self,
    data: str,
    bucket: str = None,
    blob: str = None,
    project: str = None,
    credentials: dict = None,
    encryption_key: str = None,
    create_bucket: bool = False,
    encryption_key_secret: str = None,
) -> str:
    """
    Run method for this Task. Invoked by _calling_ this Task after initialization
    within a Flow context.

    Note that some arguments are required for the task to run, and must be
    provided _either_ at initialization _or_ as arguments.

    Args:
        - data (str): the data to upload; must already be represented as a string
        - bucket (str, optional): the bucket name to upload to
        - blob (str, optional): blob name to upload to; if not provided, a string
            beginning with `prefect-` and containing the Task Run ID will be used
        - project (str, optional): Google Cloud project to work within. Can be
            inferred from credentials if not provided.
        - credentials (dict, optional): a JSON document containing Google Cloud
            credentials. You should provide these at runtime with an upstream
            Secret task. If not provided, Prefect will first check `context` for
            `GCP_CREDENTIALS` and lastly will use default Google client logic.
        - encryption_key (str, optional): an encryption key
        - create_bucket (bool, optional): boolean specifying whether to create
            the bucket if it does not exist, otherwise an Exception is raised.
            Defaults to `False`.
        - encryption_key_secret (str, optional, DEPRECATED): the name of the
            Prefect Secret storing an optional `encryption_key` to be used when
            uploading the Blob.

    Raises:
        - google.cloud.exception.NotFound: if `create_bucket=False` and the
            bucket name is not found

    Returns:
        - str: the blob name that now stores the provided data
    """
    # create client
    client = get_storage_client(project=project, credentials=credentials)

    # retrieve bucket
    bucket = self._retrieve_bucket(
        client=client, bucket=bucket, create_bucket=create_bucket
    )

    # identify blob name
    gcs_blob = self._get_blob(
        bucket,
        blob,
        encryption_key=encryption_key,
        encryption_key_secret=encryption_key_secret,
    )
    gcs_blob.upload_from_string(data)
    return gcs_blob.name
def run(
    self,
    source_bucket: str = None,
    source_blob: str = None,
    dest_bucket: str = None,
    dest_blob: str = None,
    project: str = None,
    credentials: dict = None,
) -> str:
    """
    Run method for this Task. Invoked by _calling_ this Task after initialization
    within a Flow context.

    Note that some arguments are required for the task to run, and must be
    provided _either_ at initialization _or_ as arguments.

    Args:
        - source_bucket (str, optional): default source bucket name.
        - source_blob (str, optional): default source blob name.
        - dest_bucket (str, optional): default destination bucket name.
        - dest_blob (str, optional): default destination blob name.
        - project (str, optional): default Google Cloud project to work within.
            If not provided, will be inferred from your Google Cloud credentials
        - credentials (dict, optional): a JSON document containing Google Cloud
            credentials. You should provide these at runtime with an upstream
            Secret task. If not provided, Prefect will first check `context` for
            `GCP_CREDENTIALS` and lastly will use default Google client logic.

    Returns:
        - str: the name of the destination blob

    Raises:
        - ValueError: if `source_bucket`, `source_blob`, `dest_bucket`, or
            `dest_blob` are missing or point at the same object.
    """
    if None in [source_bucket, source_blob, dest_bucket, dest_blob]:
        raise ValueError("Missing source or destination")
    elif (source_bucket, source_blob) == (dest_bucket, dest_blob):
        raise ValueError("Source and destination are identical.")

    # create client
    client = get_storage_client(project=project, credentials=credentials)

    # get source bucket and blob
    source_bucket_obj = client.get_bucket(source_bucket)
    source_blob_obj = source_bucket_obj.blob(source_blob)

    # get dest bucket
    dest_bucket_obj = client.get_bucket(dest_bucket)

    # copy from source blob to dest bucket
    source_bucket_obj.copy_blob(
        blob=source_blob_obj,
        destination_bucket=dest_bucket_obj,
        new_name=dest_blob,
    )

    return dest_blob
def gcs_bucket(self) -> "google.cloud.storage.bucket.Bucket":
    if not hasattr(self, "_gcs_bucket"):
        from prefect.utilities.gcp import get_storage_client

        if self.credentials_secret:
            credentials = Secret(self.credentials_secret).get()
        else:
            credentials = None
        client = get_storage_client(credentials=credentials)
        # Cache on the private attribute so the `hasattr` check above
        # succeeds on subsequent calls.
        self._gcs_bucket = client.bucket(self.bucket)
    return self._gcs_bucket
def initialize_client(self) -> None:
    """
    Initializes GCS connections.
    """
    from prefect.utilities.gcp import get_storage_client

    if self.credentials_secret:
        credentials = Secret(self.credentials_secret).get()
    else:
        credentials = None
    client = get_storage_client(credentials=credentials)

    self.gcs_bucket = client.bucket(self.bucket)
def _get_client(self, project: str, credentials: dict, credentials_secret: str = None):
    """
    Creates and returns a GCS Client instance
    """
    if credentials_secret is not None:
        warnings.warn(
            "The `credentials_secret` argument is deprecated. Use a `Secret` task "
            "to pass the credentials value at runtime instead.",
            UserWarning,
        )
        # Only override `credentials` when the deprecated secret name is
        # supplied; unconditionally resetting it to None would discard the
        # caller's credentials.
        credentials = Secret(credentials_secret).get()
    return get_storage_client(credentials=credentials, project=project)
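# Example of the pattern the deprecation warning recommends (a sketch, assuming
# Prefect 1.x's `PrefectSecret` task and a configured `GCP_CREDENTIALS` secret;
# the bucket name is a placeholder):
from prefect import Flow
from prefect.tasks.secrets import PrefectSecret
from prefect.tasks.gcp.storage import GCSUpload

upload = GCSUpload()

with Flow("upload-with-secret") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    blob_name = upload(data="hello, world", bucket="my-bucket", credentials=creds)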
def cleanup_gcs_files(gcp_credentials: dict, url: str, project: str):
    """
    Task to delete files from a GCS prefix.

    Arguments:
        gcp_credentials (dict): GCP credentials in a format required by
            prefect.utilities.gcp.get_storage_client.
        url (str): Pointer to a GCS prefix containing one or more objects
            to delete.
        project (str): Name of the project which contains the target objects.
    """
    gcs_client = get_storage_client(credentials=gcp_credentials, project=project)
    parsed_url = urlparse(url)
    bucket = gcs_client.get_bucket(parsed_url.netloc)
    prefix = parsed_url.path.lstrip("/")
    # The list() call is needed because bucket.list_blobs returns an
    # HTTPIterator object, which does not implement __len__, while
    # bucket.delete_blobs expects an Iterable with __len__.
    blobs = list(bucket.list_blobs(prefix=prefix))
    bucket.delete_blobs(blobs)
    return blobs
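# Example invocation (a sketch; the credentials file, URL, and project name
# are placeholders):
import json

with open("service-account.json") as f:
    creds = json.load(f)

deleted = cleanup_gcs_files(
    gcp_credentials=creds,
    url="gs://my-bucket/tmp/run-123/",
    project="my-project",
)
print(f"Deleted {len(deleted)} objects")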
def _gcs_client(self):  # type: ignore
    from prefect.utilities.gcp import get_storage_client

    client = get_storage_client(project=self.project)
    return client
def run(
    self,
    data: Union[str, bytes],
    bucket: str = None,
    blob: str = None,
    project: str = None,
    chunk_size: int = None,
    credentials: dict = None,
    encryption_key: str = None,
    create_bucket: bool = False,
    encryption_key_secret: str = None,
    content_type: str = None,
    content_encoding: str = None,
    request_timeout: Union[float, Tuple[float, float]] = 60,
) -> str:
    """
    Run method for this Task. Invoked by _calling_ this Task after initialization
    within a Flow context.

    Note that some arguments are required for the task to run, and must be
    provided _either_ at initialization _or_ as arguments.

    Args:
        - data (Union[str, bytes]): the data to upload; can be either a string
            or bytes
        - bucket (str, optional): the bucket name to upload to
        - blob (str, optional): blob name to upload to; if not provided, a string
            beginning with `prefect-` and containing the Task Run ID will be used
        - project (str, optional): Google Cloud project to work within. Can be
            inferred from credentials if not provided.
        - chunk_size (int, optional): the size of a chunk of data whenever
            iterating (in bytes). This must be a multiple of 256 KB per the API
            specification.
        - credentials (dict, optional): a JSON document containing Google Cloud
            credentials. You should provide these at runtime with an upstream
            Secret task. If not provided, Prefect will first check `context` for
            `GCP_CREDENTIALS` and lastly will use default Google client logic.
        - encryption_key (str, optional): an encryption key
        - create_bucket (bool, optional): boolean specifying whether to create
            the bucket if it does not exist, otherwise an Exception is raised.
            Defaults to `False`.
        - encryption_key_secret (str, optional, DEPRECATED): the name of the
            Prefect Secret storing an optional `encryption_key` to be used when
            uploading the Blob.
        - content_type (str, optional): HTTP 'Content-Type' header for this object.
        - content_encoding (str, optional): HTTP 'Content-Encoding' header for
            this object.
        - request_timeout (Union[float, Tuple[float, float]], optional): the number
            of seconds the transport should wait for the server response. Can also
            be passed as a tuple (connect_timeout, read_timeout).

    Raises:
        - google.cloud.exception.NotFound: if `create_bucket=False` and the
            bucket name is not found

    Returns:
        - str: the blob name that now stores the provided data
    """
    # create client
    client = get_storage_client(project=project, credentials=credentials)

    # retrieve bucket
    bucket = self._retrieve_bucket(
        client=client, bucket=bucket, create_bucket=create_bucket
    )

    # identify blob name
    gcs_blob = self._get_blob(
        bucket,
        blob,
        chunk_size=chunk_size,
        encryption_key=encryption_key,
        encryption_key_secret=encryption_key_secret,
    )

    # upload; isinstance is used so that str/bytes subclasses are not
    # silently skipped
    if isinstance(data, str):
        gcs_blob.upload_from_string(data, timeout=request_timeout)
    elif isinstance(data, bytes):
        # Set content type and encoding if supplied.
        # This is likely only desirable if uploading gzip data:
        # https://cloud.google.com/storage/docs/metadata#content-encoding
        if content_type:
            gcs_blob.content_type = content_type
        if content_encoding:
            gcs_blob.content_encoding = content_encoding
        gcs_blob.upload_from_file(io.BytesIO(data), timeout=request_timeout)
    return gcs_blob.name
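# Example usage (a sketch, assuming this is Prefect 1.x's `GCSUpload` task and
# showing the gzip case the content_encoding parameter is intended for; bucket
# and blob names are placeholders):
import gzip

from prefect import Flow
from prefect.tasks.gcp.storage import GCSUpload

upload = GCSUpload()

with Flow("gcs-upload") as flow:
    blob_name = upload(
        data=gzip.compress(b"hello, world"),
        bucket="my-bucket",
        blob="exports/data.txt.gz",
        content_type="text/plain",
        content_encoding="gzip",
    )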
def upload_to_gcs(filenames):
    """Upload each local file in `filenames` to the configured bucket,
    using the filename as the blob name."""
    storage_client = get_storage_client()
    bucket = storage_client.get_bucket(settings.CLOUD_BUCKET)
    for filename in filenames:
        blob = bucket.blob(filename)
        blob.upload_from_filename(filename)
def run(
    self,
    source_bucket: str = None,
    source_blob: str = None,
    dest_bucket: str = None,
    dest_blob: str = None,
    project: str = None,
    credentials: dict = None,
    create_bucket: bool = False,
    request_timeout: Union[float, Tuple[float, float]] = 60,
) -> str:
    """
    Run method for this Task. Invoked by _calling_ this Task after initialization
    within a Flow context.

    Note that some arguments are required for the task to run, and must be
    provided _either_ at initialization _or_ as arguments.

    Args:
        - source_bucket (str, optional): default source bucket name.
        - source_blob (str, optional): default source blob name.
        - dest_bucket (str, optional): default destination bucket name.
        - dest_blob (str, optional): default destination blob name.
        - project (str, optional): default Google Cloud project to work within.
            If not provided, will be inferred from your Google Cloud credentials
        - credentials (dict, optional): a JSON document containing Google Cloud
            credentials. You should provide these at runtime with an upstream
            Secret task. If not provided, Prefect will first check `context` for
            `GCP_CREDENTIALS` and lastly will use default Google client logic.
        - create_bucket (bool, optional): boolean specifying whether to create
            the dest_bucket if it does not exist, otherwise an Exception is
            raised. Defaults to `False`.
        - request_timeout (Union[float, Tuple[float, float]], optional): the number
            of seconds the transport should wait for the server response. Can also
            be passed as a tuple (connect_timeout, read_timeout).

    Returns:
        - str: the name of the destination blob

    Raises:
        - ValueError: if `source_bucket`, `source_blob`, `dest_bucket`, or
            `dest_blob` are missing or point at the same object.
    """
    if None in [source_bucket, source_blob, dest_bucket, dest_blob]:
        raise ValueError("Missing source or destination")
    elif (source_bucket, source_blob) == (dest_bucket, dest_blob):
        raise ValueError("Source and destination are identical.")

    # create client
    client = get_storage_client(project=project, credentials=credentials)

    # get source bucket and blob
    source_bucket_obj = client.get_bucket(source_bucket)
    source_blob_obj = source_bucket_obj.blob(source_blob)

    # get dest bucket
    dest_bucket_obj = self._retrieve_bucket(
        client=client, bucket=dest_bucket, create_bucket=create_bucket
    )

    # copy from source blob to dest bucket
    source_bucket_obj.copy_blob(
        blob=source_blob_obj,
        destination_bucket=dest_bucket_obj,
        new_name=dest_blob,
        timeout=request_timeout,
    )

    return dest_blob
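# Example usage (a sketch, assuming this is Prefect 1.x's `GCSCopy` task; all
# bucket and blob names are placeholders):
from prefect import Flow
from prefect.tasks.gcp.storage import GCSCopy

copy_blob = GCSCopy()

with Flow("gcs-copy") as flow:
    new_name = copy_blob(
        source_bucket="my-source-bucket",
        source_blob="exports/data.csv",
        dest_bucket="my-archive-bucket",
        dest_blob="archive/data.csv",
        create_bucket=True,
    )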