Example #1
    def gcs_bucket(self) -> "google.cloud.storage.bucket.Bucket":
        if not hasattr(self, "_gcs_bucket"):
            from prefect.utilities.gcp import get_storage_client

            client = get_storage_client()
            self._gcs_bucket = client.bucket(self.bucket)
        return self._gcs_bucket
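This getter is designed as a lazily cached property: the storage client is created on first access and the bucket handle is memoized on `self._gcs_bucket`. A minimal sketch of how the surrounding class might look (the `GCSStorage` class name and `bucket` attribute are illustrative, not taken from the source):

class GCSStorage:
    def __init__(self, bucket: str):
        self.bucket = bucket

    @property
    def gcs_bucket(self) -> "google.cloud.storage.bucket.Bucket":
        # create the client lazily and cache the bucket handle on first access
        if not hasattr(self, "_gcs_bucket"):
            from prefect.utilities.gcp import get_storage_client

            client = get_storage_client()
            self._gcs_bucket = client.bucket(self.bucket)
        return self._gcs_bucket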
Example #2
    def run(
        self,
        bucket_name: str = None,
        blob: str = None,
        project: str = None,
        wait_seconds: int = 0,
        fail_if_not_found: bool = True,
        credentials: dict = None,
        request_timeout: Union[float, Tuple[float, float]] = 60,
    ) -> bool:
        """
        Run method for this Task. Invoked by _calling_ this Task after initialization
        within a Flow context.

        Note that some arguments are required for the task to run, and must be
        provided _either_ at initialization _or_ as arguments.

        Args:
            - bucket_name (str, optional): the bucket to check
            - blob (str, optional): object for which to search within the bucket
            - project (str, optional): default Google Cloud project to work within.
                If not provided, will be inferred from your Google Cloud credentials
            - wait_seconds (int, optional): retry until the blob is found or until
                `wait_seconds` have elapsed, whichever comes first. Defaults to 0
            - fail_if_not_found (bool, optional): will raise a `FAIL` signal if the
                blob is not found. Defaults to `True`
            - credentials (dict, optional): a JSON document containing Google Cloud credentials.
                You should provide these at runtime with an upstream Secret task.  If not
                provided, Prefect will first check `context` for `GCP_CREDENTIALS` and lastly
                will use default Google client logic.
            - request_timeout (Union[float, Tuple[float, float]], optional): the number of
                seconds the transport should wait for the server response.
                Can also be passed as a tuple (connect_timeout, read_timeout).

        Returns:
            - bool: the object exists

        Raises:
            - ValueError: if `bucket_name` or `blob` are missing
            - FAIL: if object not found and fail_if_not_found is True

        """
        if None in [bucket_name, blob]:
            raise ValueError("Missing bucket_name or blob")

        # create client
        client = get_storage_client(project=project, credentials=credentials)

        bucket = client.bucket(bucket_name)
        blob_exists = None

        wait, n = 0, 1
        while wait <= wait_seconds and not blob_exists:
            sleep(n)
            wait += n
            n *= 2
            blob_exists = storage.Blob(bucket=bucket, name=blob).exists(client)
        if fail_if_not_found and not blob_exists:
            raise FAIL(message="Blob not found")
        return blob_exists
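A minimal usage sketch, assuming this `run` method belongs to Prefect's `GCSBlobExists` task; the bucket and blob names are illustrative:

from prefect import Flow
from prefect.tasks.gcp import GCSBlobExists

check_blob = GCSBlobExists()

with Flow("wait-for-blob") as flow:
    exists = check_blob(bucket_name="my-bucket", blob="exports/data.csv", wait_seconds=30)

flow.run()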
Example #3
def read_bytes_from_path(path: str) -> bytes:
    """Read bytes from a given path.

    Paths may be local files, or remote files (given a supported file scheme).

    Args:
        - path (str): The file path

    Returns:
        - bytes: The file contents
    """
    parsed = parse_path(path)
    if not parsed.scheme or parsed.scheme in ("file", "agent"):
        with open(parsed.path, "rb") as f:
            return f.read()
    elif parsed.scheme == "gcs":
        from prefect.utilities.gcp import get_storage_client

        client = get_storage_client()
        bucket = client.bucket(parsed.netloc)
        blob = bucket.get_blob(parsed.path.lstrip("/"))
        if blob is None:
            raise ValueError(f"Job template doesn't exist at {path}")
        return blob.download_as_bytes()
    elif parsed.scheme == "s3":
        from prefect.utilities.aws import get_boto_client

        client = get_boto_client(resource="s3")
        stream = io.BytesIO()
        client.download_fileobj(Bucket=parsed.netloc,
                                Key=parsed.path,
                                Fileobj=stream)
        return stream.getvalue()
    else:
        raise ValueError(f"Unsupported file scheme {path}")
Example #4
    def run(
        self,
        bucket: str = None,
        blob: str = None,
        project: str = None,
        credentials: dict = None,
        encryption_key: str = None,
        encryption_key_secret: str = None,
        request_timeout: Union[float, Tuple[float, float]] = 60,
    ) -> str:
        """
        Run method for this Task.  Invoked by _calling_ this Task after initialization
        within a Flow context.

        Note that some arguments are required for the task to run, and must be provided
        _either_ at initialization _or_ as arguments.

        Args:
            - bucket (str, optional): the bucket name to upload to
            - blob (str, optional): blob name to download from
            - project (str, optional): Google Cloud project to work within. If not provided
                here or at initialization, will be inferred from your Google Cloud credentials
            - credentials (dict, optional): a JSON document containing Google Cloud
                credentials.  You should provide these at runtime with an upstream Secret task.
                If not provided, Prefect will first check `context` for `GCP_CREDENTIALS` and
                lastly will use default Google client logic.
            - encryption_key (str, optional): an encryption key
            - encryption_key_secret (str, optional, DEPRECATED): the name of the Prefect Secret
                storing an optional `encryption_key` to be used when uploading the Blob
            - request_timeout (Union[float, Tuple[float, float]], optional): the number of
                seconds the transport should wait for the server response.
                Can also be passed as a tuple (connect_timeout, read_timeout).

        Returns:
            - str: the data from the blob, as a string

        Raises:
            - google.cloud.exceptions.NotFound: if the bucket name is not found
                (this task never creates the bucket)
            - ValueError: if `blob` name hasn't been provided

        """
        # create client
        client = get_storage_client(project=project, credentials=credentials)

        # retrieve bucket
        bucket = self._retrieve_bucket(client=client,
                                       bucket=bucket,
                                       create_bucket=False)

        # identify blob name
        gcs_blob = self._get_blob(
            bucket,
            blob,
            encryption_key=encryption_key,
            encryption_key_secret=encryption_key_secret,
        )
        data = gcs_blob.download_as_string(timeout=request_timeout)
        return data
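A usage sketch, assuming this is the `run` method of Prefect's `GCSDownload` task and that credentials come from an upstream secret task; all names are illustrative:

from prefect import Flow
from prefect.tasks.gcp import GCSDownload
from prefect.tasks.secrets import PrefectSecret

download = GCSDownload()

with Flow("download-blob") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    contents = download(bucket="my-bucket", blob="exports/data.csv", credentials=creds)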
Example #5
    def run(
        self,
        data: str,
        bucket: str = None,
        blob: str = None,
        project: str = None,
        credentials: dict = None,
        encryption_key: str = None,
        create_bucket: bool = False,
        encryption_key_secret: str = None,
    ) -> str:
        """
        Run method for this Task.  Invoked by _calling_ this Task after initialization
        within a Flow context.

        Note that some arguments are required for the task to run, and must be
        provided _either_ at initialization _or_ as arguments.

        Args:
            - data (str): the data to upload; must already be represented as a string
            - bucket (str, optional): the bucket name to upload to
            - blob (str, optional): blob name to upload to; if not provided here or at
                initialization, a string beginning with `prefect-` and containing the Task Run ID
                will be used
            - project (str, optional): Google Cloud project to work within. Can be inferred
                from credentials if not provided.
            - credentials (dict, optional): a JSON document containing Google Cloud credentials.
                You should provide these at runtime with an upstream Secret task.  If not
                provided, Prefect will first check `context` for `GCP_CREDENTIALS` and lastly
                will use default Google client logic.
            - encryption_key (str, optional): an encryption key
            - create_bucket (bool, optional): boolean specifying whether to create the bucket
                if it does not exist, otherwise an Exception is raised. Defaults to `False`.
            - encryption_key_secret (str, optional, DEPRECATED): the name of the Prefect Secret
                storing an optional `encryption_key` to be used when uploading the Blob.

        Raises:
            - google.cloud.exceptions.NotFound: if `create_bucket=False` and the bucket name is
                not found

        Returns:
            - str: the blob name that now stores the provided data
        """
        # create client
        client = get_storage_client(project=project, credentials=credentials)

        # retrieve bucket
        bucket = self._retrieve_bucket(client=client,
                                       bucket=bucket,
                                       create_bucket=create_bucket)

        # identify blob name
        gcs_blob = self._get_blob(
            bucket,
            blob,
            encryption_key=encryption_key,
            encryption_key_secret=encryption_key_secret,
        )
        gcs_blob.upload_from_string(data)
        return gcs_blob.name
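A usage sketch, assuming this is the `run` method of Prefect's `GCSUpload` task; bucket and blob names are illustrative:

from prefect import Flow
from prefect.tasks.gcp import GCSUpload

upload = GCSUpload()

with Flow("upload-report") as flow:
    blob_name = upload(data="col_a,col_b\n1,2\n", bucket="my-bucket", blob="reports/latest.csv")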
Example #6
    def run(
        self,
        source_bucket: str = None,
        source_blob: str = None,
        dest_bucket: str = None,
        dest_blob: str = None,
        project: str = None,
        credentials: dict = None,
    ) -> str:
        """
        Run method for this Task. Invoked by _calling_ this Task after initialization
        within a Flow context.

        Note that some arguments are required for the task to run, and must be
        provided _either_ at initialization _or_ as arguments.

        Args:
            - source_bucket (str, optional): default source bucket name.
            - source_blob (str, optional): default source blob name.
            - dest_bucket (str, optional): default destination bucket name.
            - dest_blob (str, optional): default destination blob name.
            - project (str, optional): default Google Cloud project to work within.
                If not provided, will be inferred from your Google Cloud credentials
            - credentials (dict, optional): a JSON document containing Google Cloud credentials.
                You should provide these at runtime with an upstream Secret task.  If not
                provided, Prefect will first check `context` for `GCP_CREDENTIALS` and lastly
                will use default Google client logic.

        Returns:
            - str: the name of the destination blob

        Raises:
            - ValueError: if `source_bucket`, `source_blob`, `dest_bucket`, or `dest_blob`
                are missing or point at the same object.

        """
        if None in [source_bucket, source_blob, dest_bucket, dest_blob]:
            raise ValueError("Missing source or destination")
        elif (source_bucket, source_blob) == (dest_bucket, dest_blob):
            raise ValueError("Source and destination are identical.")

        # create client
        client = get_storage_client(project=project, credentials=credentials)

        # get source bucket and blob
        source_bucket_obj = client.get_bucket(source_bucket)
        source_blob_obj = source_bucket_obj.blob(source_blob)
        # get dest bucket
        dest_bucket_obj = client.get_bucket(dest_bucket)
        # copy from source blob to dest bucket
        source_bucket_obj.copy_blob(blob=source_blob_obj,
                                    destination_bucket=dest_bucket_obj,
                                    new_name=dest_blob)

        return dest_blob
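A usage sketch, assuming this `run` method belongs to Prefect's `GCSCopy` task; bucket and blob names are illustrative:

from prefect import Flow
from prefect.tasks.gcp import GCSCopy

copy_blob = GCSCopy()

with Flow("archive-blob") as flow:
    new_blob = copy_blob(
        source_bucket="raw-bucket",
        source_blob="data.csv",
        dest_bucket="archive-bucket",
        dest_blob="2021/data.csv",
    )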
Example #7
    def gcs_bucket(self) -> "google.cloud.storage.bucket.Bucket":
        if not hasattr(self, "_gcs_bucket"):
            from prefect.utilities.gcp import get_storage_client

            if self.credentials_secret:
                credentials = Secret(self.credentials_secret).get()
            else:
                credentials = None
            client = get_storage_client(credentials=credentials)
            self._gcs_bucket = client.bucket(self.bucket)
        return self._gcs_bucket
Example #8
    def initialize_client(self) -> None:
        """
        Initializes GCS connections.
        """
        from prefect.utilities.gcp import get_storage_client

        if self.credentials_secret:
            credentials = Secret(self.credentials_secret).get()
        else:
            credentials = None
        client = get_storage_client(credentials=credentials)
        self.gcs_bucket = client.bucket(self.bucket)
Example #9
    def _get_client(self,
                    project: str,
                    credentials: dict,
                    credentials_secret: str = None):
        """
        Creates and returns a GCS Client instance.
        """
        if credentials_secret is not None:
            warnings.warn(
                "The `credentials_secret` argument is deprecated. Use a `Secret` task "
                "to pass the credentials value at runtime instead.",
                UserWarning,
            )
            # the deprecated secret, when given, overrides the passed-in credentials
            credentials = Secret(credentials_secret).get()
        return get_storage_client(credentials=credentials, project=project)
Example #10
def cleanup_gcs_files(gcp_credentials: dict, url: str, project: str):
    """
    Task to delete files from a GCS prefix.

    Arguments:
      gcp_credentials (dict): GCP credentials in a format required by prefect.utilities.gcp.get_storage_client.
      url (str): Pointer to a GCS prefix containing one or more objects to delete.
      project (str): Name of the project which contains the target objects.
    """
    gcs_client = get_storage_client(credentials=gcp_credentials, project=project)
    parsed_url = urlparse(url)
    bucket = gcs_client.get_bucket(parsed_url.netloc)
    prefix = parsed_url.path.lstrip("/")
    # The list function is needed because bucket.list_blobs returns an
    # HTTPIterator object, which does not implement __len__.
    # But bucket.delete_blobs expects an Iterable with __len__.
    blobs = list(bucket.list_blobs(prefix=prefix))
    bucket.delete_blobs(blobs)
    return blobs
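If this function is decorated as a Prefect task, it can be wired into a flow with credentials supplied by a secret; the secret name, URL, and project below are illustrative:

from prefect import Flow, task
from prefect.tasks.secrets import PrefectSecret

cleanup_task = task(cleanup_gcs_files)

with Flow("gcs-cleanup") as flow:
    creds = PrefectSecret("GCP_CREDENTIALS")
    cleanup_task(gcp_credentials=creds, url="gs://my-bucket/tmp/", project="my-project")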
Example #11
    def _gcs_client(self):  # type: ignore
        from prefect.utilities.gcp import get_storage_client

        client = get_storage_client(project=self.project)
        return client
Example #12
    def run(
        self,
        data: Union[str, bytes],
        bucket: str = None,
        blob: str = None,
        project: str = None,
        chunk_size: int = None,
        credentials: dict = None,
        encryption_key: str = None,
        create_bucket: bool = False,
        encryption_key_secret: str = None,
        content_type: str = None,
        content_encoding: str = None,
        request_timeout: Union[float, Tuple[float, float]] = 60,
    ) -> str:
        """
        Run method for this Task.  Invoked by _calling_ this Task after initialization
        within a Flow context.

        Note that some arguments are required for the task to run, and must be
        provided _either_ at initialization _or_ as arguments.

        Args:
            - data (Union[str, bytes]): the data to upload; can be either string or bytes
            - bucket (str, optional): the bucket name to upload to
            - blob (str, optional): blob name to upload to; if not provided here or at
                initialization, a string beginning with `prefect-` and containing the Task Run ID
                will be used
            - project (str, optional): Google Cloud project to work within. Can be inferred
                from credentials if not provided.
            - chunk_size (int, optional): The size of a chunk of data whenever iterating (in bytes).
                This must be a multiple of 256 KB per the API specification.
            - credentials (dict, optional): a JSON document containing Google Cloud credentials.
                You should provide these at runtime with an upstream Secret task.  If not
                provided, Prefect will first check `context` for `GCP_CREDENTIALS` and lastly
                will use default Google client logic.
            - encryption_key (str, optional): an encryption key
            - create_bucket (bool, optional): boolean specifying whether to create the bucket
                if it does not exist, otherwise an Exception is raised. Defaults to `False`.
            - encryption_key_secret (str, optional, DEPRECATED): the name of the Prefect Secret
                storing an optional `encryption_key` to be used when uploading the Blob.
            - content_type (str, optional): HTTP ‘Content-Type’ header for this object.
            - content_encoding (str, optional): HTTP ‘Content-Encoding’ header for this object.
            - request_timeout (Union[float, Tuple[float, float]], optional): the number of
                seconds the transport should wait for the server response.
                Can also be passed as a tuple (connect_timeout, read_timeout).

        Raises:
            - google.cloud.exceptions.NotFound: if `create_bucket=False` and the bucket name is
                not found

        Returns:
            - str: the blob name that now stores the provided data
        """
        # create client
        client = get_storage_client(project=project, credentials=credentials)

        # retrieve bucket
        bucket = self._retrieve_bucket(client=client,
                                       bucket=bucket,
                                       create_bucket=create_bucket)

        # identify blob name
        gcs_blob = self._get_blob(
            bucket,
            blob,
            chunk_size=chunk_size,
            encryption_key=encryption_key,
            encryption_key_secret=encryption_key_secret,
        )

        # Upload
        if isinstance(data, str):
            gcs_blob.upload_from_string(data, timeout=request_timeout)
        elif isinstance(data, bytes):
            # Set content type and encoding if supplied.
            # This is likely only desirable if uploading gzip data:
            # https://cloud.google.com/storage/docs/metadata#content-encoding
            if content_type:
                gcs_blob.content_type = content_type
            if content_encoding:
                gcs_blob.content_encoding = content_encoding
            gcs_blob.upload_from_file(io.BytesIO(data),
                                      timeout=request_timeout)
        return gcs_blob.name
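As the comment in the bytes branch notes, `content_encoding` is mainly useful for gzip payloads. A sketch of that path, calling the task's `run` method directly; `upload_task` and all other names are illustrative:

import gzip

payload = gzip.compress(b"col_a,col_b\n1,2\n")
blob_name = upload_task.run(  # upload_task: an instance of the task above
    data=payload,
    bucket="my-bucket",
    blob="exports/data.csv.gz",
    content_type="text/csv",
    content_encoding="gzip",
)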
Example #13
def upload_to_gcs(filenames):
    storage_client = get_storage_client()
    bucket = storage_client.get_bucket(settings.CLOUD_BUCKET)
    for filename in filenames:
        blob = bucket.blob(filename)
        blob.upload_from_filename(filename)
Example #14
    def run(
        self,
        source_bucket: str = None,
        source_blob: str = None,
        dest_bucket: str = None,
        dest_blob: str = None,
        project: str = None,
        credentials: dict = None,
        create_bucket: bool = False,
        request_timeout: Union[float, Tuple[float, float]] = 60,
    ) -> str:
        """
        Run method for this Task. Invoked by _calling_ this Task after initialization
        within a Flow context.

        Note that some arguments are required for the task to run, and must be
        provided _either_ at initialization _or_ as arguments.

        Args:
            - source_bucket (str, optional): default source bucket name.
            - source_blob (str, optional): default source blob name.
            - dest_bucket (str, optional): default destination bucket name.
            - dest_blob (str, optional): default destination blob name.
            - project (str, optional): default Google Cloud project to work within.
                If not provided, will be inferred from your Google Cloud credentials
            - credentials (dict, optional): a JSON document containing Google Cloud credentials.
                You should provide these at runtime with an upstream Secret task.  If not
                provided, Prefect will first check `context` for `GCP_CREDENTIALS` and lastly
                will use default Google client logic.
            - create_bucket (bool, optional): boolean specifying whether to create the dest_bucket
                if it does not exist, otherwise an Exception is raised. Defaults to `False`.
            - request_timeout (Union[float, Tuple[float, float]], optional): the number of
                seconds the transport should wait for the server response.
                Can also be passed as a tuple (connect_timeout, read_timeout).

        Returns:
            - str: the name of the destination blob

        Raises:
            - ValueError: if `source_bucket`, `source_blob`, `dest_bucket`, or `dest_blob`
                are missing or point at the same object.

        """
        if None in [source_bucket, source_blob, dest_bucket, dest_blob]:
            raise ValueError("Missing source or destination")
        elif (source_bucket, source_blob) == (dest_bucket, dest_blob):
            raise ValueError("Source and destination are identical.")

        # create client
        client = get_storage_client(project=project, credentials=credentials)

        # get source bucket and blob
        source_bucket_obj = client.get_bucket(source_bucket)
        source_blob_obj = source_bucket_obj.blob(source_blob)
        # get dest bucket
        dest_bucket_obj = self._retrieve_bucket(client=client,
                                                bucket=dest_bucket,
                                                create_bucket=create_bucket)
        # copy from source blob to dest bucket
        source_bucket_obj.copy_blob(
            blob=source_blob_obj,
            destination_bucket=dest_bucket_obj,
            new_name=dest_blob,
            timeout=request_timeout,
        )

        return dest_blob
Example #15
            - request_timeout (Union[float, Tuple[float, float]], optional): the number of
                seconds the transport should wait for the server response.
                Can also be passed as a tuple (connect_timeout, read_timeout).

        Returns:
            - str: the data from the blob, as a string

        Raises:
            - google.cloud.exceptions.NotFound: if the bucket name is not found
                (this task never creates the bucket)
            - ValueError: if `blob` name hasn't been provided

        """
        # create client
        client = get_storage_client(project=project, credentials=credentials)

        # retrieve bucket
        bucket = self._retrieve_bucket(
            client=client, bucket=bucket, create_bucket=False
        )

        # identify blob name
        gcs_blob = self._get_blob(
            bucket,
            blob,
            chunk_size=chunk_size,
            encryption_key=encryption_key,
            encryption_key_secret=encryption_key_secret,
        )
        data = gcs_blob.download_as_string(timeout=request_timeout)
        return data