def test_flow_from_bytes_warns_prefect_version_mismatch(self, monkeypatch):
    s = flow_to_bytes_pickle(Flow("test"))
    monkeypatch.setattr(prefect, "__version__", "0.1.0")
    with pytest.warns(UserWarning, match="This flow was built using Prefect"):
        flow = flow_from_bytes_pickle(s)
    assert isinstance(flow, Flow)
    assert flow.name == "test"

def test_cloudpickle_deserialization_check_passes_and_returns_multiple_objs(self):
    flow_one = flow_to_bytes_pickle(Flow("one"))
    flow_two = flow_to_bytes_pickle(Flow("two"))

    with tempfile.TemporaryDirectory() as tmpdir:
        file_one = os.path.join(tmpdir, "one.flow")
        with open(file_one, "wb") as f:
            f.write(flow_one)

        file_two = os.path.join(tmpdir, "two.flow")
        with open(file_two, "wb") as f:
            f.write(flow_two)

        paths = ["{}".format(file_one), "{}".format(file_two)]
        objs = healthchecks.cloudpickle_deserialization_check(paths)
        assert len(objs) == 2

def build(self) -> "Storage":
    """
    Build the Azure storage object by uploading Flows to an Azure Blob container.
    This will upload all of the flows found in `storage.flows`.

    Returns:
        - Storage: an Azure object that contains information about how and where
            each flow is stored
    """
    self.run_basic_healthchecks()

    if self.stored_as_script:
        if not self.blob_name:
            raise ValueError(
                "A `blob_name` must be provided to show where flow `.py` file is stored in Azure."
            )
        return self

    for flow_name, flow in self._flows.items():
        data = flow_to_bytes_pickle(flow)

        client = self._azure_block_blob_service.get_blob_client(
            container=self.container, blob=self.flows[flow_name]
        )

        self.logger.info(
            "Uploading {} to {}".format(self.flows[flow_name], self.container)
        )

        client.upload_blob(data, overwrite=self.overwrite)

    return self

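# Hedged usage sketch (not from the source): how the pickle-based Azure upload in
# build() above is typically driven. The container name is hypothetical, and the
# blob service is assumed to be configured with valid Azure credentials.
from prefect import Flow
from prefect.storage import Azure

storage = Azure(container="my-flows")  # "my-flows" is a hypothetical container
storage.add_flow(Flow("etl"))          # records the flow's blob location in storage.flows
storage.build()                        # pickles each flow and calls upload_blob as above
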
def test_upload_multiple_flows_to_gcs(self, google_client):
    blob_mock = MagicMock()
    bucket_mock = MagicMock(blob=MagicMock(return_value=blob_mock))
    google_client.return_value.get_bucket = MagicMock(return_value=bucket_mock)

    storage = GCS(bucket="awesome-bucket")
    flows = (Flow("awesome-flow-1"), Flow("awesome-flow-2"))
    for f in flows:
        storage.add_flow(f)

    assert storage.build()
    assert bucket_mock.blob.call_count == 2
    assert blob_mock.upload_from_string.call_count == 2

    expected_blob_calls = []
    expected_upload_calls = []
    for f in flows:
        expected_blob_calls.append(call(blob_name=storage.flows[f.name]))
        expected_upload_calls.append(call(flow_to_bytes_pickle(f)))

    # Note: nothing is uploaded until build() is called, which iterates over a
    # dictionary that is not ordered on older versions of Python.
    bucket_mock.blob.assert_has_calls(expected_blob_calls, any_order=True)
    blob_mock.upload_from_string.assert_has_calls(expected_upload_calls, any_order=True)

def test_cloudpickle_deserialization_check_passes_and_returns_objs(self):
    good_bytes = flow_to_bytes_pickle(Flow("empty"))

    with tempfile.NamedTemporaryFile() as f:
        f.write(good_bytes)
        f.seek(0)
        objs = healthchecks.cloudpickle_deserialization_check(["{}".format(f.name)])

    assert len(objs) == 1

    flow = objs.pop()
    assert isinstance(flow, Flow)
    assert flow.name == "empty"
    assert flow.tasks == set()

def test_upload_flow_to_azure(monkeypatch):
    client = MagicMock(upload_blob=MagicMock())
    service = MagicMock(get_blob_client=MagicMock(return_value=client))
    monkeypatch.setattr("prefect.storage.Azure._azure_block_blob_service", service)

    storage = Azure(container="container")
    f = Flow("test")

    assert f.name not in storage
    assert storage.add_flow(f)
    assert storage.build()

    client.upload_blob.assert_called_once_with(flow_to_bytes_pickle(f), overwrite=False)
    assert f.name in storage

def build(self) -> "Storage":
    """
    Build the GCS storage object by uploading Flows to a GCS bucket.
    This will upload all of the flows found in `storage.flows`.

    Returns:
        - Storage: a GCS object that contains information about how and where
            each flow is stored
    """
    self.run_basic_healthchecks()

    if self.stored_as_script:
        if self.local_script_path:
            for flow_name, flow in self._flows.items():
                self.logger.info(
                    "Uploading script {} to {} in {}".format(
                        self.local_script_path, self.flows[flow.name], self.bucket
                    )
                )

                bucket = self._gcs_client.get_bucket(self.bucket)
                blob = bucket.blob(blob_name=self.flows[flow_name])
                with open(self.local_script_path) as file_obj:
                    blob.upload_from_file(file_obj)
        else:
            if not self.key:
                raise ValueError(
                    "A `key` must be provided to show where flow `.py` file is stored in GCS or "
                    "provide a `local_script_path` pointing to a local script that contains the "
                    "flow."
                )
        return self

    for flow_name, flow in self._flows.items():
        content = flow_to_bytes_pickle(flow)

        bucket = self._gcs_client.get_bucket(self.bucket)
        blob = bucket.blob(blob_name=self.flows[flow_name])

        self.logger.info(
            "Uploading {} to {}".format(self.flows[flow_name], self.bucket)
        )

        blob.upload_from_string(content)

    return self

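# Hedged usage sketch (not from the source): driving the GCS build() above in the
# default pickle mode. The bucket name is hypothetical, and Google Cloud
# credentials are assumed to be available to the storage client.
from prefect import Flow
from prefect.storage import GCS

storage = GCS(bucket="my-prefect-flows")  # hypothetical bucket
storage.add_flow(Flow("etl"))
storage.build()  # pickles each flow and calls blob.upload_from_string as above
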
def test_upload_single_flow_with_custom_key_to_gcs(self, google_client):
    blob_mock = MagicMock()
    bucket_mock = MagicMock(blob=MagicMock(return_value=blob_mock))
    google_client.return_value.get_bucket = MagicMock(return_value=bucket_mock)

    storage = GCS(bucket="awesome-bucket", key="the-best-key")
    f = Flow("awesome-flow")

    assert f.name not in storage
    assert storage.add_flow(f)
    assert f.name in storage
    assert storage.build()

    bucket_mock.blob.assert_called_with(blob_name="the-best-key")
    blob_mock.upload_from_string.assert_called_with(flow_to_bytes_pickle(f))

def test_get_flow_from_gcs(self, google_client):
    f = Flow("awesome-flow")
    flow_content = flow_to_bytes_pickle(f)

    blob_mock = MagicMock(download_as_bytes=MagicMock(return_value=flow_content))
    bucket_mock = MagicMock(get_blob=MagicMock(return_value=blob_mock))
    google_client.return_value.get_bucket = MagicMock(return_value=bucket_mock)

    storage = GCS(bucket="awesome-bucket", key="a-place")
    storage.add_flow(f)
    fetched_flow = storage.get_flow(f.name)

    assert fetched_flow.name == f.name
    bucket_mock.get_blob.assert_called_with("a-place")
    assert blob_mock.download_as_bytes.call_count == 1

def add_flow(self, flow: "Flow") -> str:
    """
    Method for storing a new flow as bytes in the local filesystem.

    Args:
        - flow (Flow): a Prefect Flow to add

    Returns:
        - str: the location of the newly added flow in this Storage object

    Raises:
        - ValueError: if a flow with the same name is already contained in this storage
    """
    if flow.name in self:
        raise ValueError(
            'Name conflict: Flow with the name "{}" is already present in this storage.'.format(
                flow.name
            )
        )

    if self.stored_as_script:
        if not self.path:
            raise ValueError(
                "A `path` must be provided to show where flow `.py` file is stored."
            )
        flow_location = self.path
    else:
        if self.path:
            flow_location = self.path
        else:
            flow_location = os.path.join(
                self.directory,
                slugify(flow.name),
                slugify(pendulum.now("utc").isoformat()),
            )
        os.makedirs(os.path.dirname(flow_location), exist_ok=True)
        with open(flow_location, "wb") as f:
            f.write(flow_to_bytes_pickle(flow))

    self.flows[flow.name] = flow_location
    self._flows[flow.name] = flow
    return flow_location

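# Hedged usage sketch (not from the source): storing a flow with Local storage.
# The directory below is hypothetical; by default the pickled flow is written to
# <directory>/<slugified-flow-name>/<slugified-timestamp>, as add_flow() above shows.
from prefect import Flow
from prefect.storage import Local

storage = Local(directory="/tmp/prefect-flows")  # hypothetical directory
location = storage.add_flow(Flow("local-flow"))
print(location)  # path of the pickled flow on disk
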
def test_flow_from_bytes_error(self, monkeypatch, version_mismatch, import_error):
    exc = ImportError("mymodule") if import_error else ValueError("Oh no!")
    flow = Flow("test", tasks=[RaiseOnLoad(exc)])
    s = flow_to_bytes_pickle(flow)

    if version_mismatch:
        monkeypatch.setattr(prefect, "__version__", "0.0.1")
        monkeypatch.setattr(cloudpickle, "__version__", "0.0.2")

    with pytest.raises(
        FlowStorageError, match="An error occurred while unpickling"
    ) as exc:
        flow_from_bytes_pickle(s)

    msg = "mymodule" if import_error else "Oh no!"
    assert msg in str(exc.value)

    # Extra components only present if relevant
    assert ("missing Python module" in str(exc.value)) == import_error
    assert ("version mismatches" in str(exc.value)) == version_mismatch

def test_get_flow_s3_pickle(s3_client, key):
    f = Flow("test")
    storage = S3(bucket="bucket", key=key)
    key_used = storage.add_flow(f)

    data = flow_to_bytes_pickle(f)
    s3_client.get_object.return_value = {
        "ETag": "my-etag",
        "LastModified": datetime.datetime.now(),
        "Body": io.BytesIO(data),
    }

    f2 = storage.get_flow(f.name)

    assert s3_client.get_object.called
    assert s3_client.get_object.call_args[1]["Bucket"] == "bucket"
    assert s3_client.get_object.call_args[1]["Key"] == key_used

    assert f2.name == f.name
    state = f2.run()
    assert state.is_successful()

def test_get_flow_s3_logs(s3_client, caplog, version_id):
    f = Flow("test")
    storage = S3(bucket="mybucket")
    key_used = storage.add_flow(f)

    data = flow_to_bytes_pickle(f)
    etag = "my-etag"
    t = datetime.datetime.now()
    s3_client.get_object.return_value = {
        "ETag": etag,
        "LastModified": t,
        "Body": io.BytesIO(data),
    }
    if version_id:
        s3_client.get_object.return_value["VersionId"] = version_id

    storage.get_flow(f.name)

    logs = [l.getMessage() for l in caplog.records]
    assert f"s3://mybucket/{key_used}" in logs[0]
    assert (
        f"ETag: {etag}, LastModified: {t.isoformat()}, VersionId: {version_id}"
        in logs[1]
    )

def create_dockerfile_object(self, directory: str) -> str:
    """
    Writes a dockerfile to the provided directory using the specified
    arguments on this Docker storage object.

    In order for the docker python library to build a container it needs a
    Dockerfile that it can use to define the container. This function takes the
    specified arguments then writes them to a temporary file called Dockerfile.

    *Note*: if `files` are added to this container, they will be copied to this
    directory as well.

    Args:
        - directory (str): A directory where the Dockerfile will be created

    Returns:
        - str: the absolute file path to the Dockerfile
    """
    # Either load the base commands from the specified dockerfile or define a FROM
    if self.dockerfile:
        with open(self.dockerfile, "r") as contents:
            base_commands = contents.read()
    else:
        base_commands = "FROM {base_image}".format(base_image=self.base_image)

    # Generate ENV variables to load into the image
    env_vars = ""
    if self.env_vars:
        # Format with repr to get proper quoting
        formatted_vars = [f"{k}={v!r}" for k, v in self.env_vars.items()]
        env_vars = "ENV " + " \\\n ".join(formatted_vars)

    # Generate single pip install command for python dependencies
    pip_installs = "RUN pip install "
    if self.python_dependencies:
        for dependency in self.python_dependencies:
            pip_installs += "{} ".format(dependency)

    # Write all install-time commands that should be run in the image
    installation_commands = ""
    for cmd in self.installation_commands:
        installation_commands += "RUN {}\n".format(cmd)

    # Copy user specified files into the image
    copy_files = ""
    if self.files:
        for src, dest in self.files.items():
            fname = os.path.basename(src)
            full_fname = os.path.join(directory, fname)

            if os.path.exists(full_fname) and filecmp.cmp(src, full_fname) is False:
                raise ValueError(
                    "File {fname} already exists in {directory}".format(
                        fname=full_fname, directory=directory
                    )
                )
            else:
                if os.path.isdir(src):
                    shutil.copytree(
                        src=src, dst=full_fname, symlinks=False, ignore=None
                    )
                else:
                    shutil.copy2(src=src, dst=full_fname)

            copy_files += "COPY {fname} {dest}\n".format(
                fname=full_fname.replace("\\", "/") if self.dockerfile else fname,
                dest=dest,
            )

    # Write all flows to file and load into the image
    copy_flows = ""
    if not self.stored_as_script:
        for flow_name, flow_location in self.flows.items():
            clean_name = slugify(flow_name)
            flow_path = os.path.join(directory, "{}.flow".format(clean_name))
            with open(flow_path, "wb") as f:
                f.write(flow_to_bytes_pickle(self._flows[flow_name]))
            copy_flows += "COPY {source} {dest}\n".format(
                source=(
                    flow_path.replace("\\", "/")
                    if self.dockerfile
                    else "{}.flow".format(clean_name)
                ),
                dest=flow_location,
            )
    else:
        if not self.path:
            raise ValueError(
                "A `path` must be provided to show where flow `.py` file is stored in the image."
            )

    # Write final extra user commands that should be run in the image
    final_commands = (
        ""
        if self.extra_dockerfile_commands is None
        else "\n".join(self.extra_dockerfile_commands)
    )

    # Write a healthcheck script into the image
    with open(
        os.path.join(os.path.dirname(__file__), "_healthcheck.py"), "r"
    ) as healthscript:
        healthcheck = healthscript.read()

    healthcheck_loc = os.path.join(directory, "healthcheck.py")
    with open(healthcheck_loc, "w") as health_file:
        health_file.write(healthcheck)

    # Escape the healthcheck location
    healthcheck_loc = (
        healthcheck_loc.replace("\\", "/") if self.dockerfile else "healthcheck.py"
    )

    # Generate the command to run the healthcheck
    healthcheck_run = ""
    if not self.ignore_healthchecks:
        flow_file_paths = ", ".join(['"{}"'.format(k) for k in self.flows.values()])
        python_version = (sys.version_info.major, sys.version_info.minor)
        healthcheck_run = (
            f"RUN python {self.prefect_directory}/healthcheck.py "
            f"'[{flow_file_paths}]' '{python_version}'"
        )

    file_contents = textwrap.dedent(
        f"""
            {multiline_indent(base_commands, 12)}
            {multiline_indent(env_vars, 12)}
            RUN pip install pip --upgrade
            {multiline_indent(installation_commands, 12)}
            {pip_installs}
            RUN mkdir -p {self.prefect_directory}/

            {multiline_indent(copy_flows, 12)}
            COPY {healthcheck_loc} {self.prefect_directory}/healthcheck.py
            {multiline_indent(copy_files, 12)}

            {multiline_indent(final_commands, 12)}
            {healthcheck_run}
        """
    )

    dockerfile_path = os.path.join(directory, "Dockerfile")
    with open(dockerfile_path, "w+") as dockerfile:
        dockerfile.write(file_contents)

    return dockerfile_path

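# Hedged usage sketch (not from the source): rendering the Dockerfile that Docker
# storage would later hand to the docker client. The base image, dependency list,
# and env var below are hypothetical example values; the parameter names mirror
# the attributes referenced in create_dockerfile_object() above.
import tempfile

from prefect import Flow
from prefect.storage import Docker

storage = Docker(
    base_image="python:3.9-slim",          # hypothetical base image -> FROM line
    python_dependencies=["pandas"],        # folded into the single `RUN pip install ...`
    env_vars={"EXTRA_SETTING": "example"},  # rendered into the ENV block
)
storage.add_flow(Flow("docker-flow"))

with tempfile.TemporaryDirectory() as tmpdir:
    dockerfile_path = storage.create_dockerfile_object(tmpdir)
    print(open(dockerfile_path).read())  # inspect the generated Dockerfile
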
def build(self) -> "Storage":
    """
    Build the S3 storage object by uploading Flows to an S3 bucket. This will
    upload all of the flows found in `storage.flows`. If there is an issue
    uploading to the S3 bucket an error will be logged.

    Returns:
        - Storage: an S3 object that contains information about how and where
            each flow is stored

    Raises:
        - botocore.ClientError: if there is an issue uploading a Flow to S3
    """
    self.run_basic_healthchecks()

    from botocore.exceptions import ClientError

    if self.stored_as_script:
        if self.local_script_path:
            for flow_name, flow in self._flows.items():
                self.logger.info(
                    "Uploading script {} to {} in {}".format(
                        self.local_script_path, self.flows[flow.name], self.bucket
                    )
                )
                try:
                    self._boto3_client.upload_file(
                        self.local_script_path, self.bucket, self.flows[flow_name]
                    )
                except ClientError as err:
                    self.logger.error(
                        "Error uploading Flow script to S3 bucket {}: {}".format(
                            self.bucket, err
                        )
                    )
                    raise err
        else:
            if not self.key:
                raise ValueError(
                    "A `key` must be provided to show where flow `.py` file is stored in S3 or "
                    "provide a `local_script_path` pointing to a local script that contains the "
                    "flow."
                )
        return self

    for flow_name, flow in self._flows.items():
        # Pickle Flow
        data = flow_to_bytes_pickle(flow)

        # Write pickled Flow to stream
        stream = io.BytesIO(data)

        self.logger.info(
            "Uploading {} to {}".format(self.flows[flow_name], self.bucket)
        )

        try:
            self._boto3_client.upload_fileobj(
                stream, Bucket=self.bucket, Key=self.flows[flow_name]
            )
        except ClientError as err:
            self.logger.error(
                "Error uploading Flow to S3 bucket {}: {}".format(self.bucket, err)
            )
            raise err

    return self

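# Hedged usage sketch (not from the source): uploading a pickled flow with S3
# storage. The bucket and key are hypothetical, and boto3 credentials are assumed
# to be configured in the environment.
from prefect import Flow
from prefect.storage import S3

storage = S3(bucket="my-prefect-flows", key="flows/etl.prefect")  # hypothetical names
storage.add_flow(Flow("etl"))
storage.build()  # pickles the flow and calls upload_fileobj as in build() above
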
def test_serialize_deserialize(self):
    s = flow_to_bytes_pickle(Flow("test"))
    assert isinstance(s, bytes)

    flow = flow_from_bytes_pickle(s)
    assert isinstance(flow, Flow)
    assert flow.name == "test"

def build(self) -> "Webhook":
    """
    Build the Webhook storage object by issuing an HTTP request
    to store the flow.

    If `self.stored_as_script` is `True`, this method will read in the
    contents of `self.flow_script_path`, convert it to bytes, and attach it
    to the request as `data`.

    The response from this request is stored in `._build_responses`,
    a dictionary keyed by flow name. If you are using a service where
    all the details necessary to fetch a flow cannot be known until you've
    stored it, you can do something like the following.

    ```python
    import cloudpickle
    import json
    import os
    import random
    import requests

    from prefect import task, Task, Flow
    from prefect.storage import Webhook

    @task
    def random_number():
        return random.randint(0, 100)

    with Flow("test-flow") as flow:
        random_number()

    flow.storage = Webhook(
        build_request_kwargs={
            "url": "some-service/upload",
            "headers": {"Content-Type": "application/octet-stream"},
        },
        build_request_http_method="POST",
        get_flow_request_kwargs={
            "url": "some-service/download",
            "headers": {"Accept": "application/octet-stream"},
        },
        get_flow_request_http_method="GET",
    )

    flow.storage.add_flow(flow)
    res = flow.storage.build()

    # get the ID from the response
    flow_id = res._build_responses[flow.name].json()["id"]

    # update storage
    flow.storage.get_flow_request_kwargs["url"] = f"{GET_ROUTE}/{flow_id}"
    ```

    Returns:
        - Storage: a Webhook storage object

    Raises:
        - requests.exceptions.HTTPError if pushing the flow fails
    """
    self.run_basic_healthchecks()
    self._build_responses = {}

    for flow_name, flow in self._flows.items():
        self.logger.info("Uploading flow '{}'".format(flow_name))

        data = flow_to_bytes_pickle(flow)
        if self.stored_as_script:

            # these checks are here in build() instead of the constructor
            # so that serialization and deserialization of flows does not fail
            if not self.flow_script_path:
                msg = "flow_script_path must be provided if stored_as_script=True"
                self.logger.critical(msg)
                raise RuntimeError(msg)

            if not os.path.isfile(self.flow_script_path):
                msg = "file '{}' passed to flow_script_path does not exist".format(
                    self.flow_script_path
                )
                self.logger.critical(msg)
                raise RuntimeError(msg)

            with open(self.flow_script_path, "r") as f:
                data = f.read().encode("utf-8")

        req_function = self._method_to_function[self.build_request_http_method]

        build_request_kwargs = _render_dict(self.build_request_kwargs)

        if "data" in build_request_kwargs.keys():
            msg = (
                "'data' found in build_request_kwargs. This value is "
                "overwritten with the flow content and should not "
                "be set directly"
            )
            self.logger.warning(msg)
            warnings.warn(msg, RuntimeWarning, stacklevel=2)
        build_request_kwargs["data"] = data

        response = req_function(**build_request_kwargs)  # type: ignore
        response.raise_for_status()

        self._build_responses[flow_name] = response
        self.logger.info("Successfully uploaded flow '{}'".format(flow_name))

    return self