def test_serialize_docker_run(config):
    msg = RunConfigSchema().dump(config)
    config2 = RunConfigSchema().load(msg)
    assert sorted(config.labels) == sorted(config2.labels)
    fields = ["env", "image", "ports"]
    for field in fields:
        assert getattr(config, field) == getattr(config2, field)
def test_serialize_local_run(config):
    msg = RunConfigSchema().dump(config)
    config2 = RunConfigSchema().load(msg)
    assert sorted(config.labels) == sorted(config2.labels)
    fields = ["env", "working_dir"]
    for field in fields:
        assert getattr(config, field) == getattr(config2, field)
def _get_run_config(
    self, flow_run: GraphQLResult, run_config_cls: Type[RunConfig]
) -> RunConfig:
    """
    Get a run_config for the flow, if present. The returned run config is
    always of type `run_config_cls`.

    Args:
        - flow_run (GraphQLResult): A GraphQLResult flow run object
        - run_config_cls (Callable): The expected run-config class

    Returns:
        - RunConfig: The flow run's run-config or an instance of `run_config_cls`
    """
    # If the flow is using a run_config, load it
    if getattr(flow_run, "run_config", None) is not None:
        run_config = RunConfigSchema().load(flow_run.run_config)
        if isinstance(run_config, UniversalRun):
            # Convert to agent-specific run-config
            return run_config_cls(env=run_config.env, labels=run_config.labels)
        elif not isinstance(run_config, run_config_cls):
            msg = (
                "Flow run %s has a `run_config` of type `%s`, only `%s` is supported"
                % (flow_run.id, type(run_config).__name__, run_config_cls.__name__)
            )
            self.logger.error(msg)
            raise TypeError(msg)
        return run_config
    # Otherwise, return the default run_config
    return run_config_cls()
def _from_flow_data(cls, flow_data: dict, **kwargs: Any) -> "FlowView":
    """
    Instantiate a `FlowView` from serialized data.

    This method deserializes objects into their Prefect types.

    Args:
        - flow_data: The dict of serialized data
        - **kwargs: Additional kwargs are passed to __init__ and override
            attributes from `flow_data`
    """
    flow_data = flow_data.copy()

    flow_id = flow_data.pop("id")
    flow_group_data = flow_data.pop("flow_group")
    flow_group_labels = flow_group_data["labels"]
    project_name = flow_data.pop("project")["name"]
    storage = StorageSchema().load(flow_data.pop("storage"))
    run_config = RunConfigSchema().load(flow_data.pop("run_config"))

    # Combine the data from `flow_data` with `kwargs`
    flow_args = {
        **dict(
            flow_id=flow_id,
            project_name=project_name,
            storage=storage,
            flow_group_labels=flow_group_labels,
            run_config=run_config,
            **flow_data,
        ),
        **kwargs,
    }

    return cls(**flow_args)
def test_serialize_ecs_run(config):
    msg = RunConfigSchema().dump(config)
    config2 = RunConfigSchema().load(msg)
    assert sorted(config.labels) == sorted(config2.labels)
    fields = [
        "task_definition",
        "task_definition_path",
        "image",
        "env",
        "cpu",
        "memory",
        "task_role_arn",
        "run_task_kwargs",
    ]
    for field in fields:
        assert getattr(config, field) == getattr(config2, field)
def deploy_flow(self, flow_run: GraphQLResult) -> str:
    """
    Deploy a flow run as an ECS task.

    Args:
        - flow_run (GraphQLResult): A GraphQLResult flow run object

    Returns:
        - str: Information about the deployment
    """
    self.logger.info("Deploying flow run %r", flow_run.id)

    # Load and validate the flow's run_config
    if getattr(flow_run.flow, "run_config", None) is not None:
        run_config = RunConfigSchema().load(flow_run.flow.run_config)
        if not isinstance(run_config, ECSRun):
            self.logger.error(
                "Flow run %s has a `run_config` of type `%s`, only `ECSRun` is supported",
                flow_run.id,
                type(run_config).__name__,
            )
            raise TypeError(
                "Unsupported RunConfig type: %s" % type(run_config).__name__
            )
    else:
        self.logger.error(
            "Flow run %s has a null `run_config`, only `ECSRun` is supported",
            flow_run.id,
        )
        raise ValueError("Flow is missing a `run_config`")

    taskdef_arn = self.get_task_definition_arn(flow_run, run_config)
    if taskdef_arn is None:
        # Register a new task definition
        self.logger.debug(
            "Registering new task definition for flow %s", flow_run.flow.id
        )
        taskdef = self.generate_task_definition(flow_run, run_config)
        resp = self.ecs_client.register_task_definition(**taskdef)
        taskdef_arn = resp["taskDefinition"]["taskDefinitionArn"]
        self.logger.debug(
            "Registered task definition %s for flow %s",
            taskdef_arn,
            flow_run.flow.id,
        )
    else:
        self.logger.debug(
            "Using task definition %s for flow %s", taskdef_arn, flow_run.flow.id
        )

    # Get kwargs to pass to run_task
    kwargs = self.get_run_task_kwargs(flow_run, run_config)

    resp = self.ecs_client.run_task(taskDefinition=taskdef_arn, **kwargs)
    if resp.get("tasks"):
        task_arn = resp["tasks"][0]["taskArn"]
        self.logger.debug("Started task %r for flow run %r", task_arn, flow_run.id)
        return f"Task {task_arn}"

    raise ValueError(
        "Failed to start task for flow run {0}. Failures: {1}".format(
            flow_run.id, resp.get("failures")
        )
    )
def _get_run_config(
    self, flow_run: GraphQLResult, run_config_cls: Type[RunConfig]
) -> Optional[RunConfig]:
    """
    Get a run_config for the flow, if present.

    Args:
        - flow_run (GraphQLResult): A GraphQLResult flow run object
        - run_config_cls (Callable): The expected run-config class

    Returns:
        - RunConfig: The flow run's run-config, or `None` for environment-based flows.
    """
    # If the flow is using a run_config, load it
    if getattr(flow_run, "run_config", None) is not None:
        run_config = RunConfigSchema().load(flow_run.run_config)
        if isinstance(run_config, UniversalRun):
            # Convert to agent-specific run-config
            return run_config_cls(env=run_config.env, labels=run_config.labels)
        elif not isinstance(run_config, run_config_cls):
            msg = (
                "Flow run %s has a `run_config` of type `%s`, only `%s` is supported"
                % (flow_run.id, type(run_config).__name__, run_config_cls.__name__)
            )
            self.logger.error(msg)
            raise TypeError(msg)
        return run_config
    elif getattr(flow_run.flow, "environment", None) is None:
        # No environment, use default run_config
        return run_config_cls()

    return None
def test_serialize_kubernetes_run(config):
    msg = RunConfigSchema().dump(config)
    config2 = RunConfigSchema().load(msg)
    assert sorted(config.labels) == sorted(config2.labels)
    fields = [
        "job_template",
        "job_template_path",
        "image",
        "env",
        "cpu_limit",
        "cpu_request",
        "memory_limit",
        "memory_request",
    ]
    for field in fields:
        assert getattr(config, field) == getattr(config2, field)
def get_flow_image(flow_run: GraphQLResult, default: str = None) -> str:
    """
    Retrieve the image to use for this flow run deployment.

    Args:
        - flow_run (GraphQLResult): A GraphQLResult flow run object
        - default (str, optional): A default image to use. If not specified,
            the `prefecthq/prefect` image corresponding with the flow's prefect
            version will be used.

    Returns:
        - str: a full image name to use for this flow run

    Raises:
        - ValueError: if deployment attempted on unsupported Storage type and
            `image` not present in environment metadata
    """
    from prefect.storage import Docker
    from prefect.serialization.storage import StorageSchema
    from prefect.serialization.run_config import RunConfigSchema
    from prefect.serialization.environment import EnvironmentSchema

    has_run_config = getattr(flow_run, "run_config", None) is not None
    has_environment = getattr(flow_run.flow, "environment", None) is not None

    storage = StorageSchema().load(flow_run.flow.storage)
    # Not having an environment implies run-config based flow, even if
    # run_config is None.
    if has_run_config or not has_environment:
        # Precedence:
        # - Image on docker storage
        # - Image on run_config
        # - Provided default
        # - `prefecthq/prefect` for flow's core version
        if isinstance(storage, Docker):
            return storage.name
        if has_run_config:
            run_config = RunConfigSchema().load(flow_run.run_config)
            if getattr(run_config, "image", None) is not None:
                return run_config.image
        if default is not None:
            return default
        # core_version should always be present, but just in case
        version = flow_run.flow.get("core_version") or "latest"
        cleaned_version = version.split("+")[0]
        return f"prefecthq/prefect:{cleaned_version}"
    else:
        environment = EnvironmentSchema().load(flow_run.flow.environment)
        if hasattr(environment, "metadata") and hasattr(environment.metadata, "image"):
            return environment.metadata.get("image")
        elif isinstance(storage, Docker):
            return storage.name
        raise ValueError(
            f"Storage for flow run {flow_run.id} is not of type Docker and "
            f"environment has no `image` attribute in the metadata field."
        )
def load_active_run_config():
    """Load the `RunConfig` for the currently executing flow run, looked up in
    the backend by the `flow_run_id` in `prefect.context`."""
    client = Client()
    query = {
        "query": {
            with_args("flow_run_by_pk", {"id": prefect.context.flow_run_id}): {
                "run_config": True
            }
        }
    }
    blob = client.graphql(query).data.flow_run_by_pk.run_config
    return RunConfigSchema().load(blob)
def test_serialize_kubernetes_run(config):
    msg = RunConfigSchema().dump(config)
    config2 = RunConfigSchema().load(msg)
    assert sorted(config.labels) == sorted(config2.labels)
    fields = [
        "job_template",
        "job_template_path",
        "image",
        "env",
        "cpu_limit",
        "cpu_request",
        "memory_limit",
        "memory_request",
        "service_account_name",
        "image_pull_secrets",
    ]
    for field in fields:
        assert getattr(config, field) == getattr(config2, field)
def _from_flow_run_data(
    cls, flow_run_data: dict, task_runs: Iterable["TaskRunView"] = None
) -> "FlowRunView":
    """
    Instantiate a `FlowRunView` from serialized data.

    This method deserializes objects into their Prefect types.

    Exists to maintain consistency in the design of backend "View" classes.

    Args:
        - flow_run_data: A dict of flow run data
        - task_runs: An optional iterable of task runs to pre-populate the cache with

    Returns:
        A populated `FlowRunView` instance
    """
    flow_run_data = flow_run_data.copy()  # Avoid mutating the input object

    flow_run_id = flow_run_data.pop("id")
    serialized_state = flow_run_data.pop("serialized_state")
    state = (
        State.deserialize(serialized_state)
        if serialized_state  # Flow run may not have initialized its state yet
        else Pending(message="A state for this flow run is not yet available.")
    )
    run_config_data = flow_run_data.pop("run_config")
    run_config = RunConfigSchema().load(run_config_data) if run_config_data else None

    states_data = flow_run_data.pop("states", [])
    states = list(
        sorted(
            [_TimestampedState.from_dict(state_data) for state_data in states_data],
            key=lambda s: s.timestamp,
        )
    )
    updated_at = cast(pendulum.DateTime, pendulum.parse(flow_run_data.pop("updated")))

    return cls(
        flow_run_id=flow_run_id,
        task_runs=task_runs,
        state=state,
        updated_at=updated_at,
        states=states,
        run_config=run_config,
        **flow_run_data,
    )
def build_flow_run(self, config, storage=None):
    if storage is None:
        storage = Local()

    return GraphQLResult(
        {
            "flow": GraphQLResult(
                {
                    "storage": storage.serialize(),
                    "run_config": RunConfigSchema().dump(config),
                    "id": "new_id",
                    "core_version": "0.13.0",
                }
            ),
            "id": "id",
        }
    )
def get_flow_image(flow_run: GraphQLResult) -> str:
    """
    Retrieve the image to use for this flow run deployment.

    Args:
        - flow_run (GraphQLResult): A GraphQLResult flow run object

    Returns:
        - str: a full image name to use for this flow run

    Raises:
        - ValueError: if deployment attempted on unsupported Storage type and
            `image` not present in environment metadata
    """
    from prefect.environments.storage import Docker
    from prefect.serialization.storage import StorageSchema
    from prefect.serialization.run_config import RunConfigSchema
    from prefect.serialization.environment import EnvironmentSchema

    has_run_config = getattr(flow_run.flow, "run_config", None) is not None
    has_environment = getattr(flow_run.flow, "environment", None) is not None

    storage = StorageSchema().load(flow_run.flow.storage)
    # Not having an environment implies run-config based flow, even if
    # run_config is None.
    if has_run_config or not has_environment:
        if isinstance(storage, Docker):
            return storage.name
        elif has_run_config:
            run_config = RunConfigSchema().load(flow_run.flow.run_config)
            if getattr(run_config, "image", None) is not None:
                return run_config.image
        # No image found on run-config, and no environment present. Use default.
        # core_version should always be present, but just in case
        version = flow_run.flow.get("core_version") or "latest"
        cleaned_version = version.split("+")[0]
        return f"prefecthq/prefect:all_extras-{cleaned_version}"
    else:
        environment = EnvironmentSchema().load(flow_run.flow.environment)
        if hasattr(environment, "metadata") and hasattr(environment.metadata, "image"):
            return environment.metadata.get("image")
        elif isinstance(storage, Docker):
            return storage.name
        raise ValueError(
            f"Storage for flow run {flow_run.id} is not of type Docker and "
            f"environment has no `image` attribute in the metadata field."
        )
""" self.logger.info("Deploying flow run {}".format(flow_run.id)) # type: ignore # 'import docker' is expensive time-wise, we should do this just-in-time to keep # the 'import prefect' time low import docker <<<<<<< HEAD run_config = self._get_run_config(flow_run, DockerRun) assert run_config is None or isinstance(run_config, DockerRun) # mypy image = get_flow_image(flow_run=flow_run) env_vars = self.populate_env_vars(flow_run, image, run_config=run_config) ======= if getattr(flow_run.flow, "run_config", None) is not None: run_config = RunConfigSchema().load(flow_run.flow.run_config) if not isinstance(run_config, DockerRun): self.logger.error( "Flow run %s has a `run_config` of type `%s`, only `DockerRun` is supported", flow_run.id, type(run_config).__name__, ) raise TypeError( "Unsupported RunConfig type: %s" % type(run_config).__name__ ) else: run_config = None image = get_flow_image(flow_run=flow_run) env_vars = self.populate_env_vars(flow_run, run_config=run_config) >>>>>>> prefect clone
def test_serialize_universal_run(config):
    msg = RunConfigSchema().dump(config)
    config2 = RunConfigSchema().load(msg)
    assert config.env == config2.env
    assert sorted(config.labels) == sorted(config2.labels)
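# The serialization tests above all exercise the same dump/load round trip.
# A minimal standalone sketch of that pattern (an illustrative example, assuming
# a Prefect 0.14+/1.x environment where `KubernetesRun` and `RunConfigSchema`
# are importable; field values are arbitrary):
from prefect.run_configs import KubernetesRun
from prefect.serialization.run_config import RunConfigSchema

# Round-trip a run config through the schema; configured fields should survive
config = KubernetesRun(image="example/image:latest", env={"KEY": "VALUE"}, labels=["k8s"])
msg = RunConfigSchema().dump(config)
config2 = RunConfigSchema().load(msg)
assert config2.image == config.image
assert config2.env == config.env
assert sorted(config2.labels) == sorted(config.labels)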
def generate_job_spec_from_run_config(self, flow_run: GraphQLResult) -> dict:
    """Generate a k8s job spec for a flow run.

    Args:
        - flow_run (GraphQLResult): A flow run object

    Returns:
        - dict: a dictionary representation of a k8s job for flow execution
    """
    run_config = RunConfigSchema().load(flow_run.flow.run_config)

    if run_config.job_template:
        job = run_config.job_template
    else:
        job_template_path = run_config.job_template_path or self.job_template_path
        self.logger.debug("Loading job template from %r", job_template_path)
        template_bytes = read_bytes_from_path(job_template_path)
        job = yaml.safe_load(template_bytes)

    identifier = uuid.uuid4().hex[:8]
    job_name = f"prefect-job-{identifier}"

    # Populate job metadata for identification
    k8s_labels = {
        "prefect.io/identifier": identifier,
        "prefect.io/flow_run_id": flow_run.id,  # type: ignore
        "prefect.io/flow_id": flow_run.flow.id,  # type: ignore
    }
    _get_or_create(job, "metadata.labels")
    _get_or_create(job, "spec.template.metadata.labels")
    job["metadata"]["name"] = job_name
    job["metadata"]["labels"].update(**k8s_labels)
    job["spec"]["template"]["metadata"]["labels"].update(**k8s_labels)

    # Get the first container, which is used for the prefect job
    containers = _get_or_create(job, "spec.template.spec.containers", [])
    if not containers:
        containers.append({})
    container = containers[0]

    # Set container image
    container["image"] = image = get_flow_image(flow_run)

    # Set flow run command
    container["args"] = [get_flow_run_command(flow_run)]

    # Populate environment variables from the following sources, with precedence:
    # - Values required for flow execution, hardcoded below
    # - Values set on the KubernetesRun object
    # - Values set using the `--env` CLI flag on the agent
    # - Values in the job template
    env = self.env_vars.copy()
    if run_config.env:
        env.update(run_config.env)
    env.update(
        {
            "PREFECT__CLOUD__API": config.cloud.api,
            "PREFECT__CLOUD__AUTH_TOKEN": config.cloud.agent.auth_token,
            "PREFECT__CLOUD__USE_LOCAL_SECRETS": "false",
            "PREFECT__CONTEXT__FLOW_RUN_ID": flow_run.id,
            "PREFECT__CONTEXT__FLOW_ID": flow_run.flow.id,
            "PREFECT__CONTEXT__IMAGE": image,
            "PREFECT__LOGGING__LOG_TO_CLOUD": str(self.log_to_cloud).lower(),
            "PREFECT__ENGINE__FLOW_RUNNER__DEFAULT_CLASS": "prefect.engine.cloud.CloudFlowRunner",
            "PREFECT__ENGINE__TASK_RUNNER__DEFAULT_CLASS": "prefect.engine.cloud.CloudTaskRunner",
        }
    )
    container_env = [{"name": k, "value": v} for k, v in env.items()]
    for entry in container.get("env", []):
        if entry["name"] not in env:
            container_env.append(entry)
    container["env"] = container_env

    # Set resource requirements if provided
    _get_or_create(container, "resources.requests")
    _get_or_create(container, "resources.limits")
    resources = container["resources"]
    if run_config.memory_request:
        resources["requests"]["memory"] = run_config.memory_request
    if run_config.memory_limit:
        resources["limits"]["memory"] = run_config.memory_limit
    if run_config.cpu_request:
        resources["requests"]["cpu"] = run_config.cpu_request
    if run_config.cpu_limit:
        resources["limits"]["cpu"] = run_config.cpu_limit

    return job
def deploy_flow(self, flow_run: GraphQLResult) -> str:
    """
    Deploy flow runs on your local machine as local processes.

    Args:
        - flow_run (GraphQLResult): A GraphQLResult flow run object

    Returns:
        - str: Information about the deployment

    Raises:
        - ValueError: if deployment attempted on unsupported Storage type
    """
    self.logger.info("Deploying flow run {}".format(flow_run.id))  # type: ignore

    storage = StorageSchema().load(flow_run.flow.storage)
    if isinstance(storage, Docker):
        self.logger.error(
            "Flow run %s has an unsupported storage type: `%s`",
            flow_run.id,
            type(storage).__name__,
        )
        raise TypeError("Unsupported Storage type: %s" % type(storage).__name__)

    # If the flow is using a run_config, load it
    if getattr(flow_run.flow, "run_config", None) is not None:
        run_config = RunConfigSchema().load(flow_run.flow.run_config)
        if not isinstance(run_config, LocalRun):
            self.logger.error(
                "Flow run %s has a `run_config` of type `%s`, only `LocalRun` is supported",
                flow_run.id,
                type(run_config).__name__,
            )
            raise TypeError(
                "Unsupported RunConfig type: %s" % type(run_config).__name__
            )
    else:
        run_config = None

    env = self.populate_env_vars(flow_run, run_config=run_config)

    working_dir = None if run_config is None else run_config.working_dir
    if working_dir and not os.path.exists(working_dir):
        msg = f"Flow run {flow_run.id} has a nonexistent `working_dir` configured: {working_dir}"
        self.logger.error(msg)
        raise ValueError(msg)

    stdout = sys.stdout if self.show_flow_logs else DEVNULL

    # note: we will allow these processes to be orphaned if the agent were to exit
    # before the flow runs have completed. The lifecycle of the agent should not
    # dictate the lifecycle of the flow run. However, if the user has elected to
    # show flow logs, these log entries will continue to stream to the user's
    # terminal until these child processes exit, even if the agent has already exited.
    p = Popen(
        get_flow_run_command(flow_run).split(" "),
        stdout=stdout,
        stderr=STDOUT,
        env=env,
        cwd=working_dir,
    )

    self.processes.add(p)
    self.logger.debug(
        "Submitted flow run {} to process PID {}".format(flow_run.id, p.pid)
    )

    return "PID: {}".format(p.pid)
def deploy_flow(self, flow_run: GraphQLResult) -> str:
    """
    Deploy flow runs on your local machine as Docker containers

    Args:
        - flow_run (GraphQLResult): A GraphQLResult flow run object

    Returns:
        - str: Information about the deployment
    """
    self.logger.info("Deploying flow run {}".format(flow_run.id))  # type: ignore

    # 'import docker' is expensive time-wise, we should do this just-in-time to keep
    # the 'import prefect' time low
    import docker

    if getattr(flow_run.flow, "run_config", None) is not None:
        run_config = RunConfigSchema().load(flow_run.flow.run_config)
        if not isinstance(run_config, DockerRun):
            self.logger.error(
                "Flow run %s has a `run_config` of type `%s`, only `DockerRun` is supported",
                flow_run.id,
                type(run_config).__name__,
            )
            raise TypeError(
                "Unsupported RunConfig type: %s" % type(run_config).__name__
            )
    else:
        run_config = None

    image = get_flow_image(flow_run=flow_run)
    env_vars = self.populate_env_vars(flow_run, run_config=run_config)

    if not self.no_pull and len(image.split("/")) > 1:
        self.logger.info("Pulling image {}...".format(image))
        registry = image.split("/")[0]
        if self.reg_allow_list and registry not in self.reg_allow_list:
            self.logger.error(
                "Trying to pull image from a Docker registry '{}' which"
                " is not in the reg_allow_list".format(registry)
            )
            raise ValueError(
                "Trying to pull image from a Docker registry '{}' which"
                " is not in the reg_allow_list".format(registry)
            )
        else:
            pull_output = self.docker_client.pull(image, stream=True, decode=True)
            for line in pull_output:
                self.logger.debug(line)
            self.logger.info("Successfully pulled image {}...".format(image))

    # Create any named volumes (if they do not already exist)
    for named_volume_name in self.named_volumes:
        try:
            self.docker_client.inspect_volume(name=named_volume_name)
        except docker.errors.APIError:
            self.logger.debug("Creating named volume {}".format(named_volume_name))
            self.docker_client.create_volume(
                name=named_volume_name,
                driver="local",
                labels={"prefect_created": "true"},
            )

    # Create a container
    self.logger.debug("Creating Docker container {}".format(image))

    host_config = {"auto_remove": True}  # type: dict
    container_mount_paths = self.container_mount_paths
    if container_mount_paths:
        host_config.update(binds=self.host_spec)

    if sys.platform.startswith("linux") and self.docker_interface:
        docker_internal_ip = get_docker_ip()
        host_config.update(extra_hosts={"host.docker.internal": docker_internal_ip})

    networking_config = None
    if self.network:
        networking_config = self.docker_client.create_networking_config(
            {self.network: self.docker_client.create_endpoint_config()}
        )

    container = self.docker_client.create_container(
        image,
        command=get_flow_run_command(flow_run),
        environment=env_vars,
        volumes=container_mount_paths,
        host_config=self.docker_client.create_host_config(**host_config),
        networking_config=networking_config,
    )

    # Start the container
    self.logger.debug(
        "Starting Docker container with ID {}".format(container.get("Id"))
    )
    if self.network:
        self.logger.debug(
            "Adding container to docker network: {}".format(self.network)
        )

    self.docker_client.start(container=container.get("Id"))

    if self.show_flow_logs:
        self.stream_flow_logs(container.get("Id"))

    self.logger.debug("Docker container {} started".format(container.get("Id")))

    return "Container ID: {}".format(container.get("Id"))