Example #1
    def deploy_flows(self, flow_runs: list) -> None:
        """
        Deploy flow runs on your local machine as Docker containers

        Args:
            - flow_runs (list): A list of GraphQLResult flow run objects
        """
        for flow_run in flow_runs:

            storage = StorageSchema().load(flow_run.flow.storage)
            if not isinstance(storage, Docker):
                self.logger.error(
                    "Storage for flow run {} is not of type Docker.".format(flow_run.id)
                )
                continue

            env_vars = self.populate_env_vars(flow_run=flow_run)

            if not self.no_pull:
                self.logger.debug("Pulling image {}...".format(storage.name))
                try:
                    self.docker_client.pull(storage.name)
                except docker.errors.APIError:
                    self.logger.error("Issue pulling image {}".format(storage.name))

            # Create a container
            container = self.docker_client.create_container(
                storage.name, command="prefect execute cloud-flow", environment=env_vars
            )

            # Start the container
            self.docker_client.start(container=container.get("Id"))
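
The `docker_client` in these agent examples follows docker-py's low-level API: `pull`, `create_container`, and `start`, with containers represented as dicts carrying an `Id` key. A minimal sketch of how such a client could be constructed and driven (the `base_url` and image name are assumptions, not values from the example):

    import docker

    # Low-level docker-py client talking to the local daemon socket
    docker_client = docker.APIClient(base_url="unix://var/run/docker.sock")

    container = docker_client.create_container(
        "prefecthq/prefect:latest",  # hypothetical image name
        command="prefect execute cloud-flow",
    )
    docker_client.start(container=container.get("Id"))
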
Example #2
    def deploy_flows(self, flow_runs: list) -> None:
        """
        Deploy flow runs on your local machine as Docker containers

        Args:
            - flow_runs (list): A list of GraphQLResult flow run objects
        """
        for flow_run in flow_runs:
            self.logger.info("Deploying flow run {}".format(
                flow_run.id)  # type: ignore
                             )

            storage = StorageSchema().load(flow_run.flow.storage)
            if not isinstance(storage, Docker):
                msg = "Storage for flow run {} is not of type Docker.".format(
                    flow_run.id)
                state_msg = "Agent {} failed to run flow: ".format(
                    self.name) + msg
                self.client.set_flow_run_state(flow_run.id,
                                               version=flow_run.version,
                                               state=Failed(state_msg))
                self.logger.error(msg)
                continue

            env_vars = self.populate_env_vars(flow_run=flow_run)

            if not self.no_pull and storage.registry_url:
                self.logger.info("Pulling image {}...".format(storage.name))
                try:
                    pull_output = self.docker_client.pull(storage.name,
                                                          stream=True,
                                                          decode=True)
                    for line in pull_output:
                        self.logger.debug(line)
                    self.logger.info("Successfully pulled image {}...".format(
                        storage.name))
                except docker.errors.APIError:
                    msg = "Issue pulling image {}".format(storage.name)
                    state_msg = (
                        "Agent {} failed to pull image for flow: ".format(self.name)
                        + msg
                    )
                    self.client.set_flow_run_state(flow_run.id,
                                                   version=flow_run.version,
                                                   state=Failed(state_msg))
                    self.logger.error(msg)

            # Create a container
            self.logger.debug("Creating Docker container {}".format(
                storage.name))
            container = self.docker_client.create_container(
                storage.name,
                command="prefect execute cloud-flow",
                environment=env_vars)

            # Start the container
            self.logger.debug("Starting Docker container with ID {}".format(
                container.get("Id")))
            self.docker_client.start(container=container.get("Id"))
Example #3
    def replace_job_spec_yaml(self, flow_run: GraphQLResult) -> dict:
        """
        Populate metadata and variables in the job_spec.yaml file for flow runs

        Args:
            - flow_run (GraphQLResult): A flow run object

        Returns:
            - dict: a dictionary representing the populated yaml object
        """
        with open(path.join(path.dirname(__file__), "job_spec.yaml"), "r") as job_file:
            job = yaml.safe_load(job_file)

        identifier = str(uuid.uuid4())[:8]
        job_name = "prefect-job-{}".format(identifier)

        # Populate job metadata for identification
        job["metadata"]["name"] = job_name
        job["metadata"]["labels"]["app"] = job_name
        job["metadata"]["labels"]["identifier"] = identifier
        job["metadata"]["labels"]["flow_run_id"] = flow_run.id  # type: ignore
        job["metadata"]["labels"]["flow_id"] = flow_run.flow.id  # type: ignore
        job["spec"]["template"]["metadata"]["labels"]["app"] = job_name
        job["spec"]["template"]["metadata"]["labels"][
            "flow_run_id"
        ] = flow_run.id  # type: ignore
        job["spec"]["template"]["metadata"]["labels"]["identifier"] = identifier

        # Use flow storage image for job
        image = StorageSchema().load(flow_run.flow.storage).name  # type: ignore
        job["spec"]["template"]["spec"]["containers"][0]["image"] = image

        self.logger.debug("Using image {} for job".format(image))

        # Populate environment variables for flow run execution
        env = job["spec"]["template"]["spec"]["containers"][0]["env"]

        env[0]["value"] = config.cloud.api or "https://api.prefect.io"
        env[1]["value"] = config.cloud.agent.auth_token
        env[2]["value"] = flow_run.id  # type: ignore
        env[3]["value"] = os.getenv("NAMESPACE", "default")

        # Use image pull secrets if provided
        job["spec"]["template"]["spec"]["imagePullSecrets"][0]["name"] = os.getenv(
            "IMAGE_PULL_SECRETS", ""
        )

        return job
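
Note that the positional indexing into `env` above only works because the bundled `job_spec.yaml` template fixes the order of the environment entries. A minimal sketch of a template consistent with this code, shown as the dict `yaml.safe_load` would produce (the exact env var names are assumptions inferred from the other examples on this page, not the shipped file):

    job_spec = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {"name": "", "labels": {}},
        "spec": {
            "template": {
                "metadata": {"labels": {}},
                "spec": {
                    "containers": [
                        {
                            "name": "flow",
                            "image": "",
                            "env": [
                                {"name": "PREFECT__CLOUD__API", "value": ""},               # env[0]
                                {"name": "PREFECT__CLOUD__AGENT__AUTH_TOKEN", "value": ""}, # env[1]
                                {"name": "PREFECT__CONTEXT__FLOW_RUN_ID", "value": ""},     # env[2]
                                {"name": "PREFECT__CONTEXT__NAMESPACE", "value": ""},       # env[3]
                            ],
                        }
                    ],
                    "imagePullSecrets": [{"name": ""}],
                    "restartPolicy": "Never",
                },
            }
        },
    }
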
Example #4
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy flow runs on your local machine as Docker containers

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - ValueError: if deployment attempted on unsupported Storage type
        """
        self.logger.info(
            "Deploying flow run {}".format(flow_run.id)  # type: ignore
        )

        storage = StorageSchema().load(flow_run.flow.storage)
        if not isinstance(storage, Docker):
            self.logger.error(
                "Storage for flow run {} is not of type Docker.".format(flow_run.id)
            )
            raise ValueError("Unsupported Storage type")

        env_vars = self.populate_env_vars(flow_run=flow_run)

        if not self.no_pull and storage.registry_url:
            self.logger.info("Pulling image {}...".format(storage.name))

            pull_output = self.docker_client.pull(
                storage.name, stream=True, decode=True
            )
            for line in pull_output:
                self.logger.debug(line)
            self.logger.info("Successfully pulled image {}...".format(storage.name))

        # Create a container
        self.logger.debug("Creating Docker container {}".format(storage.name))
        container = self.docker_client.create_container(
            storage.name, command="prefect execute cloud-flow", environment=env_vars
        )

        # Start the container
        self.logger.debug(
            "Starting Docker container with ID {}".format(container.get("Id"))
        )
        self.docker_client.start(container=container.get("Id"))

        self.logger.debug("Docker container {} started".format(container.get("Id")))

        return "Container ID: {}".format(container.get("Id"))
Example #5
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy a flow run on your local machine as a local process

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - TypeError: if deployment attempted on unsupported Storage type
            - ValueError: if the configured `working_dir` does not exist
        """
        storage = StorageSchema().load(flow_run.flow.storage)
        if isinstance(storage, Docker):
            self.logger.error(
                "Flow run %s has an unsupported storage type: `%s`",
                flow_run.id,
                type(storage).__name__,
            )
            raise TypeError("Unsupported Storage type: %s" % type(storage).__name__)

        run_config = self._get_run_config(flow_run, LocalRun)
        env = self.populate_env_vars(flow_run, run_config=run_config)

        working_dir = None if run_config is None else run_config.working_dir
        if working_dir and not os.path.exists(working_dir):
            msg = f"Flow run {flow_run.id} has a nonexistent `working_dir` configured: {working_dir}"
            self.logger.error(msg)
            raise ValueError(msg)

        stdout = sys.stdout if self.show_flow_logs else DEVNULL

        # note: we will allow these processes to be orphaned if the agent were to exit
        # before the flow runs have completed. The lifecycle of the agent should not
        # dictate the lifecycle of the flow run. However, if the user has elected to
        # show flow logs, these log entries will continue to stream to the user's
        # until these child processes exit, even if the agent has already exited.
        p = Popen(
            [sys.executable, "-m", "prefect", "execute", "flow-run"],
            stdout=stdout,
            stderr=STDOUT,
            env=env,
            cwd=working_dir,
        )

        self.processes.add(p)
        self.logger.debug(
            "Submitted flow run {} to process PID {}".format(flow_run.id, p.pid)
        )

        if self.block_during_deploy:
            self.logger.debug(f"Waiting flow run {flow_run.id} to complete")
            # We use communicate rather than .wait() to avoid potential hanging
            # of processes. See Popen docs for more info:
            # https://docs.python.org/3/library/subprocess.html#subprocess.Popen.wait
            p.communicate()
            self.logger.debug(f"Flow run {flow_run.id} completed")

        return "PID: {}".format(p.pid)
Example #6
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy a flow run on your local machine as a local process

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - ValueError: if deployment attempted on unsupported Storage type
        """
        self.logger.info("Deploying flow run {}".format(
            flow_run.id)  # type: ignore
                         )

        if not isinstance(
            StorageSchema().load(flow_run.flow.storage),
            (Local, Azure, GCS, S3, GitHub, Webhook),
        ):
            self.logger.error(
                "Storage for flow run {} is not a supported type.".format(
                    flow_run.id))
            raise ValueError("Unsupported Storage type")

        env_vars = self.populate_env_vars(flow_run=flow_run)
        current_env = os.environ.copy()
        current_env.update(env_vars)

        python_path = []
        if current_env.get("PYTHONPATH"):
            python_path.append(current_env.get("PYTHONPATH"))

        python_path.append(os.getcwd())

        if self.import_paths:
            python_path += self.import_paths

        current_env["PYTHONPATH"] = ":".join(python_path)

        stdout = sys.stdout if self.show_flow_logs else DEVNULL

        # note: we will allow these processes to be orphaned if the agent were to exit
        # before the flow runs have completed. The lifecycle of the agent should not
        # dictate the lifecycle of the flow run. However, if the user has elected to
        # show flow logs, these log entries will continue to stream to the user's
        # until these child processes exit, even if the agent has already exited.
        p = Popen(
            get_flow_run_command(flow_run).split(" "),
            stdout=stdout,
            stderr=STDOUT,
            env=current_env,
        )

        self.processes.add(p)
        self.logger.debug("Submitted flow run {} to process PID {}".format(
            flow_run.id, p.pid))

        return "PID: {}".format(p.pid)
Example #7
    def replace_job_spec_json(self, flow_run: GraphQLResult) -> dict:
        """
        Populate metadata and variables in the job_spec.nomad file for flow runs

        Args:
            - flow_run (GraphQLResult): A flow run object

        Returns:
            - dict: a dictionary representing the populated json object
        """
        with open(path.join(path.dirname(__file__), "job_spec.nomad"),
                  "r") as job_file:
            job = json.load(job_file)

        job["Job"]["ID"] = flow_run.id  # type: ignore
        job["Job"]["Name"] = "prefect-job-{}".format(str(uuid.uuid4())[:8])

        job["Job"]["TaskGroups"][0]["Name"] = "prefect-job-{}".format(
            flow_run.id  # type: ignore
        )
        job["Job"]["TaskGroups"][0]["Tasks"][0][
            "Name"] = flow_run.id  # type: ignore

        job["Job"]["TaskGroups"][0]["Tasks"][0]["Config"]["image"] = (
            StorageSchema().load(flow_run.flow.storage).name  # type: ignore
        )

        env = job["Job"]["TaskGroups"][0]["Tasks"][0]["Env"]
        env["PREFECT__CLOUD__API"] = config.cloud.api or "https://api.prefect.io"
        env["PREFECT__CLOUD__AGENT__AUTH_TOKEN"] = config.cloud.agent.auth_token
        env["PREFECT__CONTEXT__FLOW_RUN_ID"] = flow_run.id  # type: ignore
        env["PREFECT__CONTEXT__NAMESPACE"] = os.getenv("NAMESPACE", "default")

        return job
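
The structure of `job_spec.nomad` can be read off the keys this method touches. A sketch of the minimal JSON skeleton it implies, shown as the dict `json.load` would produce (the `Driver` entry is an assumption; everything else mirrors the accesses above):

    job_spec = {
        "Job": {
            "ID": "",
            "Name": "",
            "TaskGroups": [
                {
                    "Name": "",
                    "Tasks": [
                        {
                            "Name": "",
                            "Driver": "docker",       # assumption: Docker task driver
                            "Config": {"image": ""},  # filled from flow storage
                            "Env": {},                # filled with PREFECT__* variables
                        }
                    ],
                }
            ],
        }
    }
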
Example #8
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy flow runs on to a k8s cluster as jobs

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - ValueError: if deployment attempted on unsupported Storage type
        """
        self.logger.info("Deploying flow run {}".format(
            flow_run.id)  # type: ignore
                         )

        # Require Docker storage
        if not isinstance(StorageSchema().load(flow_run.flow.storage), Docker):
            self.logger.error(
                "Storage for flow run {} is not of type Docker.".format(
                    flow_run.id))
            raise ValueError("Unsupported Storage type")

        job_spec = self.replace_job_spec_yaml(flow_run)

        self.logger.debug("Creating namespaced job {}".format(
            job_spec["metadata"]["name"]))
        job = self.batch_client.create_namespaced_job(
            namespace=self.namespace or os.getenv("NAMESPACE", "default"),
            body=job_spec)

        self.logger.debug("Job {} created".format(job.metadata.name))

        return "Job {}".format(job.metadata.name)
Example #9
    def deploy_flows(self, flow_runs: list) -> None:
        """
        Deploy flow runs on to a k8s cluster as jobs

        Args:
            - flow_runs (list): A list of GraphQLResult flow run objects
        """
        for flow_run in flow_runs:
            self.logger.debug("Deploying flow run {}".format(
                flow_run.id)  # type: ignore
                              )

            # Require Docker storage
            if not isinstance(StorageSchema().load(flow_run.flow.storage),
                              Docker):
                self.logger.error(
                    "Storage for flow run {} is not of type Docker.".format(
                        flow_run.id))
                continue

            job_spec = self.replace_job_spec_yaml(flow_run)

            self.logger.debug("Creating namespaced job {}".format(
                job_spec["metadata"]["name"]))
            self.batch_client.create_namespaced_job(namespace=os.getenv(
                "NAMESPACE", "default"),
                                                    body=job_spec)
Example #10
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy flow runs on to a Nomad cluster as jobs

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - ValueError: if deployment attempted on unsupported Storage type
        """
        if not isinstance(StorageSchema().load(flow_run.flow.storage), Docker):
            self.logger.error(
                "Storage for flow run {} is not of type Docker.".format(
                    flow_run.id))
            raise ValueError("Unsupported Storage type")

        # 'import requests' is expensive time-wise; do it just-in-time to keep
        # the 'import prefect' time low
        import requests

        job_spec = self.replace_job_spec_json(flow_run)
        nomad_host = os.getenv("NOMAD_HOST", "http://127.0.0.1:4646")
        requests.post(path.join(nomad_host, "v1/jobs"), json=job_spec)

        return "Job ID: {}".format(job_spec["Job"]["ID"])
Example #11
    def deploy_flows(self, flow_runs: list) -> None:
        """
        Deploy flow runs to Fargate

        Args:
            - flow_runs (list): A list of GraphQLResult flow run objects
        """
        for flow_run in flow_runs:
            self.logger.debug("Deploying flow run {}".format(
                flow_run.id)  # type: ignore
                              )

            # Require Docker storage
            if not isinstance(StorageSchema().load(flow_run.flow.storage),
                              Docker):
                self.logger.error(
                    "Storage for flow run {} is not of type Docker.".format(
                        flow_run.id))
                continue

            # check if task definition exists
            self.logger.debug("Checking for task definition")
            if not self._verify_task_definition_exists(flow_run):
                self.logger.debug("No task definition found")
                self._create_task_definition(flow_run)

            # run task
            self._run_task(flow_run)
Example #12
    def _from_flow_data(cls, flow_data: dict, **kwargs: Any) -> "FlowView":
        """
        Instantiate a `FlowView` from serialized data

        This method deserializes objects into their Prefect types.

        Args:
            - flow_data: The dict of serialized data
            - **kwargs: Additional kwargs are passed to `__init__` and override
                attributes from `flow_data`
        """
        flow_data = flow_data.copy()

        flow_id = flow_data.pop("id")
        flow_group_data = flow_data.pop("flow_group")
        flow_group_labels = flow_group_data["labels"]
        project_name = flow_data.pop("project")["name"]
        storage = StorageSchema().load(flow_data.pop("storage"))
        run_config = RunConfigSchema().load(flow_data.pop("run_config"))

        # Combine the data from `flow_data` with `kwargs`
        flow_args = {
            **dict(
                flow_id=flow_id,
                project_name=project_name,
                storage=storage,
                flow_group_labels=flow_group_labels,
                run_config=run_config,
                **flow_data,
            ),
            **kwargs,
        }

        return cls(**flow_args)
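
The dict shape `_from_flow_data` expects can be read directly off the `pop` calls. A sketch with placeholder values (`storage` and `run_config` hold whatever the corresponding schemas serialized; any leftover keys such as `name` are forwarded to `__init__`):

    flow_data = {
        "id": "4f5a3e0e",                        # becomes flow_id
        "flow_group": {"labels": ["agent-a"]},   # extracted into flow_group_labels
        "project": {"name": "example-project"},  # project name extracted
        "storage": {},      # serialized Storage, loaded via StorageSchema
        "run_config": {},   # serialized RunConfig, loaded via RunConfigSchema
        "name": "example-flow",  # remaining keys pass through **flow_data
    }
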
Example #13
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy a flow run as an ECS task.

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment
        """
        run_config = self._get_run_config(flow_run, ECSRun)
        assert isinstance(run_config, ECSRun)  # mypy

        if run_config.task_definition_arn is None:
            # Register a new task definition
            self.logger.debug("Registering new task definition for flow %s",
                              flow_run.flow.id)
            taskdef = self.generate_task_definition(flow_run, run_config)
            resp = self.ecs_client.register_task_definition(**taskdef)
            taskdef_arn = resp["taskDefinition"]["taskDefinitionArn"]
            new_taskdef_arn = True
            self.logger.debug(
                "Registered task definition %s for flow %s",
                taskdef_arn,
                flow_run.flow.id,
            )
        else:
            from prefect.serialization.storage import StorageSchema
            from prefect.storage import Docker

            if isinstance(StorageSchema().load(flow_run.flow.storage), Docker):
                raise ValueError(
                    "Cannot provide `task_definition_arn` when using `Docker` storage"
                )
            taskdef_arn = run_config.task_definition_arn
            new_taskdef_arn = False
            self.logger.debug("Using task definition %s for flow %s",
                              taskdef_arn, flow_run.flow.id)

        # Get kwargs to pass to run_task
        kwargs = self.get_run_task_kwargs(flow_run, run_config)

        resp = self.ecs_client.run_task(taskDefinition=taskdef_arn, **kwargs)

        # Always deregister the task definition if a new one was registered
        if new_taskdef_arn:
            self.logger.debug("Deregistering task definition %s", taskdef_arn)
            self.ecs_client.deregister_task_definition(
                taskDefinition=taskdef_arn)

        if resp.get("tasks"):
            task_arn = resp["tasks"][0]["taskArn"]
            self.logger.debug("Started task %r for flow run %r", task_arn,
                              flow_run.id)
            return f"Task {task_arn}"

        raise ValueError(
            "Failed to start task for flow run {0}. Failures: {1}".format(
                flow_run.id, resp.get("failures")))
Example #14
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy flow runs to Fargate

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - ValueError: if deployment attempted on unsupported Storage type
        """
        self.logger.info("Deploying flow run {}".format(
            flow_run.id)  # type: ignore
                         )

        # create copies of kwargs to apply overrides as needed
        flow_task_definition_kwargs = copy.deepcopy(
            self.task_definition_kwargs)
        flow_task_run_kwargs = copy.deepcopy(self.task_run_kwargs)

        if self.use_external_kwargs:
            # override from external kwargs
            self._override_kwargs(flow_run, flow_task_definition_kwargs,
                                  flow_task_run_kwargs)

        # set proper task_definition_name and tags based on enable_task_revisions flag
        if self.enable_task_revisions:
            # set task definition name
            self.task_definition_name = slugify(flow_run.flow.name)
            self._add_flow_tags(flow_run, flow_task_definition_kwargs)

        else:
            self.task_definition_name = "prefect-task-{}".format(  # type: ignore
                flow_run.flow.id[:8]  # type: ignore
            )  # type: ignore

        # Require Docker storage
        if not isinstance(StorageSchema().load(flow_run.flow.storage), Docker):
            self.logger.error(
                "Storage for flow run {} is not of type Docker.".format(
                    flow_run.id))
            raise ValueError("Unsupported Storage type")

        # check if task definition exists
        self.logger.debug("Checking for task definition")
        if not self._verify_task_definition_exists(flow_run):
            self.logger.debug("No task definition found")
            self._create_task_definition(flow_run, flow_task_definition_kwargs)

        # run task
        task_arn = self._run_task(flow_run, flow_task_run_kwargs)

        self.logger.debug("Run created for task {}".format(task_arn))

        return "Task ARN: {}".format(task_arn)
Example #15
def get_flow_image(flow_run: GraphQLResult, default: str = None) -> str:
    """
    Retrieve the image to use for this flow run deployment.

    Args:
        - flow_run (GraphQLResult): A GraphQLResult flow run object
        - default (str, optional): A default image to use. If not specified,
            the `prefecthq/prefect` image corresponding to the flow's Prefect
            version will be used.

    Returns:
        - str: a full image name to use for this flow run

    Raises:
        - ValueError: if deployment attempted on unsupported Storage type and `image` not
            present in environment metadata
    """
    from prefect.storage import Docker
    from prefect.serialization.storage import StorageSchema
    from prefect.serialization.run_config import RunConfigSchema
    from prefect.serialization.environment import EnvironmentSchema

    has_run_config = getattr(flow_run, "run_config", None) is not None
    has_environment = getattr(flow_run.flow, "environment", None) is not None

    storage = StorageSchema().load(flow_run.flow.storage)
    # Not having an environment implies run-config based flow, even if
    # run_config is None.
    if has_run_config or not has_environment:
        # Precedence:
        # - Image on docker storage
        # - Image on run_config
        # - Provided default
        # - `prefecthq/prefect` for flow's core version
        if isinstance(storage, Docker):
            return storage.name
        if has_run_config:
            run_config = RunConfigSchema().load(flow_run.run_config)
            if getattr(run_config, "image", None) is not None:
                return run_config.image
        if default is not None:
            return default
        # core_version should always be present, but just in case
        version = flow_run.flow.get("core_version") or "latest"
        cleaned_version = version.split("+")[0]
        return f"prefecthq/prefect:{cleaned_version}"
    else:
        environment = EnvironmentSchema().load(flow_run.flow.environment)
        if hasattr(environment, "metadata") and hasattr(
                environment.metadata, "image"):
            return environment.metadata.get("image")
        elif isinstance(storage, Docker):
            return storage.name
        raise ValueError(
            f"Storage for flow run {flow_run.id} is not of type Docker and "
            f"environment has no `image` attribute in the metadata field.")
Example #16
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy a flow run on your local machine as a local process

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - ValueError: if deployment attempted on unsupported Storage type
        """
        self.logger.info(
            "Deploying flow run {}".format(flow_run.id)  # type: ignore
        )

        if not isinstance(
            StorageSchema().load(flow_run.flow.storage), (Local, Azure, GCS, S3)
        ):
            self.logger.error(
                "Storage for flow run {} is not a supported type.".format(flow_run.id)
            )
            raise ValueError("Unsupported Storage type")

        env_vars = self.populate_env_vars(flow_run=flow_run)
        current_env = os.environ.copy()
        current_env.update(env_vars)

        python_path = []
        if current_env.get("PYTHONPATH"):
            python_path.append(current_env.get("PYTHONPATH"))

        python_path.append(os.getcwd())

        if self.import_paths:
            python_path += self.import_paths

        current_env["PYTHONPATH"] = ":".join(python_path)

        # Discard output unless showing flow logs; buffering to PIPE without a
        # reader can block the child process once the OS pipe buffer fills
        stdout = sys.stdout if self.show_flow_logs else DEVNULL

        p = Popen(
            ["prefect", "execute", "cloud-flow"],
            stdout=stdout,
            stderr=STDOUT,
            env=current_env,
        )

        self.processes.append(p)
        self.logger.debug(
            "Submitted flow run {} to process PID {}".format(flow_run.id, p.pid)
        )

        return "PID: {}".format(p.pid)
Example #17
def test_deprecated_storage_classes(cls_name, args):
    import prefect
    from prefect.serialization.storage import StorageSchema

    cls = getattr(prefect.storage, cls_name)
    old_cls = getattr(prefect.environments.storage, cls_name)
    with pytest.warns(UserWarning, match="deprecated"):
        old_obj = old_cls(*args)

    # Old cls is subclass of new class
    assert isinstance(old_obj, cls)
    # Serialization roundtrips to new class
    new = StorageSchema().load(old_obj.serialize())
    assert type(new) is cls
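
The `cls_name` and `args` parameters imply a `pytest.mark.parametrize` decorator that this excerpt omits. A hypothetical decorator consistent with the signature (the actual case list is not shown in the source):

    import pytest

    @pytest.mark.parametrize(
        "cls_name, args",
        [
            ("Local", ()),           # hypothetical cases
            ("Docker", ()),
            ("S3", ("my-bucket",)),  # S3 storage takes a bucket name
        ],
    )
    # ...applied to test_deprecated_storage_classes above
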
Example #18
def get_flow_image(flow_run: GraphQLResult) -> str:
    """
    Retrieve the image to use for this flow run deployment.

    Args:
        - flow_run (GraphQLResult): A GraphQLResult flow run object

    Returns:
        - str: a full image name to use for this flow run

    Raises:
        - ValueError: if deployment attempted on unsupported Storage type and `image` not
            present in environment metadata
    """
    from prefect.environments.storage import Docker
    from prefect.serialization.storage import StorageSchema
    from prefect.serialization.run_config import RunConfigSchema
    from prefect.serialization.environment import EnvironmentSchema

    has_run_config = getattr(flow_run.flow, "run_config", None) is not None
    has_environment = getattr(flow_run.flow, "environment", None) is not None

    storage = StorageSchema().load(flow_run.flow.storage)
    # Not having an environment implies run-config based flow, even if
    # run_config is None.
    if has_run_config or not has_environment:
        if isinstance(storage, Docker):
            return storage.name
        elif has_run_config:
            run_config = RunConfigSchema().load(flow_run.flow.run_config)
            if getattr(run_config, "image", None) is not None:
                return run_config.image
        # No image found on run-config, and no environment present. Use default.
        # core_version should always be present, but just in case
        version = flow_run.flow.get("core_version") or "latest"
        cleaned_version = version.split("+")[0]
        return f"prefecthq/prefect:all_extras-{cleaned_version}"
    else:
        environment = EnvironmentSchema().load(flow_run.flow.environment)
        if hasattr(environment, "metadata") and hasattr(environment.metadata, "image"):
            return environment.metadata.get("image")
        elif isinstance(storage, Docker):
            return storage.name
        raise ValueError(
            f"Storage for flow run {flow_run.id} is not of type Docker and "
            f"environment has no `image` attribute in the metadata field."
        )
Example #19
    def deploy_flows(self, flow_runs: list) -> None:
        """
        Deploy flow runs on to a Nomad cluster as jobs

        Args:
            - flow_runs (list): A list of GraphQLResult flow run objects
        """
        for flow_run in flow_runs:

            if not isinstance(StorageSchema().load(flow_run.flow.storage), Docker):
                self.logger.error(
                    "Storage for flow run {} is not of type Docker.".format(flow_run.id)
                )
                continue

            job_spec = self.replace_job_spec_json(flow_run)
            nomad_host = os.getenv("NOMAD_HOST", "http://127.0.0.1:4646")
            requests.post(path.join(nomad_host, "v1/jobs"), json=job_spec)
Example #20
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy a flow run on your local machine as a local process

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - TypeError: if deployment attempted on unsupported Storage type
        """
        self.logger.info("Deploying flow run {}".format(flow_run.id))  # type: ignore

        storage = StorageSchema().load(flow_run.flow.storage)
        if isinstance(storage, Docker):
            self.logger.error(
                "Flow run %s has an unsupported storage type: `%s`",
                flow_run.id,
                type(storage).__name__,
            )
            raise TypeError("Unsupported Storage type: %s" % type(storage).__name__)
Example #21
    def _create_task_definition(
        self,
        flow_run: GraphQLResult,
        flow_task_definition_kwargs: dict,
        container_definitions_kwargs: dict,
        task_definition_name: str,
    ) -> None:
        """
        Create a task definition for the flow that each flow run will use. This function
        is only called when a flow is run for the first time.

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object
            - flow_task_definition_kwargs (dict): kwargs to use for registration
            - container_definitions_kwargs (dict): container definitions kwargs to use for registration
            - task_definition_name (str): task definition name to use
        """
        self.logger.debug("Using image {} for task definition".format(
            StorageSchema().load(flow_run.flow.storage).name  # type: ignore
        ))
        container_definitions = [{
            "name":
            "flow",
            "image":
            StorageSchema().load(flow_run.flow.storage)  # type: ignore
            .name,
            "command": ["/bin/sh", "-c", "prefect execute cloud-flow"],
            "environment": [
                {
                    "name": "PREFECT__CLOUD__API",
                    "value": config.cloud.api or "https://api.prefect.io",
                },
                {
                    "name": "PREFECT__CLOUD__AGENT__LABELS",
                    "value": str(self.labels),
                },
                {
                    "name": "PREFECT__CLOUD__USE_LOCAL_SECRETS",
                    "value": "false"
                },
                {
                    "name": "PREFECT__LOGGING__LOG_TO_CLOUD",
                    "value": str(self.log_to_cloud).lower(),
                },
                {
                    "name": "PREFECT__LOGGING__LEVEL",
                    "value": "DEBUG"
                },
                {
                    "name": "PREFECT__ENGINE__FLOW_RUNNER__DEFAULT_CLASS",
                    "value": "prefect.engine.cloud.CloudFlowRunner",
                },
                {
                    "name": "PREFECT__ENGINE__TASK_RUNNER__DEFAULT_CLASS",
                    "value": "prefect.engine.cloud.CloudTaskRunner",
                },
            ],
            "secrets": [],
            "mountPoints": [],
            "essential":
            True,
        }]

        for key, value in self.env_vars.items():
            container_definitions[0]["environment"].append(
                dict(name=key, value=value))

        # apply container definitions to "containerDefinitions" key of task definition
        # do not allow override of static envars from Prefect base task definition, which may include self.env_vars

        base_envar_keys = [x["name"] for x in container_definitions[0]["environment"]]
        self.logger.debug(
            "Removing static Prefect envars from container_definitions_kwargs if present"
        )
        container_definitions_environment = [
            x
            for x in container_definitions_kwargs.get("environment", [])
            if x["name"] not in base_envar_keys
        ]

        container_definitions[0]["environment"].extend(
            container_definitions_environment
        )
        container_definitions[0]["secrets"] = container_definitions_kwargs.get(
            "secrets", []
        )
        container_definitions[0]["mountPoints"] = container_definitions_kwargs.get(
            "mountPoints", []
        )

        # Register task definition
        self.logger.debug(
            "Registering task definition {}".format(task_definition_name)
        )
        if self.launch_type:
            flow_task_definition_kwargs["requiresCompatibilities"] = [self.launch_type]
        self.boto3_client.register_task_definition(
            family=task_definition_name,  # type: ignore
            containerDefinitions=container_definitions,
            networkMode="awsvpc",
            **flow_task_definition_kwargs,
        )
Example #22
    def _create_task_definition(
        self,
        flow_run: GraphQLResult,
        flow_task_definition_kwargs: dict,
        task_definition_name: str,
    ) -> None:
        """
        Create a task definition for the flow that each flow run will use. This function
        is only called when a flow is run for the first time.

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object
            - flow_task_definition_kwargs (dict): kwargs to use for registration
            - task_definition_name (str): task definition name to use
        """
        self.logger.debug("Using image {} for task definition".format(
            StorageSchema().load(flow_run.flow.storage).name  # type: ignore
        ))
        container_definitions = [{
            "name":
            "flow",
            "image":
            StorageSchema().load(flow_run.flow.storage)  # type: ignore
            .name,
            "command": ["/bin/sh", "-c", "prefect execute cloud-flow"],
            "environment": [
                {
                    "name": "PREFECT__CLOUD__API",
                    "value": config.cloud.api or "https://api.prefect.io",
                },
                {
                    "name": "PREFECT__CLOUD__AGENT__LABELS",
                    "value": str(self.labels),
                },
                {
                    "name": "PREFECT__CLOUD__USE_LOCAL_SECRETS",
                    "value": "false"
                },
                {
                    "name": "PREFECT__LOGGING__LOG_TO_CLOUD",
                    "value": str(self.log_to_cloud).lower(),
                },
                {
                    "name": "PREFECT__LOGGING__LEVEL",
                    "value": "DEBUG"
                },
                {
                    "name": "PREFECT__ENGINE__FLOW_RUNNER__DEFAULT_CLASS",
                    "value": "prefect.engine.cloud.CloudFlowRunner",
                },
                {
                    "name": "PREFECT__ENGINE__TASK_RUNNER__DEFAULT_CLASS",
                    "value": "prefect.engine.cloud.CloudTaskRunner",
                },
            ],
            "essential":
            True,
        }]

        for key, value in self.env_vars.items():
            container_definitions[0]["environment"].append(
                dict(name=key, value=value))

        # Register task definition
        self.logger.debug(
            "Registering task definition {}".format(task_definition_name)
        )
        self.boto3_client.register_task_definition(
            family=task_definition_name,  # type: ignore
            containerDefinitions=container_definitions,
            requiresCompatibilities=["FARGATE"],
            networkMode="awsvpc",
            **flow_task_definition_kwargs,
        )
Example #23
    def _create_task_definition(self, flow_run: GraphQLResult) -> None:
        """
        Create a task definition for the flow that each flow run will use. This function
        is only called when a flow is run for the first time.

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object
        """
        container_definitions = [
            {
                "name": "flow",
                "image": StorageSchema().load(flow_run.flow.storage).name,  # type: ignore
                "command": ["/bin/sh", "-c", "prefect execute cloud-flow"],
                "environment": [
                    {
                        "name": "PREFECT__CLOUD__API",
                        "value": config.cloud.api or "https://api.prefect.io",
                    },
                    {"name": "PREFECT__CLOUD__USE_LOCAL_SECRETS", "value": "false"},
                    {"name": "PREFECT__LOGGING__LOG_TO_CLOUD", "value": "true"},
                    {"name": "PREFECT__LOGGING__LEVEL", "value": "DEBUG"},
                    {
                        "name": "PREFECT__ENGINE__FLOW_RUNNER__DEFAULT_CLASS",
                        "value": "prefect.engine.cloud.CloudFlowRunner",
                    },
                    {
                        "name": "PREFECT__ENGINE__TASK_RUNNER__DEFAULT_CLASS",
                        "value": "prefect.engine.cloud.CloudTaskRunner",
                    },
                ],
                "essential": True,
            }
        ]

        # Assign repository credentials if they are specified
        if self.repository_credentials:
            container_definitions[0]["repositoryCredentials"] = {
                "credentialsParameter": self.repository_credentials
            }

        # Register task definition
        self.boto3_client.register_task_definition(
            family="prefect-task-{}".format(
                flow_run.flow.id[:8]),  # type: ignore
            containerDefinitions=container_definitions,
            requiresCompatibilities=["FARGATE"],
            networkMode="awsvpc",
            cpu=self.task_cpu,
            memory=self.task_memory,
        )
Example #24
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy flow runs on your local machine as Docker containers

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - ValueError: if deployment attempted on unsupported Storage type
        """
        self.logger.info("Deploying flow run {}".format(
            flow_run.id)  # type: ignore
                         )

        # 'import docker' is expensive time-wise; do it just-in-time to keep
        # the 'import prefect' time low
        import docker

        storage = StorageSchema().load(flow_run.flow.storage)
        if not isinstance(storage, Docker):
            self.logger.error(
                "Storage for flow run {} is not of type Docker.".format(
                    flow_run.id))
            raise ValueError("Unsupported Storage type")

        env_vars = self.populate_env_vars(flow_run=flow_run)

        if not self.no_pull and storage.registry_url:
            self.logger.info("Pulling image {}...".format(storage.name))

            pull_output = self.docker_client.pull(storage.name,
                                                  stream=True,
                                                  decode=True)
            for line in pull_output:
                self.logger.debug(line)
            self.logger.info("Successfully pulled image {}...".format(
                storage.name))

        # Create any named volumes (if they do not already exist)
        for named_volume_name in self.named_volumes:
            try:
                self.docker_client.inspect_volume(name=named_volume_name)
            except docker.errors.APIError:
                self.logger.debug(
                    "Creating named volume {}".format(named_volume_name))
                self.docker_client.create_volume(
                    name=named_volume_name,
                    driver="local",
                    labels={"prefect_created": "true"},
                )

        # Create a container
        self.logger.debug("Creating Docker container {}".format(storage.name))

        host_config = {"auto_remove": True}  # type: dict
        container_mount_paths = self.container_mount_paths
        if container_mount_paths:
            host_config.update(binds=self.host_spec)

        if sys.platform.startswith("linux") and self.docker_interface:
            docker_internal_ip = get_docker_ip()
            host_config.update(
                extra_hosts={"host.docker.internal": docker_internal_ip})

        networking_config = None
        if self.network:
            networking_config = self.docker_client.create_networking_config(
                {self.network: self.docker_client.create_endpoint_config()})

        container = self.docker_client.create_container(
            storage.name,
            command="prefect execute cloud-flow",
            environment=env_vars,
            volumes=container_mount_paths,
            host_config=self.docker_client.create_host_config(**host_config),
            networking_config=networking_config,
        )

        # Start the container
        self.logger.debug("Starting Docker container with ID {}".format(
            container.get("Id")))
        if self.network:
            self.logger.debug("Adding container to docker network: {}".format(
                self.network))

        self.docker_client.start(container=container.get("Id"))

        if self.show_flow_logs:
            proc = multiprocessing.Process(
                target=self.stream_container_logs,
                kwargs={"container_id": container.get("Id")},
            )

            proc.start()
            self.processes.append(proc)

        self.logger.debug("Docker container {} started".format(
            container.get("Id")))

        return "Container ID: {}".format(container.get("Id"))
Example #25
    def replace_job_spec_yaml(self, flow_run: GraphQLResult) -> dict:
        """
        Populate metadata and variables in the job_spec.yaml file for flow runs

        Args:
            - flow_run (GraphQLResult): A flow run object

        Returns:
            - dict: a dictionary representing the populated yaml object
        """
        with open(path.join(path.dirname(__file__), "job_spec.yaml"),
                  "r") as job_file:
            job = yaml.safe_load(job_file)

        identifier = str(uuid.uuid4())[:8]
        job_name = "prefect-job-{}".format(identifier)

        # Populate job metadata for identification
        job["metadata"]["name"] = job_name
        job["metadata"]["labels"]["app"] = job_name
        job["metadata"]["labels"]["identifier"] = identifier
        job["metadata"]["labels"]["flow_run_id"] = flow_run.id  # type: ignore
        job["metadata"]["labels"]["flow_id"] = flow_run.flow.id  # type: ignore
        job["spec"]["template"]["metadata"]["labels"]["app"] = job_name
        job["spec"]["template"]["metadata"]["labels"][
            "flow_run_id"] = flow_run.id  # type: ignore
        job["spec"]["template"]["metadata"]["labels"][
            "identifier"] = identifier

        # Use flow storage image for job
        image = StorageSchema().load(flow_run.flow.storage).name  # type: ignore
        job["spec"]["template"]["spec"]["containers"][0]["image"] = image

        self.logger.debug("Using image {} for job".format(image))

        # Populate environment variables for flow run execution
        env = job["spec"]["template"]["spec"]["containers"][0]["env"]

        env[0]["value"] = config.cloud.api or "https://api.prefect.io"
        env[1]["value"] = config.cloud.agent.auth_token
        env[2]["value"] = flow_run.id  # type: ignore
        env[3]["value"] = flow_run.flow.id  # type: ignore
        env[4]["value"] = os.getenv("NAMESPACE", "default")
        env[5]["value"] = str(self.labels)
        env[6]["value"] = str(self.log_to_cloud).lower()

        # append all user provided values
        for key, value in self.env_vars.items():
            env.append(dict(name=key, value=value))

        # Use image pull secrets if provided
        job["spec"]["template"]["spec"]["imagePullSecrets"][0][
            "name"] = os.getenv("IMAGE_PULL_SECRETS", "")

        # Set resource requirements if provided
        resources = job["spec"]["template"]["spec"]["containers"][0][
            "resources"]
        if os.getenv("JOB_MEM_REQUEST"):
            resources["requests"]["memory"] = os.getenv("JOB_MEM_REQUEST")
        if os.getenv("JOB_MEM_LIMIT"):
            resources["limits"]["memory"] = os.getenv("JOB_MEM_LIMIT")
        if os.getenv("JOB_CPU_REQUEST"):
            resources["requests"]["cpu"] = os.getenv("JOB_CPU_REQUEST")
        if os.getenv("JOB_CPU_LIMIT"):
            resources["limits"]["cpu"] = os.getenv("JOB_CPU_LIMIT")

        return job
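
The resource requirements above are driven purely by environment variables on the agent. A hedged sketch of configuring them before the agent starts (values are illustrative):

    import os

    # Read by replace_job_spec_yaml when building the job's resource section
    os.environ["JOB_MEM_REQUEST"] = "512Mi"
    os.environ["JOB_MEM_LIMIT"] = "1Gi"
    os.environ["JOB_CPU_REQUEST"] = "250m"
    os.environ["JOB_CPU_LIMIT"] = "500m"
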
Example #26
    def deploy_flow(self, flow_run: GraphQLResult) -> str:
        """
        Deploy a flow run on your local machine as a local process

        Args:
            - flow_run (GraphQLResult): A GraphQLResult flow run object

        Returns:
            - str: Information about the deployment

        Raises:
            - TypeError: if deployment attempted on unsupported Storage or RunConfig type
            - ValueError: if the configured `working_dir` does not exist
        """
        self.logger.info("Deploying flow run {}".format(
            flow_run.id))  # type: ignore

        storage = StorageSchema().load(flow_run.flow.storage)
        if isinstance(storage, Docker):
            self.logger.error(
                "Flow run %s has an unsupported storage type: `%s`",
                flow_run.id,
                type(storage).__name__,
            )
            raise TypeError("Unsupported Storage type: %s" %
                            type(storage).__name__)

        # If the flow is using a run_config, load it
        if getattr(flow_run.flow, "run_config", None) is not None:
            run_config = RunConfigSchema().load(flow_run.flow.run_config)
            if not isinstance(run_config, LocalRun):
                self.logger.error(
                    "Flow run %s has a `run_config` of type `%s`, only `LocalRun` is supported",
                    flow_run.id,
                    type(run_config).__name__,
                )
                raise TypeError("Unsupported RunConfig type: %s" %
                                type(run_config).__name__)
        else:
            run_config = None

        env = self.populate_env_vars(flow_run, run_config=run_config)

        working_dir = None if run_config is None else run_config.working_dir
        if working_dir and not os.path.exists(working_dir):
            msg = f"Flow run {flow_run.id} has a nonexistent `working_dir` configured: {working_dir}"
            self.logger.error(msg)
            raise ValueError(msg)

        stdout = sys.stdout if self.show_flow_logs else DEVNULL

        # note: we will allow these processes to be orphaned if the agent were to exit
        # before the flow runs have completed. The lifecycle of the agent should not
        # dictate the lifecycle of the flow run. However, if the user has elected to
        # show flow logs, these log entries will continue to stream to the user's
        # until these child processes exit, even if the agent has already exited.
        p = Popen(
            get_flow_run_command(flow_run).split(" "),
            stdout=stdout,
            stderr=STDOUT,
            env=env,
            cwd=working_dir,
        )

        self.processes.add(p)
        self.logger.debug("Submitted flow run {} to process PID {}".format(
            flow_run.id, p.pid))

        return "PID: {}".format(p.pid)