Example #1
    def execute(  # type: ignore
            self,
            storage: "Storage",
            flow_location: str,
            **kwargs: Any  # type: ignore
    ) -> None:
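        """
        Execute a flow run on a dynamically created Dask cluster.

        Args:
            - storage (Storage): the Storage object that contains the Flow
            - flow_location (str): the location of the Flow to execute
            - **kwargs (Any): additional keyword arguments passed on to the
                parent class's `execute`
        """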
        flow_run_info = None
        flow_run_id = prefect.context.get("flow_run_id")
        if self._on_execute:
            # If an on_execute Callable has been provided, retrieve the flow run parameters
            # and then allow the Callable a chance to update _provider_kwargs. This allows
            # better sizing of the cluster resources based on parameters for this Flow run.
            try:
                client = Client()
                flow_run_info = client.get_flow_run_info(flow_run_id)
                parameters = flow_run_info.parameters or {}  # type: ignore
                self._on_execute(parameters, self._provider_kwargs)
            except Exception as exc:
                self.logger.info(
                    "Failed to retrieve flow run info with error: {}".format(
                        repr(exc)))
        if "image" not in self._provider_kwargs or not self._provider_kwargs.get(
                "image"):
            # If image is not specified, use the Flow's image so that dependencies are
            # identical on all containers: Flow runner, Dask scheduler, and Dask workers
            flow_id = prefect.context.get("flow_id")
            try:
                client = Client()
                if not flow_id:  # We've observed cases where flow_id is None
                    if not flow_run_info:
                        flow_run_info = client.get_flow_run_info(flow_run_id)
                    flow_id = flow_run_info.flow_id
                flow_info = client.graphql("""query {
                  flow(where: {id: {_eq: "%s"}}) {
                    storage
                  }
                }""" % flow_id)
                storage_info = flow_info["data"]["flow"][0]["storage"]
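                # storage_info is the flow's serialized Storage document; Docker
                # storage is assumed, since the keys below are read from it
                # (example shape, values illustrative):
                #   {"registry_url": "...", "image_name": "...", "image_tag": "..."}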
                image = "{}/{}:{}".format(
                    storage_info["registry_url"],
                    storage_info["image_name"],
                    storage_info["image_tag"],
                )
                self.logger.info(
                    "Using Flow's Docker image for Dask scheduler & workers: {}"
                    .format(image))
                self._provider_kwargs["image"] = image
            except Exception as exc:
                self.logger.info(
                    "Failed to retrieve flow info with error: {}".format(
                        repr(exc)))

        self._create_dask_cluster()

        self.logger.info(
            "Executing on dynamically created Dask Cluster with scheduler address: {}"
            .format(self.executor_kwargs["address"]))
        super().execute(storage, flow_location, **kwargs)
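
The `on_execute` hook consumed in Example #1 receives the flow run's parameters and the `_provider_kwargs` dict, and may mutate the latter before the cluster is created, so cluster resources can be sized per run. A minimal sketch of such a callable; the `num_files` parameter and the `n_workers` value are illustrative assumptions, not from the source:

```python
from typing import Any, Dict

def size_cluster(parameters: Dict[str, Any], provider_kwargs: Dict[str, Any]) -> None:
    # Scale the Dask cluster with a (hypothetical) "num_files" flow parameter.
    # n_workers is passed through to the dask-cloudprovider cluster class.
    if parameters.get("num_files", 0) > 1_000:
        provider_kwargs["n_workers"] = 20
```

A callable like this would be supplied when the environment is constructed and, per the code above, is invoked before `_create_dask_cluster()`.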
Example #2
    def run(
        self,
        flow_name: str = None,
        project_name: str = None,
        parameters: dict = None,
        run_config: RunConfig = None,
        new_flow_context: dict = None,
        run_name: str = None,
        idempotency_key: str = None,
        scheduled_start_time: datetime.datetime = None,
    ) -> str:
        """
        Run method for the task; responsible for scheduling the specified flow run.

        Args:
            - flow_name (str, optional): the name of the flow to schedule; if not provided,
                this method will use the flow name provided at initialization
            - project_name (str, optional): the Cloud project in which the flow is located; if
                not provided, this method will use the project provided at initialization. If
                running with Prefect Core's server as the backend, this should not be provided.
            - parameters (dict, optional): the parameters to pass to the flow run being
                scheduled; if not provided, this method will use the parameters provided at
                initialization
            - run_config (RunConfig, optional): a run-config to use for this flow
                run, overriding any existing flow settings.
            - new_flow_context (dict, optional): the optional run context for the new flow run
            - run_name (str, optional): name to be set for the flow run
            - idempotency_key (str, optional): a unique idempotency key for scheduling the
                flow run. Duplicate flow runs with the same idempotency key will only create
                a single flow run. This is useful for ensuring that only one run is created
                if this task is retried. If not provided, defaults to the active `task_run_id`.
            - scheduled_start_time (datetime, optional): the time to schedule the execution
                for; if not provided, defaults to now

        Returns:
            - str: the ID of the newly-scheduled flow run

        Raises:
            - ValueError: if flow was not provided, cannot be found, or if a project name was
                not provided while using Cloud as a backend

        Example:
            ```python
            from prefect.tasks.prefect.flow_run import StartFlowRun

            kickoff_task = StartFlowRun(project_name="Hello, World!", flow_name="My Cloud Flow")
            ```

        """

        # verify that flow and project names were passed where necessary
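        # (arguments left as None are expected to already have been filled from the
        # values provided at initialization, as described in the docstring above)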
        if flow_name is None:
            raise ValueError("Must provide a flow name.")
        if project_name is None:
            raise ValueError("Must provide a project name.")

        where_clause = {
            "name": {"_eq": flow_name},
            "archived": {"_eq": False},
            "project": {"name": {"_eq": project_name}},
        }

        # find the flow ID to schedule
        query = {
            "query": {
                with_args(
                    "flow",
                    {
                        "where": where_clause,
                        "order_by": {
                            "version": EnumValue("desc")
                        },
                        "limit": 1,
                    },
                ): {"id"}
            }
        }
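        # The nested dict above renders to GraphQL roughly equivalent to
        # (illustrative, with the two names interpolated):
        #
        #   query {
        #     flow(
        #       where: {name: {_eq: "<flow_name>"}, archived: {_eq: false},
        #               project: {name: {_eq: "<project_name>"}}},
        #       order_by: {version: desc},
        #       limit: 1
        #     ) { id }
        #   }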

        client = Client()
        flow = client.graphql(query).data.flow

        # verify that a flow has been returned
        if not flow:
            raise ValueError("Flow '{}' not found.".format(flow_name))

        # grab the ID for the most recent version
        flow_id = flow[0].id

        if idempotency_key is None:
            idempotency_key = prefect.context.get("task_run_id", None)

        # providing an idempotency key ensures that retries for this task
        # will not create additional flow runs
        flow_run_id = client.create_flow_run(
            flow_id=flow_id,
            parameters=parameters,
            run_config=run_config,
            idempotency_key=idempotency_key,
            context=new_flow_context,
            run_name=run_name,
            scheduled_start_time=scheduled_start_time,
        )

        self.logger.debug(f"Flow Run {flow_run_id} created.")

        self.logger.debug(
            f"Creating link artifact for Flow Run {flow_run_id}.")
        run_link = client.get_cloud_url("flow-run", flow_run_id, as_user=False)
        create_link(urlparse(run_link).path)
        self.logger.info(f"Flow Run: {run_link}")

        if not self.wait:
            return flow_run_id

        while True:
            time.sleep(self.poll_interval.total_seconds())
            flow_run_state = client.get_flow_run_info(flow_run_id).state
            if flow_run_state.is_finished():
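                # convert the child run's finished state into the corresponding
                # prefect signal and raise it, so this task adopts that final state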
                exc = signal_from_state(flow_run_state)(
                    f"{flow_run_id} finished in state {flow_run_state}")
                raise exc
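
For context, a usage sketch of this task wired into a parent flow; with `wait=True` the polling loop at the end of `run` blocks until the child run finishes and re-raises its final state as a signal. The project and flow names below are placeholders:

```python
from datetime import timedelta

from prefect import Flow
from prefect.tasks.prefect.flow_run import StartFlowRun

# Placeholder project/flow names; adjust for your own backend.
kickoff = StartFlowRun(
    project_name="examples",
    flow_name="my-child-flow",
    wait=True,                            # block until the child run finishes
    poll_interval=timedelta(seconds=10),  # how often the loop above polls
)

with Flow("parent-flow") as parent:
    child_run_id = kickoff(parameters={"msg": "hello"})
```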
Example #3
    def execute(  # type: ignore
        self, flow: "Flow", **kwargs: Any  # type: ignore
    ) -> None:
        """
        Execute a flow run on a dask-cloudprovider cluster.

        Args:
            - flow (Flow): the Flow object
            - **kwargs (Any): Unused
        """
        flow_run_info = None
        flow_run_id = prefect.context.get("flow_run_id")
        if self._on_execute:
            # If an on_execute Callable has been provided, retrieve the flow run parameters
            # and then allow the Callable a chance to update _provider_kwargs. This allows
            # better sizing of the cluster resources based on parameters for this Flow run.
            try:
                client = Client()
                flow_run_info = client.get_flow_run_info(flow_run_id)
                parameters = flow_run_info.parameters or {}  # type: ignore
                self._on_execute(parameters, self._provider_kwargs)
            except Exception as exc:
                self.logger.info(
                    "Failed to retrieve flow run info with error: {}".format(repr(exc))
                )
        if "image" not in self._provider_kwargs or not self._provider_kwargs.get(
            "image"
        ):
            # If image is not specified, use the Flow's image so that dependencies are
            # identical on all containers: Flow runner, Dask scheduler, and Dask workers
            flow_id = prefect.context.get("flow_id")
            try:
                client = Client()
                if not flow_id:  # We've observed cases where flow_id is None
                    if not flow_run_info:
                        flow_run_info = client.get_flow_run_info(flow_run_id)
                    flow_id = flow_run_info.flow_id
                flow_info = client.graphql(
                    """query {
                  flow(where: {id: {_eq: "%s"}}) {
                    storage
                  }
                }"""
                    % flow_id
                )
                storage_info = flow_info["data"]["flow"][0]["storage"]
                image = "{}/{}:{}".format(
                    storage_info["registry_url"],
                    storage_info["image_name"],
                    storage_info["image_tag"],
                )
                self.logger.info(
                    "Using Flow's Docker image for Dask scheduler & workers: {}".format(
                        image
                    )
                )
                self._provider_kwargs["image"] = image
            except Exception as exc:
                self.logger.info(
                    "Failed to retrieve flow info with error: {}".format(repr(exc))
                )

        self._create_dask_cluster()

        self.logger.info(
            "Executing on dynamically created Dask Cluster with scheduler address: {}".format(
                self.executor_kwargs["address"]
            )
        )
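        # invoke the optional user-provided start callback before running the flow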
        if self.on_start:
            self.on_start()

        try:
            from prefect.engine import get_default_flow_runner_class
            from prefect.executors import DaskExecutor

            runner_cls = get_default_flow_runner_class()
            runner_cls(flow=flow).run(executor=DaskExecutor(**self.executor_kwargs))
        except Exception as exc:
            self.logger.exception(
                "Unexpected error raised during flow run: {}".format(exc)
            )
            raise
        finally:
            if self.on_exit:
                self.on_exit()
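
The `try` block above is, in effect, a flow run executed with a `DaskExecutor` pointed at the freshly created cluster's scheduler. A standalone, user-facing sketch of that last step, with a placeholder scheduler address:

```python
from prefect import Flow, task
from prefect.executors import DaskExecutor

@task
def say_hello():
    print("hello from a Dask worker")

with Flow("dask-example") as flow:
    say_hello()

# Placeholder address; in the code above it comes from executor_kwargs["address"],
# which is filled in when the Dask cluster is created.
flow.run(executor=DaskExecutor(address="tcp://10.0.0.1:8786"))
```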