Example #1
    async def insert(
        self,
        graphql_type: str,
        objects: List[dict],
        on_conflict: dict = None,
        alias: str = None,
        selection_set: GQLObjectTypes = "affected_rows",
        run_mutation: bool = True,
    ) -> Box:
        """
        Runs an `insert` mutation against the provided Hasura type, evaluating the provided
        `selection_set` and returning the full result.

        The `selection_set` is inserted directly into the graphql query, and should not
        be surrounded by curly braces. Valid top-level keys are `affected_rows` and `returning`.
        """

        if not isinstance(objects, (list, set, tuple)):
            raise TypeError(
                f"`objects` should be a collection; received {type(objects).__name__}"
            )

        alias = alias or "insert"

        # -----------------------------------------------------------
        # create variables

        arguments = {}
        variables = []

        # --- variable: objects

        arguments["objects"] = Variable(
            name=f"{alias}_objects",
            type=f"[{graphql_type}_insert_input!]!",
            value=objects,
        )
        variables.append(arguments["objects"])

        # --- variable: on conflict

        if isinstance(on_conflict, str):
            arguments["on_conflict"] = EnumValue(on_conflict)
        elif on_conflict:
            arguments["on_conflict"] = Variable(
                name=f"{alias}_on_conflict",
                type=f"{graphql_type}_on_conflict",
                value=on_conflict,
            )
            variables.append(arguments["on_conflict"])

        # -------------------------------------------------------------
        # build mutation

        mutation_name = f"{alias}: insert_{graphql_type}"
        selection_set = selection_set or "affected_rows"

        graphql = dict(
            query={with_args(mutation_name, arguments): selection_set},
            variables=variables,
        )

        if run_mutation:
            result = await self.execute_mutations_in_transaction(
                mutations=[graphql])
            return result.data[alias]
        else:
            return graphql
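
A minimal usage sketch for the `insert` helper above; `hasura` (an instance of the class defining this method) and the `flow` type are illustrative assumptions:

# Hypothetical call against a Hasura-exposed "flow" table, inside an async context.
result = await hasura.insert(
    graphql_type="flow",
    objects=[{"name": "my-flow"}],
    # a dict selection_set is spliced into the query without surrounding braces
    selection_set={"returning": {"id"}},
)
new_id = result.returning[0].id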
Example #2
def logs(name, info):
    """
    Query logs for a flow run.

    \b
    Options:
        --name, -n      TEXT    A flow run name to query        [required]
        --info, -i              Retrieve detailed logging info
    """
    log_query = {
        with_args("logs", {
            "order_by": {
                EnumValue("timestamp"): EnumValue("asc")
            }
        }): {
            "timestamp": True,
            "message": True,
            "level": True,
        },
        "start_time": True,
    }
    if info:
        log_query = {
            with_args("logs", {
                "order_by": {
                    EnumValue("timestamp"): EnumValue("asc")
                }
            }): {
                "timestamp": True,
                "info": True
            },
            "start_time": True,
        }

    query = {
        "query": {
            with_args(
                "flow_run",
                {
                    "where": {
                        "name": {
                            "_eq": name
                        }
                    },
                    "order_by": {
                        EnumValue("start_time"): EnumValue("desc")
                    },
                },
            ):
            log_query
        }
    }

    result = Client().graphql(query)

    flow_run = result.data.flow_run
    if not flow_run:
        click.secho("{} not found".format(name), fg="red")
        return

    run = flow_run[0]
    logs = run.logs
    output = []

    if not info:
        for log in logs:
            output.append([log.timestamp, log.level, log.message])

        click.echo(
            tabulate(
                output,
                headers=["TIMESTAMP", "LEVEL", "MESSAGE"],
                tablefmt="plain",
                numalign="left",
                stralign="left",
            ))
        return

    for log in logs:
        click.echo(log.info)
Example #3
    def register(
        self,
        flow: "Flow",
        project_name: str,
        build: bool = True,
        set_schedule_active: bool = True,
        version_group_id: str = None,
        compressed: bool = True,
    ) -> str:
        """
        Push a new flow to Prefect Cloud

        Args:
            - flow (Flow): a flow to register
            - project_name (str): the project that should contain this flow.
            - build (bool, optional): if `True`, the flow's environment is built
                prior to serialization; defaults to `True`
            - set_schedule_active (bool, optional): if `False`, will set the
                schedule to inactive in the database to prevent auto-scheduling runs (if the Flow has a schedule).
                Defaults to `True`. This can be changed later.
            - version_group_id (str, optional): the UUID version group ID to use for versioning this Flow
                in Cloud; if not provided, the version group ID associated with this Flow's project and name
                will be used.
            - compressed (bool, optional): if `True`, the serialized flow will be
                compressed; defaults to `True`

        Returns:
            - str: the ID of the newly-registered flow

        Raises:
            - ClientError: if the register failed
        """
        required_parameters = {p for p in flow.parameters() if p.required}
        if flow.schedule is not None and required_parameters:
            raise ClientError(
                "Flows with required parameters can not be scheduled automatically."
            )
        if any(e.key for e in flow.edges) and flow.result_handler is None:
            raise ClientError(
                "Flows are required to have a result handler for storing inputs and outputs."
            )
        if compressed:
            create_mutation = {
                "mutation($input: createFlowFromCompressedStringInput!)": {
                    "createFlowFromCompressedString(input: $input)": {"id"}
                }
            }
        else:
            create_mutation = {
                "mutation($input: createFlowInput!)": {
                    "createFlow(input: $input)": {"id"}
                }
            }

        query_project = {
            "query": {
                with_args("project", {
                    "where": {
                        "name": {
                            "_eq": project_name
                        }
                    }
                }): {
                    "id": True
                }
            }
        }

        project = self.graphql(query_project).data.project  # type: ignore

        if not project:
            raise ValueError(
                'Project {} not found. Run `client.create_project("{}")` to create it.'
                .format(project_name, project_name))

        serialized_flow = flow.serialize(build=build)  # type: Any

        # verify that the serialized flow can be deserialized
        try:
            prefect.serialization.flow.FlowSchema().load(serialized_flow)
        except Exception as exc:
            raise ValueError(
                "Flow could not be deserialized successfully. Error was: {}".
                format(repr(exc)))

        if compressed:
            serialized_flow = compress(serialized_flow)
        res = self.graphql(
            create_mutation,
            variables=dict(input=dict(
                projectId=project[0].id,
                serializedFlow=serialized_flow,
                setScheduleActive=set_schedule_active,
                versionGroupId=version_group_id,
            )),
        )  # type: Any

        flow_id = (res.data.createFlowFromCompressedString.id
                   if compressed else res.data.createFlow.id)
        return flow_id
Example #4
    def _get_flow_run_metadata(
        self,
        flow_run_ids: Iterable[str],
    ) -> List["GraphQLResult"]:
        """
        Get metadata about a collection of flow run ids that the agent is preparing
        to submit

        This function will filter the flow runs to a collection where:

        - The flow run is in a 'Scheduled' state. This prevents flow runs that have
          been submitted by another agent from being submitted again.

        - The flow run is in another state, but has task runs in a 'Running' state
          scheduled to start now. This is for retries in which the flow run is placed
          back into the ready queue but is not in a Scheduled state.

        Args:
            flow_run_ids: Flow run ids to query (order will not be respected)

        Returns:
           List: Metadata per flow run sorted by scheduled start time (ascending)
        """
        if not flow_run_ids:
            return []

        flow_run_ids = list(flow_run_ids)
        self.logger.debug(
            f"Retrieving metadata for {len(flow_run_ids)} flow run(s)...")

        # This buffer allows flow runs to retry immediately in their own deployment
        # without the agent creating a second deployment
        retry_start_time_buffer = pendulum.now("UTC").subtract(
            seconds=3).isoformat()

        where = {
            # Only get flow runs in the requested set
            "id": {
                "_in": flow_run_ids
            },
            # and filter by the additional criteria...
            "_or": [
                # This flow run has not been taken by another agent
                {
                    "state": {
                        "_eq": "Scheduled"
                    }
                },
                # Or, this flow run has been set to retry and has not been immediately
                # retried in its own process
                {
                    "state": {
                        "_eq": "Running"
                    },
                    "task_runs": {
                        "state_start_time": {
                            "_lte": retry_start_time_buffer
                        }
                    },
                },
            ],
        }

        query = {
            "query": {
                with_args("flow_run", {"where": where}): {
                    "id": True,
                    "version": True,
                    "state": True,
                    "serialized_state": True,
                    "parameters": True,
                    "scheduled_start_time": True,
                    "run_config": True,
                    "name": True,
                    "flow": {
                        "id",
                        "name",
                        "environment",
                        "storage",
                        "version",
                        "core_version",
                    },
                    # Collect and return task run metadata as well so the state can be
                    # updated in `_mark_flow_as_submitted`
                    with_args(
                        "task_runs",
                        {
                            "where": {
                                "state_start_time": {
                                    "_lte": retry_start_time_buffer
                                }
                            }
                        },
                    ): {"id", "version", "task_id", "serialized_state"},
                }
            }
        }
        result = self.client.graphql(query)
        return sorted(
            result.data.flow_run,
            key=lambda flow_run: flow_run.serialized_state.get(
                "start_time",
                pendulum.now("utc").isoformat()),
        )
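
The `where` block above is ordinary Hasura boolean-expression JSON; a sketch of inspecting what it renders to, assuming `parse_graphql_arguments` from `prefect.utilities.graphql` (the helper that `with_args` uses internally):

from prefect.utilities.graphql import parse_graphql_arguments

# Prints the boolean expression the agent sends, roughly:
# where: { id: { _in: [...] }, _or: [ { state: { _eq: "Scheduled" } }, ... ] }
print(parse_graphql_arguments({"where": where}))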
Example #5
def flow_runs(limit, flow, project, started):
    """
    Query information regarding Prefect flow runs.

    \b
    Options:
        --limit, -l         INTEGER The maximum number of flow runs to query, defaults to 10
        --flow, -f          TEXT    Name of a flow to query for runs
        --project, -p       TEXT    Name of a project to query
        --started, -s               Only retrieve started flow runs, default shows `Scheduled` runs
    """

    if started:
        order = {"start_time": EnumValue("desc")}

        where = {
            "_and": {
                "flow": {
                    "_and": {
                        "name": {
                            "_eq": flow
                        },
                    }
                },
                "start_time": {
                    "_is_null": False
                },
            }
        }

        if project:
            where["_and"]["flow"]["_and"]["project"] = {
                "name": {
                    "_eq": project
                }
            }
    else:
        order = {"created": EnumValue("desc")}

        where = {
            "flow": {
                "_and": {
                    "name": {
                        "_eq": flow
                    },
                }
            }
        }

        if project:
            where["flow"]["_and"]["project"] = {"name": {"_eq": project}}

    query = {
        "query": {
            with_args("flow_run", {
                "where": where,
                "limit": limit,
                "order_by": order
            }): {
                "flow": {
                    "name": True
                },
                "created": True,
                "state": True,
                "name": True,
                "duration": True,
                "start_time": True,
            }
        }
    }

    result = Client().graphql(query)

    flow_run_data = result.data.flow_run

    output = []
    for item in flow_run_data:
        start_time = (pendulum.parse(item.start_time).to_datetime_string()
                      if item.start_time else None)
        output.append([
            item.name,
            item.flow.name,
            item.state,
            pendulum.parse(item.created).diff_for_humans(),
            start_time,
            item.duration,
        ])

    click.echo(
        tabulate(
            output,
            headers=[
                "NAME", "FLOW NAME", "STATE", "AGE", "START TIME", "DURATION"
            ],
            tablefmt="plain",
            numalign="left",
            stralign="left",
        ))
Example #6
async def schedule_flow_runs(flow_id: str, max_runs: int = 10) -> List[str]:
    """
    Schedule the next `max_runs` runs for this flow. Runs will not be scheduled
    if they are earlier than the latest currently-scheduled run that has `auto_scheduled = True`.

    Runs are created with an idempotency key to avoid rescheduling.

    Args:
        - flow_id (str): the flow ID
        - max_runs (int): the maximum number of runs to schedule (defaults to 10)

    Returns:
        - List[str]: the ids of the new runs
    """
    if flow_id is None:
        raise ValueError("Invalid flow id.")

    run_ids = []

    flow = await models.Flow.where({
        # match the provided ID
        "id": {
            "_eq": flow_id
        },
        # schedule is not none or flow group schedule is not none
        "_or": [
            {
                "schedule": {
                    "_is_null": False
                }
            },
            {
                "flow_group": {
                    "schedule": {
                        "_is_null": False
                    }
                }
            },
        ],
        # schedule is active
        "is_schedule_active": {
            "_eq": True
        },
        # flow is not archived
        "archived": {
            "_eq": False
        },
    }).first(
        {
            "schedule": True,
            "flow_group": {
                "schedule": True
            },
            with_args("flow_runs_aggregate", {
                "where": {
                    "auto_scheduled": {
                        "_eq": True
                    }
                }
            }): {
                "aggregate": {
                    "max": "scheduled_start_time"
                }
            },
        },
        apply_schema=False,
    )

    if not flow:
        logger.debug(f"Flow {flow_id} can not be scheduled.")
        return run_ids
    else:
        # attempt to pull the schedule from the flow group if possible
        # if not possible, pull the schedule from the flow
        flow_schedule = flow.flow_group.schedule or flow.schedule

        try:
            flow_schedule = schedule_schema.load(flow_schedule)
        except Exception as exc:
            logger.error(exc, exc_info=True)
            logger.critical(
                f"Failed to deserialize schedule for flow {flow_id}: {flow_schedule}"
            )
            return run_ids

    if flow.flow_runs_aggregate.aggregate.max.scheduled_start_time is not None:
        last_scheduled_run = pendulum.parse(
            flow.flow_runs_aggregate.aggregate.max.scheduled_start_time)
    else:
        last_scheduled_run = pendulum.now("UTC")

    schedule_coros = []

    # schedule every event with an idempotent flow run
    for event in flow_schedule.next(n=max_runs, return_events=True):

        # if the event has parameter defaults or labels, we do allow for
        # same-time scheduling
        if event.parameter_defaults or event.labels is not None:
            md5 = hashlib.md5()
            param_string = str(sorted(json.dumps(event.parameter_defaults)))
            label_string = str(sorted(json.dumps(event.labels)))
            md5.update((param_string + label_string).encode("utf-8"))
            idempotency_key = (
                f"auto-scheduled:{event.start_time.in_tz('UTC')}:{md5.hexdigest()}"
            )
        # if this run was already scheduled, continue
        elif last_scheduled_run and event.start_time <= last_scheduled_run:
            continue
        else:
            idempotency_key = f"auto-scheduled:{event.start_time.in_tz('UTC')}"

        schedule_coros.append(
            api.runs.create_flow_run(
                flow_id=flow_id,
                scheduled_start_time=event.start_time,
                parameters=event.parameter_defaults,
                labels=event.labels,
                idempotency_key=idempotency_key,
            ))

    # schedule runs concurrently
    run_ids.extend(await asyncio.gather(*schedule_coros))

    await models.FlowRun.where({
        "id": {
            "_in": run_ids
        }
    }).update(set={"auto_scheduled": True})

    return run_ids
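
The idempotency keys built in the loop above follow a simple scheme; here is a standalone sketch of the same construction, using `sort_keys=True` for a canonical serialization in place of the character-level `sorted(json.dumps(...))` in the original:

import hashlib
import json

def auto_scheduled_key(start_time_utc: str, parameter_defaults=None, labels=None) -> str:
    # Plain runs are keyed by start time alone, so a given clock event
    # can never be scheduled twice.
    if not parameter_defaults and labels is None:
        return f"auto-scheduled:{start_time_utc}"
    # Runs with parameter defaults or labels also hash those values into
    # the key, allowing distinct runs at the same start time.
    digest = hashlib.md5(
        (
            json.dumps(parameter_defaults, sort_keys=True)
            + json.dumps(labels, sort_keys=True)
        ).encode("utf-8")
    ).hexdigest()
    return f"auto-scheduled:{start_time_utc}:{digest}"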
Example #7
    async def run_scheduled(self, flow_id=None):
        """
        Queries for any flow runs that are SCHEDULED, OR any flow runs that have SCHEDULED
        task runs [if the flow run itself is RUNNING]. Sets all Scheduled runs to Submitted
        and runs the flow.

        If a flow_id is provided, only flow runs of that flow are matched.
        """
        now = pendulum.now()
        flow_runs = await models.FlowRun.where({
            "_or": [
                {
                    "state_start_time": {
                        "_lte": str(now)
                    }
                },
                {
                    "state": {
                        "_eq": "Running"
                    },
                    "task_runs": {
                        "state_start_time": {
                            "_lte": str(now)
                        }
                    },
                },
            ],
            "flow_id": {
                "_eq": flow_id
            },
        }).get(
            selection_set={
                "id": True,
                "version": True,
                "tenant_id": True,
                "state": True,
                "serialized_state": True,
                "parameters": True,
                "flow": {"id", "environment", "name", "storage"},
                with_args("task_runs", {
                    "where": {
                        "state_start_time": {
                            "_lte": str(now)
                        }
                    }
                }): {"id", "version", "task_id", "serialized_state"},
            },
            limit=100,
            order_by={"state_start_time": EnumValue("asc")},
        )
        for fr in flow_runs:

            skip_counter = 0

            fr_serialized_state = state_schema.load(fr.serialized_state)

            # set the flow run state to submitted, if it's scheduled
            if fr_serialized_state.is_scheduled():
                try:
                    await api.states.set_flow_run_state(
                        flow_run_id=fr.id,
                        state=Submitted(
                            message="Submitted for execution",
                            state=fr_serialized_state,
                        ),
                    )
                except ValueError as exc:
                    skip_counter += 1
                    if "Update failed" not in str(exc):
                        raise

            # set each task run state to submitted, if it's scheduled
            for tr in fr.task_runs:
                tr_serialized_state = state_schema.load(tr.serialized_state)

                try:
                    await api.states.set_task_run_state(
                        task_run_id=tr.id,
                        state=Submitted(
                            message="Submitted for execution",
                            state=tr_serialized_state,
                        ),
                    )
                except ValueError as exc:
                    skip_counter += 1
                    if "Update failed" not in str(exc):
                        raise

            # none of the states were set, so we shouldn't bother running
            if skip_counter == 1 + len(fr.task_runs):
                continue

            self.logger.info(f'Submitting flow run "{fr.id}" for execution.')

            # run the flow
            self.run_flow(
                flow_name=fr.flow.name,
                storage=storage_schema.load(fr.flow.storage),
                environment=environment_schema.load(fr.flow.environment),
                config={
                    "cloud.api": "http://localhost:4200",
                    "cloud.graphql": "http://localhost:4200",
                    "engine.flow_runner.default_class": "prefect.engine.cloud.CloudFlowRunner",
                    "engine.task_runner.default_class": "prefect.engine.cloud.CloudTaskRunner",
                    "engine.executor.default_class": "prefect.engine.executors.LocalExecutor",
                },
                context={"flow_run_id": fr.id},
            )
Example #8
def register_serialized_flow(
    client: "prefect.Client",
    serialized_flow: dict,
    project_id: str,
    force: bool = False,
) -> Tuple[str, int, bool]:
    """Register a pre-serialized flow.

    Args:
        - client (prefect.Client): the prefect client
        - serialized_flow (dict): the serialized flow
        - project_id (str): the project id
        - force (bool, optional): If `False` (default), an idempotency key will
            be generated to avoid unnecessary re-registration. Set to `True` to
            force re-registration.

    Returns:
        - flow_id (str): the flow id
        - flow_version (int): the flow version
        - is_new (bool): True if this is a new flow version, false if
            re-registration was skipped.
    """
    # Get most recent flow id for this flow. This can be removed once
    # the registration graphql routes return more information
    flow_name = serialized_flow["name"]
    resp = client.graphql(
        {
            "query": {
                with_args(
                    "flow",
                    {
                        "where": {
                            "_and": {
                                "name": {"_eq": flow_name},
                                "project": {"id": {"_eq": project_id}},
                            }
                        },
                        "order_by": {"version": EnumValue("desc")},
                        "limit": 1,
                    },
                ): {"id", "version"}
            }
        }
    )
    if resp.data.flow:
        prev_id = resp.data.flow[0].id
        prev_version = resp.data.flow[0].version
    else:
        prev_id = None
        prev_version = 0

    inputs = dict(
        project_id=project_id,
        serialized_flow=compress(serialized_flow),
    )
    if not force:
        inputs["idempotency_key"] = hashlib.sha256(
            json.dumps(serialized_flow, sort_keys=True).encode()
        ).hexdigest()

    res = client.graphql(
        {
            "mutation($input: create_flow_from_compressed_string_input!)": {
                "create_flow_from_compressed_string(input: $input)": {"id"}
            }
        },
        variables=dict(input=inputs),
        retry_on_api_error=False,
    )

    new_id = res.data.create_flow_from_compressed_string.id

    if new_id == prev_id:
        return new_id, prev_version, False
    else:
        return new_id, prev_version + 1, True
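
A sketch of how `register_serialized_flow` might be invoked; the client setup, `my_flow`, and the project id placeholder are illustrative assumptions:

import prefect

client = prefect.Client()
flow_id, version, is_new = register_serialized_flow(
    client=client,
    serialized_flow=my_flow.serialize(build=False),  # `my_flow` is assumed to exist
    project_id="<project-id>",
    force=False,  # keep the idempotency key so an unchanged flow is not re-registered
)
if not is_new:
    print(f"Flow unchanged; reusing version {version} ({flow_id})")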
Example #9
            self.heartbeat_cmd = [
                sys.executable,
                "-m",
                "prefect",
                "heartbeat",
                "flow-run",
                "-i",
                flow_run_id,
            ]

            query = {
                "query": {
                    with_args("flow_run_by_pk", {"id": flow_run_id}): {
                        "flow": {"settings": True},
                    }
                }
            }
            flow_run = self.client.graphql(query).data.flow_run_by_pk
            if not flow_run.flow.settings.get("heartbeat_enabled", True):
                return False
            return True
        except Exception:
            self.logger.exception(
                "Heartbeat failed for Flow '{}'".format(self.flow.name)
            )
            return False

    def call_runner_target_handlers(self, old_state: State, new_state: State) -> State:
Example #10
    def run_flow(self) -> None:
        """
        Run the flow using the default executor

        Raises:
            - ValueError: if no `flow_run_id` is found in context
        """
        # Call on_start callback if specified
        if self.on_start:
            self.on_start()

        try:
            from prefect.engine import (
                get_default_flow_runner_class,
                get_default_executor_class,
            )

            flow_run_id = prefect.context.get("flow_run_id")

            if not flow_run_id:
                raise ValueError("No flow run ID found in context.")

            query = {
                "query": {
                    with_args("flow_run", {"where": {"id": {"_eq": flow_run_id}}}): {
                        "flow": {"name": True, "storage": True,},
                    }
                }
            }

            client = Client()
            result = client.graphql(query)
            flow_run = result.data.flow_run[0]

            flow_data = flow_run.flow
            storage_schema = prefect.serialization.storage.StorageSchema()
            storage = storage_schema.load(flow_data.storage)

            ## populate global secrets
            secrets = prefect.context.get("secrets", {})
            for secret in storage.secrets:
                secrets[secret] = prefect.tasks.secrets.PrefectSecret(name=secret).run()

            with prefect.context(secrets=secrets):
                flow = storage.get_flow(storage.flows[flow_data.name])
                runner_cls = get_default_flow_runner_class()
                if getattr(self, "executor", None) is not None:
                    executor = self.executor  # type: ignore
                else:
                    executor_cls = get_default_executor_class()
                    # Deprecated, to be removed
                    if hasattr(self, "executor_kwargs"):
                        executor = executor_cls(**self.executor_kwargs)  # type: ignore
                    else:
                        executor = executor_cls
                runner_cls(flow=flow).run(executor=executor)
        except Exception as exc:
            self.logger.exception(
                "Unexpected error raised during flow run: {}".format(exc)
            )
            raise exc
        finally:
            # Call on_exit callback if specified
            if self.on_exit:
                self.on_exit()
Example #11
def logs(name, id, info):
    """
    Query logs for a flow run.

    Note: at least one of `name` or `id` must be specified. If only `name` is set then
    the most recent flow run with that name will be queried.

    \b
    Options:
        --name, -n      TEXT    A flow run name to query
        --id            TEXT    A flow run ID to query
        --info, -i              Retrieve detailed logging info
    """
    if not name and not id:
        click.secho("Either --name or --id must be provided", fg="red")
        return

    log_query = {
        with_args("logs", {"order_by": {EnumValue("timestamp"): EnumValue("asc")}}): {
            "timestamp": True,
            "message": True,
            "level": True,
        },
        "start_time": True,
    }
    if info:
        log_query = {
            with_args(
                "logs", {"order_by": {EnumValue("timestamp"): EnumValue("asc")}}
            ): {"timestamp": True, "info": True},
            "start_time": True,
        }

    query = {
        "query": {
            with_args(
                "flow_run",
                {
                    "where": {"name": {"_eq": name}, "id": {"_eq": id}},
                    "order_by": {EnumValue("start_time"): EnumValue("desc")},
                },
            ): log_query
        }
    }

    result = Client().graphql(query)

    flow_run = result.data.flow_run
    if not flow_run:
        click.secho("{} not found".format(name), fg="red")
        return

    run = flow_run[0]
    logs = run.logs
    output = []

    if not info:
        for log in logs:
            output.append([log.timestamp, log.level, log.message])

        click.echo(
            tabulate(
                output,
                headers=["TIMESTAMP", "LEVEL", "MESSAGE"],
                tablefmt="plain",
                numalign="left",
                stralign="left",
            )
        )
        return

    for log in logs:
        click.echo(log.info)
Example #12
from prefect import Client
from prefect.utilities.graphql import with_args

c = Client()

name = "my_flow"
c.graphql({"query": {with_args("flow", {"where": {"name": {"_eq": name}}}): "id"}})

# Equivalent raw GraphQL:
# query { flow(where: { name: { _eq: "my_flow" } }) { id } }
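
The dict form above can be rendered to the query string it produces with `parse_graphql` from the same `prefect.utilities.graphql` module; a small sketch (the output shape is from memory, not captured from a run):

from prefect.utilities.graphql import parse_graphql, with_args

query = {"query": {with_args("flow", {"where": {"name": {"_eq": "my_flow"}}}): "id"}}
# parse_graphql walks the nested dict and emits the equivalent GraphQL document;
# string arguments are quoted, EnumValue arguments are left bare.
print(parse_graphql(query))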
Example #13
    def run(
        self,
        flow_name: str = None,
        project_name: str = None,
        parameters: dict = None,
        run_config: RunConfig = None,
        new_flow_context: dict = None,
        run_name: str = None,
        idempotency_key: str = None,
        scheduled_start_time: datetime.datetime = None,
    ) -> str:
        """
        Run method for the task; responsible for scheduling the specified flow run.

        Args:
            - flow_name (str, optional): the name of the flow to schedule; if not provided,
                this method will use the flow name provided at initialization
            - project_name (str, optional): the Cloud project in which the flow is located; if
                not provided, this method will use the project provided at initialization. If
                running with Prefect Core's server as the backend, this should not be provided.
            - parameters (dict, optional): the parameters to pass to the flow run being
                scheduled; if not provided, this method will use the parameters provided at
                initialization
            - run_config (RunConfig, optional): a run-config to use for this flow
                run, overriding any existing flow settings.
            - new_flow_context (dict, optional): the optional run context for the new flow run
            - run_name (str, optional): name to be set for the flow run
            - idempotency_key (str, optional): a unique idempotency key for scheduling the
                flow run. Duplicate flow runs with the same idempotency key will only create
                a single flow run. This is useful for ensuring that only one run is created
                if this task is retried. If not provided, defaults to the active `task_run_id`.
            - scheduled_start_time (datetime, optional): the time to schedule the execution
                for; if not provided, defaults to now

        Returns:
            - str: the ID of the newly-scheduled flow run

        Raises:
            - ValueError: if flow was not provided, cannot be found, or if a project name was
                not provided while using Cloud as a backend

        Example:
            ```python
            from prefect.tasks.prefect.flow_run import StartFlowRun

            kickoff_task = StartFlowRun(project_name="Hello, World!", flow_name="My Cloud Flow")
            ```

        """

        # verify that flow and project names were passed where necessary
        if flow_name is None:
            raise ValueError("Must provide a flow name.")
        if project_name is None:
            raise ValueError("Must provide a project name.")

        where_clause = {
            "name": {"_eq": flow_name},
            "archived": {"_eq": False},
            "project": {"name": {"_eq": project_name}},
        }

        # find the flow ID to schedule
        query = {
            "query": {
                with_args(
                    "flow",
                    {
                        "where": where_clause,
                        "order_by": {"version": EnumValue("desc")},
                        "limit": 1,
                    },
                ): {"id"}
            }
        }

        client = Client()
        flow = client.graphql(query).data.flow

        # verify that a flow has been returned
        if not flow:
            raise ValueError("Flow '{}' not found.".format(flow_name))

        # grab the ID for the most recent version
        flow_id = flow[0].id

        if idempotency_key is None:
            idempotency_key = context.get("task_run_id", None)

        # providing an idempotency key ensures that retries for this task
        # will not create additional flow runs
        flow_run_id = client.create_flow_run(
            flow_id=flow_id,
            parameters=parameters,
            run_config=run_config,
            idempotency_key=idempotency_key,
            context=new_flow_context,
            run_name=run_name,
            scheduled_start_time=scheduled_start_time,
        )

        self.logger.debug(f"Flow Run {flow_run_id} created.")

        self.logger.debug(f"Creating link artifact for Flow Run {flow_run_id}.")
        run_link = client.get_cloud_url("flow-run", flow_run_id, as_user=False)
        create_link(urlparse(run_link).path)

        if not self.wait:
            return flow_run_id

        while True:
            time.sleep(10)
            flow_run_state = client.get_flow_run_info(flow_run_id).state
            if flow_run_state.is_finished():
                exc = signal_from_state(flow_run_state)(
                    f"{flow_run_id} finished in state {flow_run_state}"
                )
                raise exc
Example #14
    async def update(
        self,
        graphql_type: str,
        where: GQLObjectTypes = None,
        id: str = None,
        set: GQLObjectTypes = None,
        increment: GQLObjectTypes = None,
        alias: str = None,
        selection_set: GQLObjectTypes = "affected_rows",
        run_mutation: bool = True,
    ) -> Box:
        """
        Runs an `update` mutation against the provided Hasura type and `where` clause,
        applying the provided operations (`set` and/or `increment`), evaluating the
        provided `selection_set`, and returning the full result.

        The `selection_set` is inserted directly into the graphql query, and should not
        be surrounded by curly braces. Valid top-level keys are `affected_rows` and `returning`.
        """
        if id is None and not isinstance(where, dict):
            raise TypeError(
                "`where` must be provided as a dict if `id` is None; "
                f"received {type(where).__name__}")
        elif all(op is None for op in [set, increment]):
            raise ValueError("At least one update operation must be provided")

        where = where or {}

        if id is not None:
            where["id"] = {"_eq": id}

        alias = alias or "update"

        # -------------------------------------------------------------
        # create variables

        arguments = {}
        variables = []

        # --- variable: where

        arguments["where"] = Variable(name=f"{alias}_where",
                                      type=f"{graphql_type}_bool_exp!",
                                      value=where)
        variables.append(arguments["where"])

        # --- variable: _set

        if set:
            arguments["_set"] = Variable(name=f"{alias}_set",
                                         type=f"{graphql_type}_set_input",
                                         value=set)
            variables.append(arguments["_set"])

        # --- variable: _inc

        if increment:
            arguments["_inc"] = Variable(name=f"{alias}_inc",
                                         type=f"{graphql_type}_inc_input",
                                         value=increment)
            variables.append(arguments["_inc"])

        # -------------------------------------------------------------
        # build mutation

        mutation_name = f"{alias}: update_{graphql_type}"
        selection_set = selection_set or "affected_rows"
        graphql = dict(
            query={with_args(mutation_name, arguments): selection_set},
            variables=variables,
        )

        if run_mutation:
            result = await self.execute_mutations_in_transaction(
                mutations=[graphql])
            return result.data[alias]
        else:
            return graphql
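
A usage sketch for `update`, mirroring the `insert` example above; `hasura`, the id placeholder, and the column names are illustrative:

# Hypothetical: rename a single flow_run row and bump its version counter.
result = await hasura.update(
    graphql_type="flow_run",
    id="<flow-run-id>",  # shorthand for where={"id": {"_eq": ...}}
    set={"name": "renamed-run"},
    increment={"version": 1},
)
assert result.affected_rows == 1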
Example #15
    def run(
        self,
        flow_name: str = None,
        project_name: str = None,
        parameters: dict = None,
        idempotency_key: str = None,
        new_flow_context: dict = None,
        run_name: str = None,
    ) -> str:
        """
        Run method for the task; responsible for scheduling the specified flow run.

        Args:
            - flow_name (str, optional): the name of the flow to schedule; if not provided,
                this method will use the flow name provided at initialization
            - project_name (str, optional): the Cloud project in which the flow is located; if
                not provided, this method will use the project provided at initialization. If
                running with Prefect Core's server as the backend, this should not be provided.
            - parameters (dict, optional): the parameters to pass to the flow run being
                scheduled; if not provided, this method will use the parameters provided at
                initialization
            - idempotency_key (str, optional): an optional idempotency key for scheduling the
                flow run; if provided, ensures that only one run is created if this task is retried
                or rerun with the same inputs.  If not provided, the current flow run ID will be used.
            - new_flow_context (dict, optional): the optional run context for the new flow run
            - run_name (str, optional): name to be set for the flow run

        Returns:
            - str: the ID of the newly-scheduled flow run

        Raises:
            - ValueError: if flow was not provided, cannot be found, or if a project name was
                not provided while using Cloud as a backend

        Example:
            ```python
            from prefect.tasks.prefect.flow_run import FlowRunTask

            kickoff_task = FlowRunTask(project_name="Hello, World!", flow_name="My Cloud Flow")
            ```

        """

        # verify that flow and project names were passed where necessary
        if flow_name is None:
            raise ValueError("Must provide a flow name.")
        if project_name is None:
            raise ValueError("Must provide a project name.")

        where_clause = {
            "name": {"_eq": flow_name},
            "archived": {"_eq": False},
            "project": {"name": {"_eq": project_name}},
        }

        # find the flow ID to schedule
        query = {
            "query": {
                with_args(
                    "flow",
                    {
                        "where": where_clause,
                        "order_by": {"version": EnumValue("desc")},
                        "limit": 1,
                    },
                ): {"id"}
            }
        }

        client = Client()
        flow = client.graphql(query).data.flow

        # verify that a flow has been returned
        if not flow:
            raise ValueError("Flow '{}' not found.".format(flow_name))

        # grab the ID for the most recent version
        flow_id = flow[0].id

        idem_key = None
        if context.get("flow_run_id"):
            map_index = context.get("map_index")
            default = context.get("flow_run_id") + (
                f"-{map_index}" if map_index else ""
            )
            idem_key = idempotency_key or default

        # providing an idempotency key ensures that retries for this task
        # will not create additional flow runs
        flow_run_id = client.create_flow_run(
            flow_id=flow_id,
            parameters=parameters,
            idempotency_key=idem_key or idempotency_key,
            context=new_flow_context,
            run_name=run_name,
        )

        self.logger.debug(f"Flow Run {flow_run_id} created.")

        if not self.wait:
            return flow_run_id

        while True:
            time.sleep(10)
            flow_run_state = client.get_flow_run_info(flow_run_id).state
            if flow_run_state.is_finished():
                exc = signal_from_state(flow_run_state)(
                    f"{flow_run_id} finished in state {flow_run_state}"
                )
                raise exc
Example #16
    def get_logs(
        self,
        start_time: pendulum.DateTime = None,
        end_time: pendulum.DateTime = None,
    ) -> List["FlowRunLog"]:
        """
        Get logs for this flow run from `start_time` to `end_time`.

        Args:
            - start_time (optional): A time to start the log query at, useful for
                limiting the scope. If not provided, all logs up to `updated_at` are
                retrieved.
            - end_time (optional): A time to end the log query at. By default, this is
                set to `self.updated_at` which is the last time that the flow run was
                updated in the backend before this object was created.

        Returns:
            A list of `FlowRunLog` objects sorted by timestamp
        """

        client = prefect.Client()
        end_time = end_time or self.updated_at

        logs_query = {
            with_args(
                "logs",
                {
                    "order_by": {
                        EnumValue("timestamp"): EnumValue("asc")
                    },
                    "where": {
                        "_and": [
                            {
                                "timestamp": {
                                    "_lte": end_time.isoformat()
                                }
                            },
                            ({
                                "timestamp": {
                                    "_gt": start_time.isoformat()
                                }
                            } if start_time else {}),
                        ]
                    },
                },
            ): {
                "timestamp": True,
                "message": True,
                "level": True
            }
        }

        result = client.graphql({
            "query": {
                with_args(
                    "flow_run",
                    {
                        "where": {
                            "id": {
                                "_eq": self.flow_run_id
                            }
                        },
                    },
                ):
                logs_query
            }
        })

        # Unpack the result
        logs = result.get("data", {}).get("flow_run", [{}])[0].get("logs", [])

        return [FlowRunLog.from_dict(log) for log in logs]
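
A sketch of calling `get_logs`, assuming this method lives on Prefect 1.x's `prefect.backend.FlowRunView` (the flow run id placeholder is illustrative):

import pendulum
from prefect.backend import FlowRunView

view = FlowRunView.from_flow_run_id("<flow-run-id>")
# Restrict the query to the last hour; omit start_time to pull everything
# up to the view's `updated_at` timestamp.
for log in view.get_logs(start_time=pendulum.now("UTC").subtract(hours=1)):
    print(log.timestamp, log.level, log.message)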
Example #17
    def deploy(
        self,
        flow: "Flow",
        project_name: str,
        build: bool = True,
        set_schedule_active: bool = True,
        compressed: bool = True,
    ) -> str:
        """
        Push a new flow to Prefect Cloud

        Args:
            - flow (Flow): a flow to deploy
            - project_name (str): the project that should contain this flow.
            - build (bool, optional): if `True`, the flow's environment is built
                prior to serialization; defaults to `True`
            - set_schedule_active (bool, optional): if `False`, will set the
                schedule to inactive in the database to prevent auto-scheduling runs (if the Flow has a schedule).
                Defaults to `True`. This can be changed later.
            - compressed (bool, optional): if `True`, the serialized flow will be
                compressed; defaults to `True`

        Returns:
            - str: the ID of the newly-deployed flow

        Raises:
            - ClientError: if the deploy failed
        """
        required_parameters = {p for p in flow.parameters() if p.required}
        if flow.schedule is not None and required_parameters:
            raise ClientError(
                "Flows with required parameters can not be scheduled automatically."
            )
        if compressed:
            create_mutation = {
                "mutation($input: createFlowFromCompressedStringInput!)": {
                    "createFlowFromCompressedString(input: $input)": {"id"}
                }
            }
        else:
            create_mutation = {
                "mutation($input: createFlowInput!)": {
                    "createFlow(input: $input)": {"id"}
                }
            }

        query_project = {
            "query": {
                with_args("project", {"where": {"name": {"_eq": project_name}}}): {
                    "id": True
                }
            }
        }

        project = self.graphql(query_project).data.project  # type: ignore

        if not project:
            raise ValueError(
                "Project {} not found. Run `client.create_project({})` to create it.".format(
                    project_name, project_name
                )
            )

        serialized_flow = flow.serialize(build=build)  # type: Any
        if compressed:
            serialized_flow = compress(serialized_flow)
        res = self.graphql(
            create_mutation,
            input=dict(
                projectId=project[0].id,
                serializedFlow=serialized_flow,
                setScheduleActive=set_schedule_active,
            ),
        )  # type: Any

        flow_id = (
            res.data.createFlowFromCompressedString.id
            if compressed
            else res.data.createFlow.id
        )
        return flow_id
Example #18
def flow_run():
    """
    Execute a flow run in the context of a backend API.
    """
    flow_run_id = prefect.context.get("flow_run_id")
    if not flow_run_id:
        click.echo("Not currently executing a flow within a Cloud context.")
        raise Exception(
            "Not currently executing a flow within a Cloud context.")

    query = {
        "query": {
            with_args("flow_run", {"where": {
                "id": {
                    "_eq": flow_run_id
                }
            }}): {
                "flow": {
                    "name": True,
                    "storage": True
                },
                "version": True,
            }
        }
    }

    client = Client()
    result = client.graphql(query)
    flow_run = result.data.flow_run

    if not flow_run:
        click.echo("Flow run {} not found".format(flow_run_id))
        raise ValueError("Flow run {} not found".format(flow_run_id))

    try:
        flow_data = flow_run[0].flow
        storage_schema = prefect.serialization.storage.StorageSchema()
        storage = storage_schema.load(flow_data.storage)

        # populate global secrets
        secrets = prefect.context.get("secrets", {})
        for secret in storage.secrets:
            secrets[secret] = PrefectSecret(name=secret).run()

        with prefect.context(secrets=secrets, loading_flow=True):
            flow = storage.get_flow(storage.flows[flow_data.name])

        with prefect.context(secrets=secrets):
            if getattr(flow, "run_config", None) is not None:
                runner_cls = get_default_flow_runner_class()
                runner_cls(flow=flow).run()
            else:
                environment = flow.environment
                environment.setup(flow)
                environment.execute(flow)
    except Exception as exc:
        msg = "Failed to load and execute Flow's environment: {}".format(
            repr(exc))
        state = prefect.engine.state.Failed(message=msg)
        client.set_flow_run_state(flow_run_id=flow_run_id, state=state)
        click.echo(str(exc))
        raise exc
Example #19
    def run(self,
            flow_name: str = None,
            project_name: str = None,
            parameters: dict = None) -> str:
        """
        Run method for the task; responsible for scheduling the specified flow run.

        Args:
            - flow_name (str, optional): the name of the flow to schedule; if not provided, this method will
                use the flow name provided at initialization
            - project_name (str, optional): the Cloud project in which the flow is located; if not provided, this method
                will use the project provided at initialization. If running with Prefect Core's server as the backend,
                this should not be provided.
            - parameters (dict, optional): the parameters to pass to the flow run being scheduled; if not provided,
                this method will use the parameters provided at initialization

        Returns:
            - str: the ID of the newly-scheduled flow run

        Raises:
            - ValueError: if flow was not provided, cannot be found, or if a project name was not provided while using
                Cloud as a backend

        Example:
            ```python
            from prefect.tasks.prefect.flow_run import FlowRunTask

            kickoff_task = FlowRunTask(project_name="Hello, World!", flow_name="My Cloud Flow")
            ```

        """
        # verify that flow and project names were passed where necessary
        if flow_name is None:
            raise ValueError("Must provide a flow name.")
        if project_name is None and config.backend == "cloud":
            raise ValueError("Must provide a project name.")

        where_clause = {
            "name": {
                "_eq": flow_name
            },
            "archived": {
                "_eq": False
            },
        }

        if project_name:
            where_clause["project"] = {"name": {"_eq": project_name}}

        # find the flow ID to schedule
        query = {
            "query": {
                with_args(
                    "flow",
                    {
                        "where": where_clause,
                        "order_by": {
                            "version": EnumValue("desc")
                        },
                        "limit": 1,
                    },
                ): {"id"}
            }
        }

        client = Client()
        flow = client.graphql(query).data.flow

        # verify that a flow has been returned
        if not flow:
            raise ValueError("Flow '{}' not found.".format(flow_name))

        # grab the ID for the most recent version
        flow_id = flow[0].id
        return client.create_flow_run(flow_id=flow_id, parameters=parameters)
Example #20
def _run_flow(
    name,
    version,
    parameters_file,
    parameters_string,
    run_name,
    watch,
    logs,
    no_url,
    project=None,
):
    if watch and logs:
        click.secho(
            "Streaming state and logs not currently supported together.",
            fg="red")
        return

    where_clause = {
        "_and": {
            "name": {
                "_eq": name
            },
            "version": {
                "_eq": version
            },
            "project": {
                "name": {
                    "_eq": project
                }
            },
        }
    }

    query = {
        "query": {
            with_args(
                "flow",
                {
                    "where": where_clause,
                    "order_by": {
                        "name": EnumValue("asc"),
                        "version": EnumValue("desc"),
                    },
                    "distinct_on": EnumValue("name"),
                },
            ): {
                "id": True
            }
        }
    }

    client = Client()
    result = client.graphql(query)

    flow_data = result.data.flow

    if flow_data:
        flow_id = flow_data[0].id
    else:
        click.secho("{} not found".format(name), fg="red")
        return

    # Load parameters from file if provided
    file_params = {}
    if parameters_file:
        with open(parameters_file) as params_file:
            file_params = json.load(params_file)

    # Load parameters from string if provided
    string_params = {}
    if parameters_string:
        string_params = json.loads(parameters_string)

    flow_run_id = client.create_flow_run(flow_id=flow_id,
                                         parameters={
                                             **file_params,
                                             **string_params
                                         },
                                         run_name=run_name)

    if no_url:
        click.echo("Flow Run ID: {}".format(flow_run_id))
    else:
        flow_run_url = client.get_cloud_url("flow-run", flow_run_id)
        click.echo("Flow Run: {}".format(flow_run_url))

    if watch:
        current_states = []
        while True:
            query = {
                "query": {
                    with_args("flow_run_by_pk", {"id": flow_run_id}): {
                        with_args(
                            "states",
                            {
                                "order_by": {
                                    EnumValue("timestamp"): EnumValue("asc")
                                }
                            },
                        ): {
                            "state": True,
                            "timestamp": True
                        }
                    }
                }
            }

            result = client.graphql(query)

            # Filter through retrieved states and output in order
            for state_index in result.data.flow_run_by_pk.states:
                state = state_index.state
                if state not in current_states:
                    if state != "Success" and state != "Failed":
                        click.echo("{} -> ".format(state), nl=False)
                    else:
                        click.echo(state)
                        return flow_run_id

                    current_states.append(state)

            time.sleep(3)

    if logs:
        all_logs = []

        log_query = {
            with_args("logs", {
                "order_by": {
                    EnumValue("timestamp"): EnumValue("asc")
                }
            }): {
                "timestamp": True,
                "message": True,
                "level": True
            },
            "start_time": True,
            "state": True,
        }

        query = {
            "query": {
                with_args(
                    "flow_run",
                    {
                        "where": {
                            "id": {
                                "_eq": flow_run_id
                            }
                        },
                        "order_by": {
                            EnumValue("start_time"): EnumValue("desc")
                        },
                    },
                ):
                log_query
            }
        }

        while True:
            result = client.graphql(query)

            flow_run = result.data.flow_run
            if not flow_run:
                click.secho("{} not found".format(flow_run_id), fg="red")
                return

            new_run = flow_run[0]
            logs = new_run.logs  # note: rebinds the --logs flag; safe, the flag is not read again
            output = []

            for i in logs:
                if [i.timestamp, i.level, i.message] not in all_logs:

                    if not len(all_logs):
                        click.echo(
                            tabulate(
                                [[i.timestamp, i.level, i.message]],
                                headers=["TIMESTAMP", "LEVEL", "MESSAGE"],
                                tablefmt="plain",
                                numalign="left",
                                stralign="left",
                            ))
                        all_logs.append([i.timestamp, i.level, i.message])
                        continue

                    output.append([i.timestamp, i.level, i.message])
                    all_logs.append([i.timestamp, i.level, i.message])

            if output:
                click.echo(
                    tabulate(output,
                             tablefmt="plain",
                             numalign="left",
                             stralign="left"))

            if new_run.state == "Success" or new_run.state == "Failed":
                return flow_run_id

            time.sleep(3)

    return flow_run_id
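For orientation, the nested dictionaries used throughout these examples are just a query-building DSL. Below is a minimal sketch of how the flow-lookup query above serializes to GraphQL text, assuming `parse_graphql` is available from `prefect.utilities.graphql` to render such a dictionary; the flow name is hypothetical.

# A minimal sketch, assuming `parse_graphql` from `prefect.utilities.graphql`.
from prefect.utilities.graphql import EnumValue, parse_graphql, with_args

query = {
    "query": {
        with_args(
            "flow",
            {
                "where": {"name": {"_eq": "my-flow"}},  # hypothetical flow name
                "order_by": {"version": EnumValue("desc")},
                "distinct_on": EnumValue("name"),
            },
        ): {"id": True}
    }
}

# Prints the GraphQL document that Client().graphql(query) would send.
print(parse_graphql(query))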
Example #21
def cloud(
    name,
    project,
    version,
    parameters_file,
    parameters_string,
    run_name,
    watch,
    logs,
    no_url,
):
    """
    Run a registered flow in Prefect Cloud.

    \b
    Options:
        --name, -n                  TEXT        The name of a flow to run                                       [required]
        --project, -p               TEXT        The name of a project that contains the flow                    [required]
        --version, -v               INTEGER     A flow version to run
        --parameters-file, -pf      FILE PATH   A filepath of a JSON file containing parameters
        --parameters-string, -ps    TEXT        A string of JSON parameters
        --run-name, -rn             TEXT        A name to assign to this run
        --watch, -w                             Watch current state of the flow run, stream output to stdout
        --logs, -l                              Get logs of the flow run, stream output to stdout
        --no-url                                Only output the flow run id instead of a link

    \b
    If both `--parameters-file` and `--parameters-string` are provided then the values passed
    in through the string will override the values provided from the file.

    \b
    e.g.
    File contains:  {"a": 1, "b": 2}
    String:         '{"a": 3}'
    Parameters passed to the flow run: {"a": 3, "b": 2}
    """

    if watch and logs:
        click.secho(
            "Streaming state and logs not currently supported together.",
            fg="red")
        return

    query = {
        "query": {
            with_args(
                "flow",
                {
                    "where": {
                        "_and": {
                            "name": {
                                "_eq": name
                            },
                            "version": {
                                "_eq": version
                            },
                            "project": {
                                "name": {
                                    "_eq": project
                                }
                            },
                        }
                    },
                    "order_by": {
                        "name": EnumValue("asc"),
                        "version": EnumValue("desc"),
                    },
                    "distinct_on": EnumValue("name"),
                },
            ): {
                "id": True
            }
        }
    }

    client = Client()
    result = client.graphql(query)

    flow_data = result.data.flow

    if flow_data:
        flow_id = flow_data[0].id
    else:
        click.secho("{} not found".format(name), fg="red")
        return

    # Load parameters from file if provided
    file_params = {}
    if parameters_file:
        with open(parameters_file) as params_file:
            file_params = json.load(params_file)

    # Load parameters from string if provided
    string_params = {}
    if parameters_string:
        string_params = json.loads(parameters_string)

    flow_run_id = client.create_flow_run(flow_id=flow_id,
                                         parameters={
                                             **file_params,
                                             **string_params
                                         },
                                         run_name=run_name)

    if no_url:
        click.echo("Flow Run ID: {}".format(flow_run_id))
    else:
        # Generate direct link to Cloud run
        tenant_slug = client.get_default_tenant_slug()

        url = (re.sub("api-", "", config.cloud.api)
               if re.search("api-", config.cloud.api) else re.sub(
                   "api", "cloud", config.cloud.api))

        flow_run_url = "/".join(
            [url.rstrip("/"), tenant_slug, "flow-run", flow_run_id])

        click.echo("Flow Run: {}".format(flow_run_url))

    if watch:
        current_states = []
        while True:
            query = {
                "query": {
                    with_args("flow_run_by_pk", {"id": flow_run_id}): {
                        with_args(
                            "states",
                            {
                                "order_by": {
                                    EnumValue("timestamp"): EnumValue("asc")
                                }
                            },
                        ): {
                            "state": True,
                            "timestamp": True
                        }
                    }
                }
            }

            result = client.graphql(query)

            # Filter through retrieved states and output in order
            for state_index in result.data.flow_run_by_pk.states:
                state = state_index.state
                if state not in current_states:
                    if state != "Success" and state != "Failed":
                        click.echo("{} -> ".format(state), nl=False)
                    else:
                        click.echo(state)
                        return

                    current_states.append(state)

            time.sleep(3)

    if logs:
        all_logs = []

        log_query = {
            with_args("logs", {
                "order_by": {
                    EnumValue("timestamp"): EnumValue("asc")
                }
            }): {
                "timestamp": True,
                "message": True,
                "level": True
            },
            "start_time": True,
        }

        query = {
            "query": {
                with_args(
                    "flow_run",
                    {
                        "where": {
                            "id": {
                                "_eq": flow_run_id
                            }
                        },
                        "order_by": {
                            EnumValue("start_time"): EnumValue("desc")
                        },
                    },
                ):
                log_query
            }
        }

        while True:
            result = client.graphql(query)

            flow_run = result.data.flow_run
            if not flow_run:
                click.secho("{} not found".format(flow_run_id), fg="red")
                return

            new_run = flow_run[0]
            logs = new_run.logs  # note: rebinds the --logs flag; safe, the flag is not read again
            output = []

            for i in logs:
                if [i.timestamp, i.level, i.message] not in all_logs:

                    if not len(all_logs):
                        click.echo(
                            tabulate(
                                [[i.timestamp, i.level, i.message]],
                                headers=["TIMESTAMP", "LEVEL", "MESSAGE"],
                                tablefmt="plain",
                                numalign="left",
                                stralign="left",
                            ))
                        all_logs.append([i.timestamp, i.level, i.message])
                        continue

                    output.append([i.timestamp, i.level, i.message])
                    all_logs.append([i.timestamp, i.level, i.message])

            if output:
                click.echo(
                    tabulate(output,
                             tablefmt="plain",
                             numalign="left",
                             stralign="left"))

            # Check if state is either Success or Failed, exit if it is
            pk_query = {
                "query": {
                    with_args("flow_run_by_pk", {"id": flow_run_id}): {
                        "state": True
                    }
                }
            }
            result = client.graphql(pk_query)

            if (result.data.flow_run_by_pk.state == "Success"
                    or result.data.flow_run_by_pk.state == "Failed"):
                return

            time.sleep(3)
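The URL rewriting above is easy to miss; here is a self-contained sketch of the same `re.sub` logic (the URLs are illustrative):

import re

def cloud_ui_url(api_url: str) -> str:
    # Mirrors the rewrite above: "api-" subdomains are stripped,
    # otherwise "api" becomes "cloud".
    if re.search("api-", api_url):
        return re.sub("api-", "", api_url)
    return re.sub("api", "cloud", api_url)

assert cloud_ui_url("https://api.prefect.io") == "https://cloud.prefect.io"
assert cloud_ui_url("https://api-staging.prefect.io") == "https://staging.prefect.io"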
Example #22
    def run_flow(self) -> None:
        """
        Run the flow using a Dask executor
        """
        # Call on_start callback if specified
        if self.on_start:
            self.on_start()

        try:
            from prefect.engine import get_default_flow_runner_class
            from prefect.engine.executors import DaskExecutor
            from dask_kubernetes import KubeCluster

            if self._worker_spec:
                worker_pod = self._worker_spec
                worker_pod = self._populate_worker_spec_yaml(
                    yaml_obj=worker_pod)
            else:
                with open(path.join(path.dirname(__file__),
                                    "worker_pod.yaml")) as pod_file:
                    worker_pod = yaml.safe_load(pod_file)
                    worker_pod = self._populate_worker_pod_yaml(
                        yaml_obj=worker_pod)

            cluster = KubeCluster.from_dict(
                worker_pod, namespace=prefect.context.get("namespace"))
            cluster.adapt(minimum=self.min_workers, maximum=self.max_workers)

            flow_run_id = prefect.context.get("flow_run_id")

            if not flow_run_id:
                raise ValueError("No flow run ID found in context.")

            query = {
                "query": {
                    with_args("flow_run", {
                        "where": {
                            "id": {
                                "_eq": flow_run_id
                            }
                        }
                    }): {
                        "flow": {
                            "name": True,
                            "storage": True,
                        },
                    }
                }
            }

            client = Client()
            result = client.graphql(query)
            flow_run = result.data.flow_run[0]

            flow_data = flow_run.flow
            storage_schema = prefect.serialization.storage.StorageSchema()
            storage = storage_schema.load(flow_data.storage)

            # populate global secrets
            secrets = prefect.context.get("secrets", {})
            for secret in storage.secrets:
                secrets[secret] = prefect.tasks.secrets.PrefectSecret(
                    name=secret).run()

            with prefect.context(secrets=secrets):
                flow = storage.get_flow(storage.flows[flow_data.name])
                executor = DaskExecutor(address=cluster.scheduler_address)
                runner_cls = get_default_flow_runner_class()
                runner_cls(flow=flow).run(executor=executor)
        except Exception as exc:
            self.logger.exception(
                "Unexpected error raised during flow run: {}".format(exc))
            raise exc
        finally:
            # Call on_exit callback if specified
            if self.on_exit:
                self.on_exit()
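The metadata lookup buried in `run_flow` can be lifted out for testing; a sketch, assuming `Client` is importable from `prefect.client` as used throughout these examples:

from prefect.client import Client  # assumed import path
from prefect.utilities.graphql import with_args

def fetch_flow_metadata(flow_run_id: str):
    # Fetches the flow name and serialized storage for a run, mirroring
    # the query inside run_flow above.
    query = {
        "query": {
            with_args("flow_run", {"where": {"id": {"_eq": flow_run_id}}}): {
                "flow": {"name": True, "storage": True},
            }
        }
    }
    return Client().graphql(query).data.flow_run[0].flow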
Example #23
    def query_flow_runs(self) -> list:
        """
        Query Prefect Cloud for flow runs which need to be deployed and executed

        Returns:
            - list: A list of GraphQLResult flow run objects
        """
        self.logger.debug("Querying for flow runs")
        # keep a copy of which runs were being submitted before the query (future callbacks
        # may be updating this set)
        currently_submitting_flow_runs = self.submitting_flow_runs.copy()

        # Get scheduled flow runs from queue
        mutation = {
            "mutation($input: get_runs_in_queue_input!)": {
                "get_runs_in_queue(input: $input)": {"flow_run_ids"}
            }
        }

        now = pendulum.now("UTC")
        result = self.client.graphql(
            mutation,
            variables={
                "input": {
                    "before": now.isoformat(),
                    "labels": list(self.labels),
                    "tenant_id": self.client._active_tenant_id,
                }
            },
        )

        # We queried all of the available flow runs; however, some may have already been pulled
        # by this agent and are in the process of being submitted in the background. We do not
        # want to act on these "duplicate" flow runs until we've been assured that the background
        # thread has attempted to submit the work (successful or otherwise).
        flow_run_ids = set(
            result.data.get_runs_in_queue.flow_run_ids)  # type: ignore

        if flow_run_ids:
            msg = "Found flow runs {}".format(
                result.data.get_runs_in_queue.flow_run_ids)
        else:
            msg = "No flow runs found"

        already_submitting = flow_run_ids & currently_submitting_flow_runs
        target_flow_run_ids = flow_run_ids - already_submitting

        if already_submitting:
            msg += " ({} already submitting: {})".format(
                len(already_submitting), list(already_submitting))

        self.logger.debug(msg)

        # Query metadata for flow runs found in queue
        query = {
            "query": {
                with_args(
                    "flow_run",
                    {
                        # match flow runs in the flow_run_ids list
                        "where": {
                            "id": {
                                "_in": list(target_flow_run_ids)
                            },
                            "_or": [
                                # who are EITHER scheduled...
                                {
                                    "state": {
                                        "_eq": "Scheduled"
                                    }
                                },
                                # OR running with task runs scheduled to start more than 3
                                # seconds ago
                                {
                                    "state": {
                                        "_eq": "Running"
                                    },
                                    "task_runs": {
                                        "state_start_time": {
                                            "_lte": str(
                                                now.subtract(seconds=3))  # type: ignore
                                        }
                                    },
                                },
                            ],
                        },
                    },
                ): {
                    "id": True,
                    "version": True,
                    "state": True,
                    "serialized_state": True,
                    "parameters": True,
                    "scheduled_start_time": True,
                    "flow": {
                        "id",
                        "name",
                        "environment",
                        "run_config",
                        "storage",
                        "version",
                        "core_version",
                    },
                    with_args(
                        "task_runs",
                        {
                            "where": {
                                "state_start_time": {
                                    "_lte": str(now.subtract(seconds=3))  # type: ignore
                                }
                            }
                        },
                    ): {"id", "version", "task_id", "serialized_state"},
                }
            }
        }

        if target_flow_run_ids:
            self.logger.debug("Querying flow run metadata")
            result = self.client.graphql(query)

            # Return flow runs sorted by scheduled start time
            return sorted(result.data.flow_run,
                          key=lambda flow_run: flow_run.scheduled_start_time)
        else:
            return []
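The duplicate suppression above reduces to plain set arithmetic; a sketch with hypothetical run ids:

# Hypothetical ids: what the queue returned vs. what is already in flight.
queued = {"run-a", "run-b", "run-c"}
submitting = {"run-b"}

already_submitting = queued & submitting  # suppressed this cycle
target = queued - already_submitting      # acted on this cycle

assert target == {"run-a", "run-c"}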
Example #24
    async def reap_zombie_task_runs(self,
                                    heartbeat_cutoff: datetime.datetime = None
                                    ) -> int:
        """
        Zombie tasks are tasks that claim to be Running, but haven't updated their heartbeat.

        This method either retries them or marks them as failed.

        Returns:
            - int: the number of zombie task runs that were handled
        """
        zombies = 0
        heartbeat_cutoff = heartbeat_cutoff or pendulum.now("utc").subtract(
            minutes=10)

        where_clause = await self.get_task_runs_where_clause(
            heartbeat_cutoff=heartbeat_cutoff)

        task_runs = await models.TaskRun.where(where_clause).get(
            selection_set={
                "id": True,
                "flow_run_id": True,
                "tenant_id": True,
                # Information about the current flow run state
                "flow_run": {"state"},
                # get information about retries from task
                "task": {"max_retries", "retry_delay"},
                # count the number of retrying states for this task run
                with_args(
                    "retry_count: states_aggregate",
                    {"where": {
                        "state": {
                            "_eq": "Retrying"
                        }
                    }},
                ): {
                    "aggregate": {"count"}
                },
            },
            limit=5000,
            order_by={"updated": EnumValue("desc")},
            apply_schema=False,
        )

        if task_runs:
            self.logger.info(
                f"Zombie killer found {len(task_runs)} task runs.")

        # Set task run states to failed
        for tr in task_runs:
            try:
                # if the flow run is running and retries are available, mark as retrying
                if (tr.flow_run.state == "Running"
                        and tr.retry_count.aggregate.count <
                    (tr.task.max_retries or 0)):
                    message = (
                        "No heartbeat detected from the remote task; retrying the run."
                        f"This will be retry {tr.retry_count.aggregate.count + 1} of {tr.task.max_retries}."
                    )
                    retry_delay = orm._as_timedelta(tr.task.retry_delay or "0")
                    await prefect.api.states.set_task_run_state(
                        task_run_id=tr.id,
                        state=Retrying(
                            message=message,
                            run_count=tr.retry_count.aggregate.count + 1,
                            start_time=pendulum.now("UTC") + retry_delay,
                        ),
                    )

                # mark failed
                else:
                    message = "No heartbeat detected from the remote task; marking the run as failed."
                    await prefect.api.states.set_task_run_state(
                        task_run_id=tr.id,
                        state=Failed(message=message),
                    )

                # log the state change to the task run
                await prefect.api.logs.create_logs(
                    [
                        dict(
                            tenant_id=tr.tenant_id,
                            flow_run_id=tr.flow_run_id,
                            task_run_id=tr.id,
                            name=f"{self.logger.name}.TaskRun",
                            message=message,
                            level="ERROR",
                        )
                    ],
                    defer_db_write=False,
                )

                zombies += 1

            except ValueError as exc:
                self.logger.error(exc)

        if zombies:
            self.logger.info(f"Addressed {zombies} zombie task runs.")

        return zombies
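The retry-or-fail decision can be isolated into a pure predicate; a sketch of the condition used above:

from typing import Optional

def should_retry(flow_run_state: str, retry_count: int,
                 max_retries: Optional[int]) -> bool:
    # Retry only while the parent flow run is still Running and the task
    # has retries remaining; otherwise the zombie is marked Failed.
    return flow_run_state == "Running" and retry_count < (max_retries or 0)

assert should_retry("Running", 0, 3)
assert not should_retry("Running", 3, 3)
assert not should_retry("Success", 0, None)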
Example #25
def tasks(name, flow_name, flow_version, project, limit):
    """
    Query information regarding your Prefect tasks.

    \b
    Options:
        --name, -n          TEXT    A task name to query
        --flow-name, -fn    TEXT    A flow name to query
        --flow-version, -fv INTEGER A flow version to query
        --project, -p       TEXT    The name of a project to query
        --limit, -l         INTEGER The maximum number of tasks to query; defaults to 10
    """

    where_clause = {
        "_and": {
            "name": {
                "_eq": name
            },
            "flow": {
                "name": {
                    "_eq": flow_name
                },
                "version": {
                    "_eq": flow_version
                },
            },
        }
    }

    if project:
        where_clause["_and"]["flow"]["project"] = {"name": {"_eq": project}}

    query = {
        "query": {
            with_args(
                "task",
                {
                    "where": where_clause,
                    "limit": limit,
                    "order_by": {
                        "created": EnumValue("desc")
                    },
                },
            ): {
                "name": True,
                "created": True,
                "flow": {
                    "name": True,
                    "version": True
                },
                "mapped": True,
                "type": True,
            }
        }
    }

    result = Client().graphql(query)

    task_data = result.data.task

    output = []
    for item in task_data:
        output.append([
            item.name,
            item.flow.name,
            item.flow.version,
            pendulum.parse(item.created).diff_for_humans(),
            item.mapped,
            item.type,
        ])

    click.echo(
        tabulate(
            output,
            headers=[
                "NAME", "FLOW NAME", "FLOW VERSION", "AGE", "MAPPED", "TYPE"
            ],
            tablefmt="plain",
            numalign="left",
            stralign="left",
        ))
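All of the CLI commands here render with `tabulate` in plain format; a standalone sketch with a hypothetical row:

from tabulate import tabulate

rows = [["say_hello", "etl-flow", 3, "2 days ago", False, "prefect.Task"]]  # hypothetical
print(
    tabulate(
        rows,
        headers=["NAME", "FLOW NAME", "FLOW VERSION", "AGE", "MAPPED", "TYPE"],
        tablefmt="plain",
        numalign="left",
        stralign="left",
    )
)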
Example #26
def cloud(name, project, version, watch):
    """
    Run a deployed flow in Prefect Cloud.

    \b
    Options:
        --name, -n      TEXT    The name of a flow to run                                       [required]
        --project, -p   TEXT    The name of a project that contains the flow                    [required]
        --version, -v   INTEGER A flow version to run
        --watch, -w             Watch current state of the flow run, stream output to stdout
    """

    query = {
        "query": {
            with_args(
                "flow",
                {
                    "where": {
                        "_and": {
                            "name": {
                                "_eq": name
                            },
                            "version": {
                                "_eq": version
                            },
                            "project": {
                                "name": {
                                    "_eq": project
                                }
                            },
                        }
                    },
                    "order_by": {
                        "name": EnumValue("asc"),
                        "version": EnumValue("desc"),
                    },
                    "distinct_on": EnumValue("name"),
                },
            ): {
                "id": True
            }
        }
    }

    client = Client()
    result = client.graphql(query)

    flow_data = result.data.flow

    if flow_data:
        flow_id = flow_data[0].id
    else:
        click.secho("{} not found".format(name), fg="red")
        return

    flow_run_id = client.create_flow_run(flow_id=flow_id)
    click.echo("Flow Run ID: {}".format(flow_run_id))

    # TODO: Convert to using a subscription and make output prettier
    if watch:
        current_state = ""
        while True:
            query = {
                "query": {
                    with_args("flow_run_by_pk", {"id": flow_run_id}): {
                        "state": True
                    }
                }
            }

            result = client.graphql(query)

            if result.data.flow_run_by_pk.state != current_state:
                current_state = result.data.flow_run_by_pk.state
                if current_state != "Success" and current_state != "Failed":
                    click.echo("{} -> ".format(current_state), nl=False)
                else:
                    click.echo(current_state)
                    break
            time.sleep(3)
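The watch loop generalizes to any state source; a sketch where `fetch_state` is a hypothetical zero-argument callable returning the current state string:

import time

def watch_until_terminal(fetch_state, poll_seconds: int = 3) -> str:
    # Polls fetch_state every poll_seconds, printing transitions as
    # "State -> " until a terminal Success/Failed state is reached.
    current = ""
    while True:
        state = fetch_state()
        if state != current:
            current = state
            if current in ("Success", "Failed"):
                print(current)
                return current
            print("{} -> ".format(current), end="", flush=True)
        time.sleep(poll_seconds)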
Example #27
def flows(name, version, project, limit, all_versions):
    """
    Query information regarding your Prefect flows.

    \b
    Options:
        --name, -n      TEXT    A flow name to query
        --version, -v   TEXT    A flow version to query
        --project, -p   TEXT    The name of a project to query
        --limit, -l     INTEGER The maximum number of flows to query; defaults to 10
        --all-versions          Output all versions of a flow; defaults to only the most recent
    """

    distinct_on = EnumValue("name")
    if all_versions:
        distinct_on = None

    where_clause = {
        "_and": {
            "name": {
                "_eq": name
            },
            "version": {
                "_eq": version
            },
        }
    }

    query_results = {
        "name": True,
        "version": True,
        "created": True,
    }

    headers = ["NAME", "VERSION", "AGE"]

    if project:
        where_clause["_and"]["project"] = {"name": {"_eq": project}}
        query_results["project"] = {"name": True}
        headers.append("PROJECT NAME")

    query = {
        "query": {
            with_args(
                "flow",
                {
                    "where": where_clause,
                    "order_by": {
                        "name": EnumValue("asc"),
                        "version": EnumValue("desc"),
                    },
                    "distinct_on": distinct_on,
                    "limit": limit,
                },
            ):
            query_results
        }
    }

    result = Client().graphql(query)

    flow_data = result.data.flow

    output = []
    for item in flow_data:
        result_output = [
            item.name,
            item.version,
            pendulum.parse(item.created).diff_for_humans(),
        ]
        if project:
            result_output.append(item.project.name)

        output.append(result_output)

    click.echo(
        tabulate(
            output,
            headers=headers,
            tablefmt="plain",
            numalign="left",
            stralign="left",
        ))
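The AGE column comes from `pendulum`; a sketch of the conversion used above:

import pendulum

# A created timestamp two days in the past, rendered the way the table does.
created = pendulum.now("UTC").subtract(days=2).isoformat()
print(pendulum.parse(created).diff_for_humans())  # e.g. "2 days ago"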
Example #28
    def query_flow_runs(self, tenant_id: str) -> list:
        """
        Query Prefect Cloud for flow runs which need to be deployed and executed

        Args:
            - tenant_id (str): The tenant id to use in the query

        Returns:
            - list: A list of GraphQLResult flow run objects
        """
        self.logger.debug("Querying for flow runs")

        # Get scheduled flow runs from queue
        mutation = {
            "mutation($input: getRunsInQueueInput!)": {
                "getRunsInQueue(input: $input)": {"flow_run_ids"}
            }
        }

        now = pendulum.now("UTC")
        result = self.client.graphql(
            mutation,
            variables={
                "input": {
                    "tenantId": tenant_id,
                    "before": now.isoformat()
                }
            },
        )
        flow_run_ids = result.data.getRunsInQueue.flow_run_ids  # type: ignore
        self.logger.debug("Found flow runs {}".format(flow_run_ids))

        # Query metadata for flow runs found in queue
        query = {
            "query": {
                with_args(
                    "flow_run",
                    {
                        # match flow runs in the flow_run_ids list
                        "where": {
                            "id": {
                                "_in": flow_run_ids
                            },
                            "_or": [
                                # who are EITHER scheduled...
                                {
                                    "state": {
                                        "_eq": "Scheduled"
                                    }
                                },
                                # OR running with task runs scheduled to start more than 3 seconds ago
                                {
                                    "state": {
                                        "_eq": "Running"
                                    },
                                    "task_runs": {
                                        "state_start_time": {
                                            "_lte": str(now.subtract(seconds=3))
                                        }
                                    },
                                },
                            ],
                        }
                    },
                ): {
                    "id": True,
                    "version": True,
                    "tenant_id": True,
                    "state": True,
                    "serialized_state": True,
                    "parameters": True,
                    "flow": {"id", "name", "environment", "storage"},
                    with_args(
                        "task_runs",
                        {
                            "where": {
                                "state_start_time": {
                                    "_lte": str(now.subtract(seconds=3))
                                }
                            }
                        },
                    ): {"id", "version", "task_id", "serialized_state"},
                }
            }
        }

        self.logger.debug("Querying flow run metadata")
        result = self.client.graphql(query)
        return result.data.flow_run  # type: ignore
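The `where` clause above encodes "scheduled, or running with overdue task runs"; a sketch of it as a pure builder:

def runnable_runs_where(flow_run_ids: list, cutoff_iso: str) -> dict:
    # Matches queued runs that are either Scheduled, or Running with
    # task runs due to start at or before the cutoff timestamp.
    return {
        "id": {"_in": flow_run_ids},
        "_or": [
            {"state": {"_eq": "Scheduled"}},
            {
                "state": {"_eq": "Running"},
                "task_runs": {"state_start_time": {"_lte": cutoff_iso}},
            },
        ],
    }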
Example #29
    def get_flow_run_info(self, flow_run_id: str) -> FlowRunInfoResult:
        """
        Retrieves version and current state information for the given flow run.

        Args:
            - flow_run_id (str): the id of the flow run to get information for

        Returns:
            - FlowRunInfoResult: an object representing information about the flow run

        Raises:
            - ClientError: if the GraphQL query is bad for any reason
        """
        query = {
            "query": {
                with_args("flow_run_by_pk", {"id": flow_run_id}): {
                    "id": True,
                    "name": True,
                    "flow_id": True,
                    "parameters": True,
                    "context": True,
                    "version": True,
                    "scheduled_start_time": True,
                    "serialized_state": True,
                    # load all task runs except dynamic task runs
                    with_args("task_runs", {
                        "where": {
                            "map_index": {
                                "_eq": -1
                            }
                        }
                    }): {
                        "id": True,
                        "task": {
                            "id": True,
                            "slug": True
                        },
                        "version": True,
                        "serialized_state": True,
                    },
                }
            }
        }
        result = self.graphql(query).data.flow_run_by_pk  # type: ignore

        if result is None:
            raise ClientError(
                'Flow run ID not found: "{}"'.format(flow_run_id))

        # convert scheduled_start_time from string to datetime
        result.scheduled_start_time = pendulum.parse(
            result.scheduled_start_time)

        # create "state" attribute from serialized_state
        result.state = prefect.engine.state.State.deserialize(
            result.pop("serialized_state"))

        # reformat task_runs
        task_runs = []
        for tr in result.task_runs:
            tr.state = prefect.engine.state.State.deserialize(
                tr.pop("serialized_state"))
            task_info = tr.pop("task")
            tr.task_id = task_info["id"]
            tr.task_slug = task_info["slug"]
            task_runs.append(TaskRunInfoResult(**tr))

        result.task_runs = task_runs
        result.context = (result.context.to_dict()
                          if result.context is not None else None)
        result.parameters = (result.parameters.to_dict()
                             if result.parameters is not None else None)
        return FlowRunInfoResult(**result)
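The task-run reshaping is just dictionary flattening; a sketch on plain dicts with hypothetical values:

raw = {"id": "tr-1", "version": 2, "task": {"id": "t-1", "slug": "say_hello-1"}}

# Flatten the nested `task` object into task_id / task_slug, as above.
task_info = raw.pop("task")
raw["task_id"] = task_info["id"]
raw["task_slug"] = task_info["slug"]

assert raw == {"id": "tr-1", "version": 2,
               "task_id": "t-1", "task_slug": "say_hello-1"}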
Example #30
def flows(name, version, project, output):
    """
    Describe a Prefect flow.

    \b
    Options:
        --name, -n      TEXT    A flow name to query                [required]
        --version, -v   INTEGER A flow version to query
        --project, -p   TEXT    The name of a project to query
        --output, -o    TEXT    Output style, currently supports `json`.
                                Defaults to Python dictionary format.
    """

    where_clause = {
        "_and": {
            "name": {
                "_eq": name
            },
            "version": {
                "_eq": version
            },
        }
    }
    query_results = {
        "name": True,
        "version": True,
        "created": True,
        "description": True,
        "parameters": True,
        "archived": True,
        "storage": True,
        "environment": True,
    }

    if project:
        where_clause["_and"]["project"] = {"name": {"_eq": project}}
        query_results["project"] = {"name": True}

    query = {
        "query": {
            with_args(
                "flow",
                {
                    "where": where_clause,
                    "order_by": {
                        "name": EnumValue("asc"),
                        "version": EnumValue("desc"),
                    },
                    "distinct_on": EnumValue("name"),
                },
            ):
            query_results
        }
    }

    result = Client().graphql(query)

    flow_data = result.data.flow
    if flow_data:
        if output == "json":
            click.echo(json.dumps(flow_data[0]))
        else:
            click.echo(flow_data[0])
    else:
        click.secho("{} not found".format(name), fg="red")
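`order_by` plus `distinct_on: name` selects the newest version per flow name server-side; a client-side sketch of the same idea with hypothetical rows:

from itertools import groupby

# Rows already sorted by name asc, version desc, as in the query above.
flows = [
    {"name": "etl", "version": 3},
    {"name": "etl", "version": 2},
    {"name": "report", "version": 1},
]

latest = [next(group) for _, group in groupby(flows, key=lambda f: f["name"])]
assert latest == [{"name": "etl", "version": 3}, {"name": "report", "version": 1}]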