Example #1
    def start_sync_run(
        self,
        sync_id: int,
        wait_for_completion: bool,
        wait_time_between_api_calls: int,
        max_wait_time: int,
    ) -> Dict:
        """
        Start a new sync run.
        Optionally, wait for run completion.
        The sync run is triggered via the
        [Start new sync run API](https://hightouch.io/docs/syncs/api/#start-a-new-sync-run).

        Args:
            - sync_id (int): The sync identifier.
            - wait_for_completion (bool): Whether to wait for the sync run completion or not.
            - wait_time_between_api_calls (int): The number of seconds to wait between API calls.
                This is used only if `wait_for_completion` is `True`.
            - max_wait_time (int): The maximum number of seconds to wait for the sync run to complete.
                This is used only if `wait_for_completion` is `True`.

        Raises:
            - `prefect.engine.signals.FAIL` if the response status code is not 200.
            - `prefect.engine.signals.FAIL` if the sync run takes more
                than `max_wait_time` seconds to complete.

        Returns:
            - If `wait_for_completion` is `True`, returns the JSON response
                containing the status of the sync run.
            - If `wait_for_completion` is `False`, returns the JSON response
                containing information about the start sync run action.
        """
        url = f"{self.__HIGHTOUCH_START_NEW_SYNC_RUN_URL}/{sync_id}"
        with self.session.post(url) as response:
            if response.status_code != 200:
                msg = f"Error while starting sync run. Error is: {response.reason}."
                raise FAIL(message=msg)

            start_sync_response = response.json()
            if wait_for_completion:
                elapsed_wait_time = 0
                sync_status = None
                while not max_wait_time or elapsed_wait_time <= max_wait_time:

                    sync_status_response = self.get_sync_run_status(
                        sync_id=sync_id)
                    sync_status = sync_status_response["sync"]["sync_status"]

                    if sync_status == "success":
                        return sync_status_response
                    else:
                        time.sleep(wait_time_between_api_calls)
                        elapsed_wait_time += wait_time_between_api_calls

                msg = "Sync run exceeded `max_wait_time`"
                raise FAIL(message=msg)

            else:
                return start_sync_response
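The wait loop above follows a generic poll-with-timeout pattern. A minimal standalone sketch of that pattern, independent of the Hightouch client (all names here are illustrative, not part of the original code):

import time

def poll_until_done(check, interval_secs: int, max_wait_secs: int):
    """Call `check()` every `interval_secs` until it returns a truthy result
    or the cumulative wait exceeds `max_wait_secs` (0 means wait forever)."""
    elapsed = 0
    while not max_wait_secs or elapsed <= max_wait_secs:
        result = check()
        if result:
            return result
        time.sleep(interval_secs)
        elapsed += interval_secs
    raise TimeoutError(f"No result after {max_wait_secs} seconds")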
Example #2
    def run(
        self,
        job_name: str = None,
        job_definition: str = None,
        job_queue: str = None,
        batch_kwargs: dict = None,
        credentials: dict = None,
    ):
        """
        Submit a job to the AWS Batch job service.

        Args:
            - job_name (str, optional): The AWS batch job name.
            - job_definition (str, optional): The AWS batch job definition.
            - job_queue (str, optional): Name of the AWS batch job queue.
            - batch_kwargs (dict, optional): Additional keyword arguments to pass to the boto3
                `submit_job` function. See the [submit_job](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/batch.html#Batch.Client.submit_job)  # noqa
                documentation for more details.
            - credentials (dict, optional): your AWS credentials passed from an upstream
                Secret task; this Secret must be a JSON string
                with two keys: `ACCESS_KEY` and `SECRET_ACCESS_KEY` which will be
                passed directly to `boto3`.  If not provided here or in context, `boto3`
                will fall back on standard AWS rules for authentication.
        """
        if not job_name:
            raise ValueError("A job name must be provided.")

        if not job_definition:
            raise ValueError("A job definition must be provided.")

        if not job_queue:
            raise ValueError("A job queue must be provided.")

        if not batch_kwargs:
            batch_kwargs = {}

        batch_client = get_boto_client("batch",
                                       credentials=credentials,
                                       **self.boto_kwargs)

        try:
            response = batch_client.submit_job(
                jobName=job_name,
                jobQueue=job_queue,
                jobDefinition=job_definition,
                **batch_kwargs,
            )
        except Exception as e:
            self.logger.error("Failed to submit job", exc_info=True)
            raise FAIL(
                f"Failed to submit job '{job_name}' to AWS Batch.") from e

        if not response.get("jobId"):
            raise FAIL(
                f"AWS Batch submit response contains no job ID: {response}")

        return response["jobId"]
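A usage sketch for this task, mirroring the constructor arguments used in the test in Example #4 below; the flow wiring assumes Prefect 1.x, and the import path, job names, and parameters are assumptions:

from prefect import Flow
from prefect.tasks.aws import BatchSubmit  # import path assumed for Prefect 1.x

submit = BatchSubmit()

with Flow("aws-batch-example") as flow:
    # job definition/queue names are placeholders
    job_id = submit(
        job_name="nightly-etl",
        job_definition="etl-job-def",
        job_queue="default-queue",
        batch_kwargs={"parameters": {"run_date": "2021-01-01"}},
    )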
Example #3
    def run(
        self,
        bucket_name: str = None,
        blob: str = None,
        project: str = None,
        wait_seconds: int = 0,
        fail_if_not_found: bool = True,
        credentials: dict = None,
        request_timeout: Union[float, Tuple[float, float]] = 60,
    ) -> bool:
        """
        Run method for this Task. Invoked by _calling_ this Task after initialization
        within a Flow context.

        Note that some arguments are required for the task to run, and must be
        provided _either_ at initialization _or_ as arguments.

        Args:
            - bucket_name (str, optional): the bucket to check
            - blob (str, optional): object for which to search within the bucket
            - project (str, optional): default Google Cloud project to work within.
                If not provided, will be inferred from your Google Cloud credentials
            - wait_seconds (int, optional): retry until the blob is found or until
                `wait_seconds` have elapsed, whichever comes first. Defaults to 0
            - fail_if_not_found (bool, optional): will raise a FAIL signal if the
                blob is not found. Defaults to True
            - credentials (dict, optional): a JSON document containing Google Cloud credentials.
                You should provide these at runtime with an upstream Secret task.  If not
                provided, Prefect will first check `context` for `GCP_CREDENTIALS` and lastly
                will use default Google client logic.
            - request_timeout (Union[float, Tuple[float, float]], optional): the number of
                seconds the transport should wait for the server response.
                Can also be passed as a tuple (connect_timeout, read_timeout).

        Returns:
            - bool: the object exists

        Raises:
            - ValueError: if `bucket_name` or `blob` are missing
            - FAIL: if object not found and fail_if_not_found is True

        """
        if None in [bucket_name, blob]:
            raise ValueError("Missing bucket_name or blob")

        # create client
        client = get_storage_client(project=project, credentials=credentials)

        bucket = client.bucket(bucket_name)
        blob_exists = None

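        # poll with exponentially increasing sleeps (1s, 2s, 4s, ...) until the
        # blob exists or the cumulative wait exceeds `wait_seconds`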
        wait, n = 0, 1
        while wait <= wait_seconds and not blob_exists:
            sleep(n)
            wait += n
            n *= 2
            blob_exists = storage.Blob(bucket=bucket, name=blob).exists(client)
        if fail_if_not_found and not blob_exists:
            raise FAIL(message="Blob not found")
        return blob_exists
Example #4
    def test_submission_fail(self, batch_client):
        batch_client.submit_job = MagicMock(side_effect=FAIL())

        task = BatchSubmit(job_definition="job_def",
                           job_name="job_name",
                           job_queue="queue123")
        with pytest.raises(FAIL, match="Failed to submit job 'job_name'"):
            task.run()
Example #5
    def run(
        self,
        container_id: str = None,
        docker_server_url: str = "unix:///var/run/docker.sock",
        raise_on_exit_code: bool = True,
    ) -> dict:
        """
        Task run method.

        Args:
            - container_id (str, optional): The id of a container to wait on
            - docker_server_url (str, optional): URL for the Docker server. Defaults to
                `unix:///var/run/docker.sock` however other hosts such as `tcp://0.0.0.0:2375`
                can be provided
            - raise_on_exit_code (bool, optional): whether to raise a `FAIL`
                signal for a nonzero exit code; defaults to `True`

        Returns:
            - dict: a dictionary with `StatusCode` and `Error` keys

        Raises:
            - ValueError: if `container_id` is `None`
            - FAIL: if `raise_on_exit_code` is `True` and the container exits
                with a nonzero exit code
        """
        if not container_id:
            raise ValueError("A container id must be provided.")

        # 'import docker' is expensive time-wise, we should do this just-in-time to keep
        # the 'import prefect' time low
        import docker

        self.logger.debug(
            "Starting to wait on container with id {}".format(container_id)
        )
        client = docker.APIClient(base_url=docker_server_url, version="auto")

        result = client.wait(container=container_id)
        if raise_on_exit_code and (
            (result.get("Error") is not None) or result.get("StatusCode")
        ):
            try:
                logs = client.logs(container_id)
                self.logger.error(logs.decode())
            except Exception as exc:
                self.logger.exception(exc)
            raise FAIL(
                "{id} failed with exit code {code}: {msg}".format(
                    id=container_id,
                    code=result.get("StatusCode"),
                    msg=result.get("Error"),
                )
            )
        self.logger.debug(
            "Completed waiting on container with id {}".format(container_id)
        )
        return result
Example #6
    def _get_data_from_url(self, api_url: str, params: Dict) -> Dict:
        """
        Retrieve data from a Cube.js API.

        Args:
            - api_url (str): The URL of the Cube API to call.
            - params (dict): Parameters to be passed to the API call.

        Raises:
            - `prefect.engine.signals.FAIL` if the response has `status_code != 200`.
            - `prefect.engine.signals.FAIL` if the REST API takes longer than
                `max_wait_time` seconds to respond.

        Returns:
            - Cube.js REST API JSON response
        """
        session = Session()
        session.headers = {
            "Content-type": "application/json",
            "Authorization": self.api_token,
        }
        elapsed_wait_time = 0
        while not self.max_wait_time or elapsed_wait_time <= self.max_wait_time:

            with session.get(url=api_url, params=params) as response:
                if response.status_code == 200:
                    data = response.json()

                    if "error" in data.keys(
                    ) and "Continue wait" in data["error"]:
                        time.sleep(self.wait_api_call_secs)
                        elapsed_wait_time += self.wait_api_call_secs
                        continue

                    else:
                        return data

                else:
                    raise FAIL(
                        message=f"Cube.js load API failed! Error is: {response.reason}"
                    )
        msg = f"Cube.js load API took longer than {self.max_wait_time} seconds to provide a response."
        raise FAIL(message=msg)
Example #7
    def run(
        self,
        container_id: str = None,
        docker_server_url: str = "unix:///var/run/docker.sock",
        raise_on_exit_code: bool = True,
        extra_docker_kwargs: dict = None,
    ) -> dict:
        """
        Task run method.

        Args:
            - container_id (str, optional): The id of a container to wait on
            - docker_server_url (str, optional): URL for the Docker server. Defaults to
                `unix:///var/run/docker.sock` however other hosts such as `tcp://0.0.0.0:2375`
                can be provided
            - raise_on_exit_code (bool, optional): whether to raise a `FAIL`
                signal for a nonzero exit code; defaults to `True`
            - extra_docker_kwargs (dict, optional): Extra keyword arguments to pass through to the
                Docker call (cf. method `wait`). See
                https://docker-py.readthedocs.io/en/stable/api.html for more details

        Returns:
            - dict: a dictionary with `StatusCode` and `Error` keys

        Raises:
            - ValueError: if `container_id` is `None`
            - FAIL: if `raise_on_exit_code` is `True` and the container exits
                with a nonzero exit code
        """
        if not container_id:
            raise ValueError("A container id must be provided.")

        # 'import docker' is expensive time-wise, we should do this just-in-time to keep
        # the 'import prefect' time low
        import docker

        self.logger.debug(f"Waiting on container {container_id}")
        client = docker.APIClient(base_url=docker_server_url, version="auto")

        result = client.wait(container=container_id, **(extra_docker_kwargs or {}))
        if raise_on_exit_code and (
            result.get("Error") is not None or result.get("StatusCode")
        ):
            try:
                logs = client.logs(container_id)
                self.logger.error(logs.decode())
            except Exception as exc:
                self.logger.exception(exc)
            raise FAIL("{id} failed with exit code {code}: {msg}".format(
                id=container_id,
                code=result.get("StatusCode"),
                msg=result.get("Error"),
            ))
        self.logger.debug(f"Container {container_id} has finished")

        return result
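For reference, the same docker-py calls used above can be exercised outside Prefect; a minimal sketch, where the image, command, and `timeout` value are placeholders:

import docker

client = docker.APIClient(base_url="unix:///var/run/docker.sock", version="auto")
container = client.create_container(image="alpine", command="echo hello")
client.start(container=container["Id"])

# `timeout` is one example of a kwarg that could be forwarded
# through `extra_docker_kwargs`
result = client.wait(container=container["Id"], timeout=60)
print(result)  # e.g. {"StatusCode": 0, "Error": None}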
Example #8
    def get_sync_run_status(self, sync_id: int) -> Dict:
        """
        Return the status of a sync run.
        The status is obtained by calling the
        [Get sync run status API](https://hightouch.io/docs/syncs/api/#get-the-status-of-a-sync-run).

        Args:
            - sync_id (int): The sync identifier.

        Raises:
            - `prefect.engine.signals.FAIL` if the response status code is not 200.

        Returns:
            - The JSON response containing information about the status of
                the sync run.
        """
        url = f"{self.__HIGHTOUCH_GET_SYNC_RUN_STATUS}/{sync_id}"
        with self.session.get(url) as response:
            if response.status_code != 200:
                msg = f"Error while retrieving sync run status. Error is: {response.reason}."
                raise FAIL(message=msg)

            return response.json()
Example #9
    def run(
        self,
        api_key: str = None,
        api_key_env_var: str = None,
        mql_server_url: str = None,
        mql_server_url_env_var: str = None,
        model_key_id: int = None,
        materialization_name: str = None,
        start_time: str = None,
        end_time: str = None,
        output_table: str = None,
        force: bool = False,
        wait_for_creation: bool = True,
    ):
        """
        Task run method to create a materialization against a Transform metrics
        layer deployment.
        All parameters can be provided either during task initialization or directly
        in this `run` method.

        Args:
            - api_key (str, optional): Transform API Key to be used to
                connect to Transform MQL Server.
            - api_key_env_var (str, optional): The name of the environment variable
                that contains the API Key to be used to connect to Transform MQL Server.
            - mql_server_url (str, optional): The URL of the Transform MQL Server
                from which to create the materialization.
            - mql_server_url_env_var (str, optional): The name of the environment variable
                that contains the URL of the Transform MQL Server from which to
                create the materialization.
            - model_key_id (int, optional): The unique identifier of the Transform model
                against which the transformation will be created.
            - materialization_name (str, optional): The name of the Transform materialization
                to create.
            - start_time (str, optional): The ISO start time of the materialization.
            - end_time (str, optional): The ISO end time of the materialization.
            - output_table (str, optional): The name of the database table, in the form of
                `schema_name.table_name`, where the materialization will be created.
            - force (bool, optional): Whether to force the materialization creation
                or not. Defaults to `False`.
            - wait_for_creation (bool, optional): Whether to wait for the materialization creation
                or not. Defaults to `True`.

        Raises:
            - `ValueError` if both `api_key` and `api_key_env_var` are missing.
            - `ValueError` if both `mql_server_url` and `mql_server_url_env_var` are missing.
            - `ValueError` if `materialization_name` is missing.
            - `prefect.engine.signals.FAIL` if the connection with the Transform server cannot
                be established.
            - `prefect.engine.signals.FAIL` if the materialization creation process fails.

        Returns:
            - An `MqlQueryStatusResp` object if `wait_for_creation` is `False`.
            - An `MqlMaterializeResp` object if `wait_for_creation` is `True`.

        """
        # Raise error if both api_key and api_key_env_var are missing
        if not (api_key or api_key_env_var):
            msg = "Both `api_key` and `api_key_env_var` are missing."
            raise ValueError(msg)

        # Raise error if api_key is missing and env var is not found
        if not api_key and api_key_env_var not in os.environ:
            msg = "`api_key` is missing and `api_key_env_var` not found in env vars."
            raise ValueError(msg)

        mql_api_key = api_key or os.environ[api_key_env_var]

        # Raise error if both mql_server_url and mql_server_url_env_var are missing
        if not (mql_server_url or mql_server_url_env_var):
            msg = "Both `mql_server_url` and `mql_server_url_env_var` are missing."
            raise ValueError(msg)

        # Raise error if mql_server_url is missing and env var is not found
        if not mql_server_url and mql_server_url_env_var not in os.environ:
            msg = "`mql_server_url` is missing and `mql_server_url_env_var` not found in env vars."
            raise ValueError(msg)

        mql_url = mql_server_url or os.environ[mql_server_url_env_var]

        if not materialization_name:
            msg = "`materialization_name` is missing."
            raise ValueError(msg)

        use_async = not wait_for_creation

        try:
            mql_client = MQLClient(api_key=mql_api_key,
                                   mql_server_url=mql_url,
                                   use_async=use_async)
        except (AuthException, URLException) as e:
            msg = f"Cannot connect to Transform server! Error is: {e.msg}"
            raise FAIL(message=msg)

        response = None
        if use_async:
            response = mql_client.create_materialization(
                materialization_name=materialization_name,
                start_time=start_time,
                end_time=end_time,
                model_key_id=model_key_id,
                output_table=output_table,
                force=force,
            )
            if response.is_failed:
                msg = f"Transform materialization async creation failed! Error is: {response.error}"
                raise FAIL(message=msg)
        else:
            try:
                response = mql_client.materialize(
                    materialization_name=materialization_name,
                    start_time=start_time,
                    end_time=end_time,
                    model_key_id=model_key_id,
                    output_table=output_table,
                    force=force,
                )
            except QueryRuntimeException as e:
                msg = (
                    f"Transform materialization sync creation failed! Error is: {e.msg}"
                )
                raise FAIL(message=msg)

        return response
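A usage sketch for the run method above; `task` stands for a hypothetical instance of the owning task class (not named in this excerpt), and the env var and materialization names are placeholders:

# `task`: hypothetical instance of the class that owns run();
# TRANSFORM_API_KEY and TRANSFORM_MQL_URL are assumed to be set in the environment
response = task.run(
    api_key_env_var="TRANSFORM_API_KEY",
    mql_server_url_env_var="TRANSFORM_MQL_URL",
    materialization_name="nightly_rollup",  # placeholder name
    wait_for_creation=True,
)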
Example #10
    def run(
        self,
        uri: str = None,
        dataset_id: str = None,
        table: str = None,
        project: str = None,
        schema: List[bigquery.SchemaField] = None,
        location: str = "US",
        credentials: dict = None,
        **kwargs,
    ):
        """
        Run method for this Task.  Invoked by _calling_ this Task within a Flow context, after
        initialization.

        Args:
            - uri (str, optional): GCS path to load data from
            - dataset_id (str, optional): the id of a destination dataset to write the
                records to; if not provided here, will default to the one provided at initialization
            - table (str, optional): the name of a destination table to write the
                records to; if not provided here, will default to the one provided at initialization
            - project (str, optional): the project to initialize the BigQuery Client with; if
                not provided, will default to the one inferred from your credentials
            - schema (List[bigquery.SchemaField], optional): the schema to use when creating
                the table
            - location (str, optional): location of the dataset that will be written to;
                defaults to "US"
            - credentials (dict, optional): a JSON document containing Google Cloud
                credentials.  You should provide these at runtime with an upstream Secret task.
                If not provided, Prefect will first check `context` for `GCP_CREDENTIALS` and
                lastly will use default Google client logic.
            - **kwargs (optional): additional kwargs to pass to the `bigquery.LoadJobConfig`;
                see the documentation here:
                https://googleapis.github.io/google-cloud-python/latest/bigquery/generated/google.cloud.bigquery.client.Client.html

        Raises:
            - ValueError: if all required arguments haven't been provided
            - FAIL: if the load job results in an error

        Returns:
            - google.cloud.bigquery.job.LoadJob: the response from `load_table_from_uri`
        """
        # check for any argument inconsistencies
        if dataset_id is None or table is None:
            raise ValueError("Both dataset_id and table must be provided.")

        # create client
        client = get_bigquery_client(project=project, credentials=credentials)

        # get table reference
        table_ref = client.dataset(dataset_id).table(table)

        # load data
        autodetect = kwargs.pop("autodetect", True)
        job_config = bigquery.LoadJobConfig(autodetect=autodetect, **kwargs)
        if schema:
            job_config.schema = schema
        load_job = None
        try:
            load_job = client.load_table_from_uri(
                uri,
                table_ref,
                location=location,
                job_config=job_config,
            )
            load_job.result()  # block until the job is finished
        except Exception as exception:
            # job errors are only available if the job object was created
            if load_job is not None and load_job.errors:
                for error in load_job.errors:
                    self.logger.error(error)
            raise FAIL(exception) from exception
        # remove unpickleable attributes
        load_job._client = None
        load_job._completion_lock = None

        return load_job
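A usage sketch; `task` stands for a hypothetical instance of the class that owns this run method, and the GCS path, dataset, table, and schema are placeholders. Extra kwargs such as `write_disposition` are forwarded to `bigquery.LoadJobConfig`:

from google.cloud import bigquery

schema = [
    bigquery.SchemaField("name", "STRING"),
    bigquery.SchemaField("age", "INTEGER"),
]

load_job = task.run(  # `task`: hypothetical instance of the class above
    uri="gs://my-bucket/people.csv",  # placeholder GCS path
    dataset_id="my_dataset",
    table="people",
    schema=schema,
    write_disposition="WRITE_TRUNCATE",  # forwarded to bigquery.LoadJobConfig
)
print(load_job.output_rows)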
Example #11
def third_task():
    from prefect.engine.signals import FAIL
    raise FAIL(message=my_logger("This is sensitive data"))
Example #12
def transform(data):
    """Multiply the input by 10"""
    raise FAIL("I am a failure")
Example #13
def trythis(x, fail=False):
    if fail:
        raise FAIL()
    return [k + 1 for k in x]
Example #14
    def run(
        self,
        airbyte_server_host: str = None,
        airbyte_server_port: int = None,
        airbyte_api_version: str = None,
        connection_id: str = None,
        poll_interval_s: int = 15,
    ) -> dict:
        """
        Task run method for triggering an Airbyte Connection.

        *It is assumed that the user will have previously configured
        a Source & Destination into a Connection.*
        e.g. MySql -> CSV

        An invocation of `run` will attempt to start a sync job for
        the specified `connection_id` representing the Connection in
        Airbyte.

        `run` will poll Airbyte Server for the Connection status and
        will only complete when the sync has completed or
        when it receives an error status code from an API call.

        Args:
            - airbyte_server_host (str, optional): Hostname of Airbyte server where connection is
                configured. Will overwrite the value provided at init if provided.
            - airbyte_server_port (int, optional): Port that the Airbyte server is listening on.
                Will overwrite the value provided at init if provided.
            - airbyte_api_version (str, optional): Version of Airbyte API to use to trigger connection
                sync. Will overwrite the value provided at init if provided.
            - connection_id (str, optional): if provided,
                will overwrite the value provided at init.
            - poll_interval_s (int, optional): this task polls the
                Airbyte API for status, if provided this value will
                override the default polling time of 15 seconds.

        Returns:
            - dict: the `connection_id`, its connection `status`, the final `job_status`,
                and the `job_created_at` and `job_updated_at` timestamps
        """
        if not connection_id:
            raise ValueError(
                "Value for parameter `connection_id` *must* be provided."
            )

        try:
            uuid.UUID(connection_id)
        except (TypeError, ValueError):
            raise ValueError(
                "Parameter `connection_id` *must* be a valid UUID, "
                "i.e. 32 hex characters, including hyphens."
            )

        # see https://airbyte-public-api-docs.s3.us-east-2.amazonaws.com
        # /rapidoc-api-docs.html#overview
        airbyte_base_url = (f"http://{airbyte_server_host}:"
                            f"{airbyte_server_port}/api/{airbyte_api_version}")

        session = requests.Session()
        self._check_health_status(session, airbyte_base_url)
        self.logger.info(
            f"Getting Airbyte Connection {connection_id}, poll interval "
            f"{poll_interval_s} seconds, airbyte_base_url {airbyte_base_url}")

        connection_status = self._get_connection_status(
            session, airbyte_base_url, connection_id)
        if connection_status == self.CONNECTION_STATUS_ACTIVE:
            # Trigger manual sync on the Connection ...
            job_id, job_created_at = self._trigger_manual_sync_connection(
                session, airbyte_base_url, connection_id)

            job_status = self.JOB_STATUS_PENDING

            while job_status not in [
                    self.JOB_STATUS_FAILED, self.JOB_STATUS_SUCCEEDED
            ]:
                job_status, job_created_at, job_updated_at = self._get_job_status(
                    session, airbyte_base_url, job_id)

                # pending┃running┃incomplete┃failed┃succeeded┃cancelled
                if job_status == self.JOB_STATUS_SUCCEEDED:
                    self.logger.info(f"Job {job_id} succeeded.")
                elif job_status == self.JOB_STATUS_FAILED:
                    self.logger.error(f"Job {job_id} failed.")
                else:
                    # wait for next poll interval
                    sleep(poll_interval_s)

            return {
                "connection_id": connection_id,
                "status": connection_status,
                "job_status": job_status,
                "job_created_at": job_created_at,
                "job_updated_at": job_updated_at,
            }
        elif connection_status == self.CONNECTION_STATUS_INACTIVE:
            self.logger.error(
                f"Please enable the Connection {connection_id} in Airbyte Server."
            )
            raise FAIL(
                f"Please enable the Connection {connection_id} in Airbyte Server."
            )
        elif connection_status == self.CONNECTION_STATUS_DEPRECATED:
            self.logger.error(f"Connection {connection_id} is deprecated.")
            raise FAIL(f"Connection {connection_id} is deprecated.")
Example #15
    def run(
        self,
        client: str = None,
        waiter_name: str = None,
        waiter_definition: dict = None,
        waiter_kwargs: dict = None,
        credentials: dict = None,
    ):
        """
        Task for waiting on a long-running AWS job. Uses the underlying boto3 waiter functionality.

        Args:
            - client (str): The AWS client on which to wait (e.g. 'batch', 'ec2', etc.)
            - waiter_name (str, optional): The name of the waiter to instantiate. Can be a boto-supported
                waiter or one of prefect's custom waiters. Currently, prefect offers three additional
                waiters for AWS Batch: `"JobExists"` waits for a job to be instantiated, `"JobRunning"`
                waits for a job to start running, and `"JobComplete"` waits for a job to finish. You can
                find the definitions for all prefect-defined waiters [here](https://github.com/PrefectHQ/prefect/tree/master/src/prefect/tasks/aws/waiters).  # noqa
                You may also use a custom waiter name, if you supply an accompanying waiter definition
                dict.
            - waiter_definition (dict, optional): A valid custom waiter model, as a dict. Note that if
                you supply a custom definition, it is assumed that the provided 'waiter_name' is
                contained within the waiter definition dict.
            - waiter_kwargs (dict, optional): Arguments to pass to the `waiter.wait(...)` method. Will
                depend upon the specific waiter being called.
            - credentials (dict, optional): your AWS credentials passed from an upstream
                Secret task; this Secret must be a JSON string
                with two keys: `ACCESS_KEY` and `SECRET_ACCESS_KEY` which will be
                passed directly to `boto3`.  If not provided here or in context, `boto3`
                will fall back on standard AWS rules for authentication.
        """
        if not client:
            raise ValueError("An AWS client string must be provided.")

        if not waiter_name:
            raise ValueError("A waiter name must be provided.")

        if not waiter_kwargs:
            waiter_kwargs = {}

        boto_client = get_boto_client(client,
                                      credentials=credentials,
                                      **self.boto_kwargs)
        if waiter_definition:
            # Use user-provided waiter definition
            waiter_model = WaiterModel(waiter_definition)
            waiter = create_waiter_with_client(waiter_name, waiter_model,
                                               boto_client)
        else:
            # Use either boto-provided or prefect-provided waiter
            if waiter_name in boto_client.waiter_names:
                waiter = boto_client.get_waiter(waiter_name)
            else:
                waiter = self._load_prefect_waiter(boto_client, client,
                                                   waiter_name)

        try:
            waiter.wait(**waiter_kwargs)
        except WaiterError as e:
            raise FAIL(
                f"AWS {client} waiter '{waiter_name}' failed with: {str(e)}"
            ) from e
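A usage sketch, assuming this run method belongs to Prefect's `AWSClientWait` task (name and constructor are assumptions); the job ID is a placeholder, and `WaiterConfig` follows the standard boto3 waiter options:

# wait for a previously submitted AWS Batch job to finish
waiter_task = AWSClientWait(client="batch", waiter_name="JobComplete")  # name assumed
waiter_task.run(
    waiter_kwargs={
        "jobs": ["my-batch-job-id"],  # placeholder job ID
        "WaiterConfig": {"Delay": 10, "MaxAttempts": 60},
    },
)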
Example #16
    def run(
        self,
        api_secret: str = None,
        api_secret_env_var: str = None,
        from_date: str = None,
        to_date: str = None,
        limit: int = None,
        event: Union[str, List[str]] = None,
        where: str = None,
        parse_response: bool = False,
        use_eu_server: bool = False,
        group_events: bool = False,
    ):
        """
        Task run method to request a data export from Mixpanel using the Export API.

        Args:
            - api_secret (str, optional): The API secret key to use to authenticate
                to Mixpanel. Can be provided also via env var.
            - api_secret_env_var (str, optional): The name of the env var that contains
                the API secret key to use to authenticate to Mixpanel.
                `api_secret` takes precedence over `api_secret_env_var`.
            - from_date (str, optional): Start date of the export request.
                If provided as a string, it should be in the format `YYYY-MM-DD`.
                Default value is `2011-07-10`. This date is inclusive.
            - to_date (str, optional): End date of the export request.
                If provided as a string, it should be in the format `YYYY-MM-DD`.
                Default value is `prefect.context.today`. This date is inclusive.
            - limit (int, optional): The max number of events to return.
            - event (str, list, optional): The event, or events, that you wish
                to get the data for.
            - where (str, optional): An expression to filter events by.
                More info on expression sequence structure can be found
                at https://developer.mixpanel.com/reference/segmentation-expressions.
            - parse_response (bool, optional): Whether to parse the response into a JSON object.
                Default value is `False`.
            - use_eu_server (bool, optional): Whether to use the Mixpanel EU server to retrieve data.
                More info at
                https://help.mixpanel.com/hc/en-us/articles/360039135652-Data-Residency-in-EU.
                Default is `False`.
            - group_events (bool, optional): Whether to group events with the same name.
                This is taken into account only if `parse_response` is `True`.

        Returns:
            - if `parse_response` is False, then returns a `str` response pulled
                from the Export API, (which is basically a JSONL string).
            - if `parse_response` is True and `group_events` is True, then returns a `dict` where
                each key contains homogeneous events.
            - if `parse_response` is True and `group_events` is False, then returns
                a `list` of JSON objects obtained by parsing the response.

        Raises:
            - `ValueError` if both `api_secret` and `api_secret_env_var` are missing.
            - `ValueError` if `api_secret` is missing and `api_secret_env_var` is not found.
            - `prefect.engine.signals.FAIL` if the Mixpanel API returns an error.

        """
        if not api_secret and not api_secret_env_var:
            raise ValueError(
                "Missing both `api_secret` and `api_secret_env_var`.")
        elif not api_secret and api_secret_env_var not in os.environ:
            raise ValueError(
                "Missing `api_secret` and `api_secret_env_var` not found.")

        secret = None
        if api_secret:
            self.logger.debug("Got secret from `api_secret`")
            secret = api_secret
        else:
            self.logger.debug(
                "Got secret from env var passed from `api_secret_env_var`")
            secret = os.environ[api_secret_env_var]

        params = {"from_date": from_date, "to_date": to_date}

        if limit:
            params["limit"] = limit

        if event:
            params["event"] = json.dumps(
                [event] if isinstance(event, str) else event)

        if where:
            params["where"] = where

        url = "https://{server}.mixpanel.com/api/2.0/export".format(
            server="data-eu" if use_eu_server else "data")

        response = requests.get(
            url=url,
            auth=HTTPBasicAuth(secret, ""),
            headers={"Accept": "application/json"},
            params=params,
        )

        if response.status_code != 200:
            msg = f"""
            Mixpanel export API error.
            Status code: {response.status_code}
            Reason: {response.reason}
            Text: {response.text}
            """
            raise FAIL(message=msg)

        events = response.text

        if not events:
            return None
        elif parse_response:
            received_events = [
                json.loads(event) for event in events.splitlines()
            ]
            if group_events:
                grouped_events = defaultdict(list)
                for received_event in received_events:
                    grouped_events[received_event["event"]].append(
                        received_event["properties"])
                return dict(grouped_events)
            return received_events
        else:
            return events
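A usage sketch; `task` stands for a hypothetical instance of the class that owns this run method, and the event names and dates are placeholders:

# requires MIXPANEL_API_SECRET to be set in the environment
events_by_name = task.run(
    api_secret_env_var="MIXPANEL_API_SECRET",
    from_date="2021-08-01",
    to_date="2021-08-02",
    event=["Page View", "Sign Up"],  # placeholder event names
    parse_response=True,
    group_events=True,
)
# e.g. {"Page View": [{...properties...}], "Sign Up": [{...properties...}]}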
Example #17
    def run(
        self,
        subdomain: str = None,
        url: str = None,
        api_secret: str = None,
        api_secret_env_var: str = "CUBEJS_API_SECRET",
        query: Union[Dict, List[Dict]] = None,
        security_context: Union[str, Dict] = None,
        wait_time_between_api_calls: int = 10,
        max_wait_time: int = None,
    ):
        """
        Task run method to perform a query using Cube.js load API.

        Args:
            - subdomain (str, optional): The subdomain to use to get the data.
                If provided, `subdomain` takes precedence over `url`.
                This is likely to be useful to Cube Cloud users.
            - url (str, optional): The URL to use to get the data.
                This is likely to be useful to users of self-hosted Cube.js.
            - api_secret (str, optional): The API secret used to generate an
                API token for authentication.
                If provided, it takes precedence over `api_secret_env_var`.
            - api_secret_env_var (str, optional): The name of the env var that contains
                the API secret to use to generate an API token for authentication.
                Defaults to `CUBEJS_API_SECRET`.
            - query (dict, list, optional): `dict` or `list` representing
                valid Cube.js queries.
                If you pass multiple queries, then be aware of Cube.js Data Blending.
                More info at https://cube.dev/docs/rest-api#api-reference-v-1-load
                and at https://cube.dev/docs/schema/advanced/data-blending.
                Query format can be found at: https://cube.dev/docs/query-format.
            - security_context (str, dict, optional): The security context to use
                during authentication.
                If the security context does not contain an expiration period,
                then a 7-day expiration period is added automatically.
                More info at https://cube.dev/docs/security/context.
            - wait_time_between_api_calls (int, optional): The number of seconds to
                wait between API calls. Defaults to 10.
            - max_wait_time (int, optional): The number of seconds to wait for the
                Cube.js load API to return a response.

        Raises:
            - ValueError if both `subdomain` and `url` are missing.
            - ValueError if `api_secret` is missing and `api_secret_env_var` cannot be found.
            - ValueError if `query` is missing.
            - `prefect.engine.signals.FAIL` if the Cube.js load API fails.
            - `prefect.engine.signals.FAIL` if the Cube.js load API takes more than
                `max_wait_time` seconds to respond.

        Returns:
            - The Cube.js JSON response.

        """

        if not subdomain and not url:
            raise ValueError("Missing both `subdomain` and `url`.")

        if not api_secret and api_secret_env_var not in os.environ:
            raise ValueError(
                "Missing `api_secret` and `api_secret_env_var` not found.")

        if not query:
            raise ValueError("Missing `query`.")

        cube_base_url = self.__CUBEJS_CLOUD_BASE_URL
        if subdomain:
            cube_base_url = f"{cube_base_url.format(subdomain=subdomain)}/cubejs-api"
        else:
            cube_base_url = url
        query_api_url = f"{cube_base_url}/v1/load"

        self.logger.debug(f"Query URL: {query_api_url}")

        secret = api_secret if api_secret else os.environ[api_secret_env_var]

        if security_context:

            extended_context = security_context
            if "exp" not in security_context and "expiresIn" not in security_context:
                extended_context["expiresIn"] = "7d"
            api_token = jwt.encode(payload=extended_context,
                                   key=secret,
                                   algorithm="HS256")

            self.logger.debug("JWT token generated with security context.")

        else:
            api_token = jwt.encode(payload={}, key=secret)

        session = Session()
        session.headers = {
            "Content-type": "application/json",
            "Authorization": api_token,
        }

        params = {"query": json.dumps(query)}

        wait_api_call_secs = (wait_time_between_api_calls
                              if wait_time_between_api_calls > 0 else 10)
        elapsed_wait_time = 0
        while not max_wait_time or elapsed_wait_time <= max_wait_time:

            with session.get(url=query_api_url, params=params) as response:
                self.logger.debug(f"URL is: {response.url}")

                if response.status_code == 200:
                    data = response.json()

                    if "error" in data.keys(
                    ) and "Continue wait" in data["error"]:
                        msg = (
                            "Cube.js load API still running."
                            "Waiting {wait_api_call_secs} seconds before retrying"
                        )
                        self.logger.info(msg)
                        time.sleep(wait_api_call_secs)
                        elapsed_wait_time += wait_api_call_secs
                        continue

                    else:
                        return data

                else:
                    raise FAIL(
                        message=f"Cube.js load API failed! Error is: {response.reason}"
                    )

        raise FAIL(
            message=f"Cube.js load API took longer than {max_wait_time} "
            "seconds to provide a response."
        )
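A usage sketch; `task` stands for a hypothetical instance of the class that owns this run method. The query follows the Cube.js query format linked above, with placeholder measure and dimension names:

query = {
    "measures": ["Orders.count"],
    "dimensions": ["Orders.status"],
    "timeDimensions": [
        {"dimension": "Orders.createdAt", "granularity": "day"}
    ],
}

# requires CUBEJS_API_SECRET to be set in the environment
data = task.run(
    subdomain="my-workspace",  # placeholder Cube Cloud subdomain
    query=query,
    max_wait_time=300,
)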
Example #18
    def run(
        self,
        subdomain: str = None,
        email_address: str = None,
        api_token: str = None,
        api_token_env_var: str = None,
        start_time: Union[int, datetime] = None,
        cursor: str = None,
        exclude_deleted: bool = None,
        include_entities: List[str] = None,
    ):
        """
        Task run method to perform an incremental export of tickets from Zendesk.

        Args:
            - subdomain (str, optional): The Zendesk subdomain to use to export tickets.
            - email_address (str, optional): The email address to use to authenticate on Zendesk.
            - api_token (str, optional): The API token to use to authenticate on Zendesk.
                If passed, it will take precedence over `api_token_env_var`.
            - api_token_env_var (str, optional): The name of the env var which contains the
                API token to use to authenticate on Zendesk.
            - start_time (int, datetime, optional): The start time to use to export tickets.
                Can be passed as an epoch timestamp or a `datetime` object.
            - cursor (str, optional): The cursor to use to export tickets.
                If passed, it will take precedence over `start_time`.
            - exclude_deleted (bool, optional): Whether to exclude deleted tickets or not.
                Defaults to `False`.
            - include_entities (str, list, optional): Optional list of entities to side load.
                More info at
                https://developer.zendesk.com/documentation/ticketing/using-the-zendesk-api/side_loading/.

        Raises:
            - `ValueError` if both `api_token` and `api_token_env_var` are missing.
            - `ValueError` if `api_token` is missing and `api_token_env_var` cannot be found.
            - `ValueError` if `subdomain` is missing.
            - `ValueError` if `email_address` is missing.
            - `ValueError` if both `start_time` and `cursor` are missing.
            - `prefect.engine.signals.FAIL` if the Zendesk API call fails.

        Returns:
            - A `dict` containing the list of tickets and, optionally, the included
              entities.
        """
        if not api_token and not api_token_env_var:
            raise ValueError(
                "Both `api_token` and `api_token_env_var` are missing.")

        if not api_token and api_token_env_var not in os.environ:
            raise ValueError(
                "`api_token` is missing and `api_token_env_var` not found.")

        token = None
        if api_token:
            token = api_token
        elif api_token_env_var:
            token = os.environ[api_token_env_var]

        if not subdomain:
            raise ValueError("`subdomain` is missing.")

        if not email_address:
            raise ValueError("`email_address` is missing.")

        if not start_time and not cursor:
            raise ValueError("Both `start_time` and `cursor` are missing.")

        base_url = self._ZENDESK_API_BASE_URL.format(subdomain=subdomain)
        export_url = f"{base_url}/incremental/tickets/cursor.json"

        if cursor:
            self.logger.debug("Got cursor")
            export_url = f"{export_url}?cursor={cursor}"

        elif start_time:
            self.logger.debug("Got start_time")
            start_epoch = (start_time if isinstance(start_time, int) else
                           int(start_time.timestamp()))
            export_url = f"{export_url}?start_time={start_epoch}"

        if exclude_deleted:
            export_url = f"{export_url}&exclude_deleted=true"

        if include_entities:
            if isinstance(include_entities, str):
                include_entities_str = include_entities
            elif isinstance(include_entities, list):
                include_entities_str = ",".join(list(set(include_entities)))

            export_url = f"{export_url}&include={include_entities_str}"

        session = requests.Session()
        session.auth = f"{email_address}/token", token

        end_of_stream = False

        tickets = defaultdict(list)

        while not end_of_stream:
            with session.get(export_url) as response:
                self.logger.debug(f"Export URL is: {export_url}")

                if response.status_code == 429:
                    retry_after_seconds = int(response.headers["retry-after"])
                    self.logger.warning(f"""
                        API rate limit reached!
                        Waiting for {retry_after_seconds} seconds before retrying.
                        """)
                    time.sleep(retry_after_seconds + 1)
                    continue

                elif response.status_code != 200:
                    msg = f"""
                    Zendesk API call failed!
                    Status: {response.status_code}
                    Reason: {response.reason}
                    """
                    raise FAIL(message=msg)

                content = response.json()

                tickets["tickets"].extend(content["tickets"])

                if include_entities:
                    for include_entity in list(set(include_entities)):
                        if include_entity in content.keys():
                            tickets[include_entity].extend(
                                content[include_entity])

                end_of_stream = content["end_of_stream"]
                export_url = content["after_url"]
                cursor = content["after_cursor"]

            if not end_of_stream:
                # brief pause between paginated calls; the 429 handler above
                # deals with the actual rate limit (10 requests per minute)
                time.sleep(0.1)

        return tickets
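A usage sketch; `task` stands for a hypothetical instance of the class that owns this run method, with placeholder account details:

from datetime import datetime, timedelta

# requires ZENDESK_API_TOKEN to be set in the environment
result = task.run(
    subdomain="mycompany",                # placeholder subdomain
    email_address="admin@mycompany.com",  # placeholder account
    api_token_env_var="ZENDESK_API_TOKEN",
    start_time=datetime.utcnow() - timedelta(days=7),
    exclude_deleted=True,
    include_entities=["users", "groups"],
)
tickets = result["tickets"]
users = result.get("users", [])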
Example #19
def do_something():
    raise FAIL("test")
Example #20
def i_am_bad():
    raise FAIL("Oh no!")
Example #21
    def run(
        self,
        server_uri: str = None,
        user: str = None,
        password: str = None,
        db_name: str = None,
        server_uri_env_var: str = None,
        user_env_var: str = None,
        password_env_var: str = None,
        db_name_env_var: str = None,
        cypher_query: str = None,
        return_result_as: str = __DEFAULT_RETURN_RESULT_TYPE,
    ):
        """
        Task run method to run a Cypher query against Neo4j.

        Args:
            - server_uri (str, optional): The Neo4j URI to connect to.
                More information regarding the accepted forms for `server_uri`
                can be found at https://py2neo.org/2021.1/profiles.html.
                This parameter, if provided, takes precedence over
                `server_uri_env_var`.
            - user (str, optional): The user to use to connect
                to Neo4j.
                This parameter, if provided, takes precedence over
                `user_env_var`.
            - password (str, optional): The password to use to connect
                to Neo4j.
                This parameter, if provided, takes precedence over
                `password_env_var`.
            - db_name: (str, optional): The database name where the Cypher query
                will run.
                This parameter, if provided, takes precedence over
                `db_name_env_var`.
            - server_uri_env_var (str, optional): The name of the environment variable
                that contains the Neo4j server URI to connect to.
            - user_env_var (str, optional): The name of the environment variable
                that contains the user to use to connect to Neo4j.
            - password_env_var (str, optional): The name of the environment variable
                that contains the password to use to connect to Neo4j.
            - db_name_env_var (str, optional): The name of the environment variable
                that contains the database name where the Cypher query will run.
            - cypher_query (str, optional): The Cypher query to run.
                More information about the Cypher query language, can be
                found at https://neo4j.com/developer/cypher/.
            - return_result_as (str, optional): How to return the result.
                Accepted values are `raw`, `dataframe`.
                Defaults to `raw` (which will return a `list` of `dict`).
                Applies only when the query returned result is not empty.

        Returns:
            - `None` if the query result is empty.
            - The original result if `return_result_as` is `raw`.
            - A `pandas.DataFrame` if `return_result_as` is `dataframe`.

        Raises:
            - `ValueError` if both `server_uri` and `server_uri_env_var`
                are `None`.
            - `ValueError` if `server_uri` is `None` and `server_uri_env_var`
                is not found.
            - `ValueError` if both `user` and `user_env_var`
                are `None`.
            - `ValueError` if `user` is `None` and `user_env_var` is
                not found.
            - `ValueError` if both `password` and `password_env_var`
                are `None`.
            - `ValueError` if `password` is `None` and `password_env_var`
                is not found.
            - `ValueError` if `db_name` is `None` and `db_name_env_var`
                is not found.
            - `ValueError` if `cypher_query` is `None`.
            - `ValueError` if `return_result_as` is not one of
                `raw`, `dataframe`.
            - `prefect.engine.signals.FAIL` if any error occurs while establishing
                the connection with Neo4j.
            - `prefect.engine.signals.FAIL` if any error occurs while running
                the Cypher query.
        """
        if not server_uri and not server_uri_env_var:
            msg = "Please provide either the `server_uri` or the `server_uri_env_var`."
            raise ValueError(msg)

        if not server_uri and server_uri_env_var not in os.environ.keys():
            msg = f"`{server_uri_env_var}` not found in environment variables."
            raise ValueError(msg)

        neo4j_uri = server_uri or os.environ[server_uri_env_var]

        if not user and not user_env_var:
            msg = "Please provide either the `user` or the `user_env_var`."
            raise ValueError(msg)

        if not user and user_env_var not in os.environ.keys():
            msg = f"`{user_env_var}` not found in environment variables."
            raise ValueError(msg)

        neo4j_user = user or os.environ[user_env_var]

        if not password and not password_env_var:
            msg = "Please provide either the `password` or the `password_env_var`."
            raise ValueError(msg)

        if not password and password_env_var not in os.environ.keys():
            msg = f"`{password_env_var}` not found in environment variables."
            raise ValueError(msg)

        neo4j_password = password or os.environ[password_env_var]

        neo4j_db_name = None
        if db_name:
            neo4j_db_name = db_name
        elif db_name_env_var and db_name_env_var not in os.environ.keys():
            msg = f"`{db_name_env_var}` not found in environment variables."
            raise ValueError(msg)
        elif db_name_env_var and db_name_env_var in os.environ.keys():
            neo4j_db_name = os.environ[db_name_env_var]

        if not cypher_query:
            raise ValueError("Please provide a value for `cypher_query`.")

        if return_result_as not in self.__ACCEPTED_RETURN_RESULT_TYPES:
            msg = f"Illegal value for `return_result_as`. Illegal value is: {return_result_as}."
            raise ValueError(msg)

        try:
            graph = Graph(profile=neo4j_uri,
                          name=neo4j_db_name,
                          auth=(neo4j_user, neo4j_password))
        except ConnectionUnavailable as e:
            msg = f"Error while connecting to Neo4j. Exception: {str(e)}"
            raise FAIL(message=msg)

        try:
            r = graph.run(cypher_query)
        except ClientError as e:
            msg = f"Error while running Cypher query. Exception: {str(e)}"
            raise FAIL(message=msg)

        result = r.data()

        if not result:
            return None
        elif return_result_as == "dataframe":
            return r.to_data_frame()
        else:
            return result
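A usage sketch; `task` stands for a hypothetical instance of the class that owns this run method, and the Cypher query is illustrative:

# requires NEO4J_URI, NEO4J_USER, and NEO4J_PASSWORD to be set in the environment
df = task.run(
    server_uri_env_var="NEO4J_URI",
    user_env_var="NEO4J_USER",
    password_env_var="NEO4J_PASSWORD",
    cypher_query="MATCH (p:Person) RETURN p.name AS name LIMIT 10",
    return_result_as="dataframe",
)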