def _get_execution_input_from_stack(stack_name: str) -> Union[dict, None]:
    """Look for the execution input in the outputs of this stack. If present
    generate unique names for the ExecutionInput and return the dict. If not
    present return None.

    Args:
        stack_name: name of the cloudformation stack.

    Returns: ExecutionInput as a dict or None
    """
    logger.debug(f"looking for execution input in {stack_name}")
    stack = _describe_stacks(stack_name=stack_name)
    outputs = stack.get("Stacks")[0].get("Outputs")
    if outputs:
        for output in outputs:
            if (output.get("OutputKey") ==
                    DataJobExecutionInput().DATAJOB_EXECUTION_INPUT):
                execution_inputs = json.loads(output.get("OutputValue"))

                return_value = {
                    execution_input: _generate_unique_name(execution_input)
                    for execution_input in execution_inputs
                }

                console.log("execution input found: \n" f"{return_value}")
                return return_value
    logger.debug("no execution input found.")
 def add_task(self, task_other):
     """add a task to the workflow we would like to orchestrate."""
     job_name = task_other.unique_name
     logger.debug(f"adding task with name {job_name}")
     task = StepfunctionsWorkflow._create_glue_start_job_run_step(
         job_name=job_name)
     self.chain_of_tasks.append(task)
    def handle_argument_for_execution_input(
            self, datajob_stack, argument,
            unique_name) -> Union[str, ExecutionInput]:
        """If the user provided an argument we will return it as is. If the
        argument is None, hence not provided by the user, we will add it as a
        stepfunctions.ExecutionInput.

        more info here: https://aws-step-functions-data-science-sdk.readthedocs.io/en/stable/placeholders.html

        Args:
            datajob_stack: DataJob Stack instance
            argument: an argument passed to the sagemaker task by the user.
            unique_name: the unique name used as the key for the execution input.

        Returns: the argument value or the execution input.
        """
        if argument is not None:
            logger.debug(
                f"parameter value {argument} is not None, we are just returning the value."
            )
            return argument
        logger.debug(
            "no argument provided, we will construct an execution input.")
        self.add_execution_input(unique_name)
        self.update_execution_input_for_stack(datajob_stack=datajob_stack)
        return self.execution_input[unique_name]
Example n. 4
    def _deploy_glue_job_code(self, context: DataJobContext,
                              glue_job_name: str,
                              path_to_glue_job: str) -> str:
        """deploy the code of this glue job to the deployment bucket
        (can be found in the glue context object)"""
        glue_job_dir, glue_job_file_name = GlueJob._get_glue_job_dir_and_file_name(
            path_to_glue_job=path_to_glue_job)
        logger.debug(f"deploying glue job folder {glue_job_dir}")
        aws_s3_deployment.BucketDeployment(
            self,
            f"{glue_job_name}-CodeDeploy",
            sources=[
                # we can either sync dirs or zip files.
                # To keep it easy for now we agreed to sync the full dir.
                # todo - sync only the glue job itself.
                aws_s3_deployment.Source.asset(glue_job_dir)
            ],
            destination_bucket=context.deployment_bucket,
            destination_key_prefix=glue_job_name,
        )

        return GlueJob._create_s3_url_for_job(
            context=context,
            glue_job_id=glue_job_name,
            glue_job_file_name=glue_job_file_name,
        )
def _generate_unique_name(
    name: str,
    max_chars: int = MAX_CHARS,
    unique_identifier: datetime = CURRENT_DATE,
    datetime_format: str = "%Y%m%dT%H%M%S",
):
    """Generate a unique name by adding a datetime behind the name.

    Args:
        name: the name we want to make unique
        max_chars: the maximum number of characters a unique name can have.
        datetime_format: the format of the datetime that gets appended to the name,

    Returns: the name as the unique name.
    """
    current_date_as_string = unique_identifier.strftime(datetime_format)
    total_length = len(current_date_as_string) + len(name)
    difference = max_chars - total_length
    if difference < 0:
        logger.debug(
            f"the length of the unique name is {total_length}. Max chars is {max_chars}. "
            f"Trimming the last {abs(difference) + 1} chars from the name."
        )
        # trim one extra char to make room for the "-" separator
        name = name[:difference - 1]
    unique_name = f"{name}-{current_date_as_string}"
    logger.debug(f"generated unique name is {unique_name}")
    return unique_name
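
A standalone illustration of the truncation logic above, with assumed values (not from the datajob source): when the name plus the timestamp exceeds max_chars, the tail of the name is dropped so the suffixed result still fits.

from datetime import datetime

name = "my-pretty-long-sagemaker-training-job-name"  # made-up name
max_chars = 32
stamp = datetime(2021, 1, 1, 12, 0, 0).strftime("%Y%m%dT%H%M%S")  # "20210101T120000"

difference = max_chars - (len(stamp) + len(name))
if difference < 0:
    # drop one extra character to leave room for the "-" separator
    name = name[:difference - 1]

unique_name = f"{name}-{stamp}"
print(unique_name, len(unique_name))  # the length stays within max_chars
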
Example n. 6
 def create_resources(self) -> None:
     """create each of the resources of this stack."""
     if self.resources:
         for resource in self.resources:
             logger.debug(f"creating resource: {resource.name}")
             resource.create()
     else:
         logger.debug("no resources available to create.")
     self.create_cloudformation_outputs()
 def add_execution_input(self, unique_name: str) -> None:
     logger.debug(f"adding execution input for {unique_name}")
     if unique_name in self.execution_input_schema:
         raise DataJobSagemakerException(
             f"The entry {unique_name} already exists in the execution input."
         )
     self.execution_input_schema[unique_name] = str
     self.execution_input = ExecutionInput(
         schema=self.execution_input_schema)
 def add_parallel_tasks(self, task_others):
     """add tasks in parallel (wrapped in a list) to the workflow we would like to orchestrate."""
     deploy_pipelines = Parallel(state_id=uuid.uuid4().hex)
     for one_other_task in task_others:
         task_unique_name = one_other_task.unique_name
         logger.debug(f"adding parallel task with name {task_unique_name}")
         deploy_pipelines.add_branch(
             StepfunctionsWorkflow._create_glue_start_job_run_step(
                 job_name=task_unique_name))
     self.chain_of_tasks.append(deploy_pipelines)
Example n. 9
 def get_context_parameter(self, name: str) -> str:
     """get a cdk context parameter from the cli."""
     context_parameter = self.scope.node.try_get_context(name)
     if not context_parameter:
         raise ValueError(
             f"we expect a cdk context parameter to be set on the cli with key {name}. "
             f"e.g 'cdk deploy -c stage=my-stage' where stage is the key and my-stage is the value."
         )
     logger.debug(f"context parameter {name} found.")
     return context_parameter
 def add_parallel_tasks(self,
                        parallel_tasks: Iterator[DataJobBase]) -> Parallel:
     """add tasks in parallel (wrapped in a list) to the workflow we would
     like to orchestrate."""
     parallel_pipelines = Parallel(state_id=uuid.uuid4().hex)
     for a_task in parallel_tasks:
         logger.debug(f"adding parallel task {a_task}")
         sfn_task = self.add_task(a_task)
         parallel_pipelines.add_branch(sfn_task)
     return parallel_pipelines
Example n. 11
    def create_cloudformation_outputs(self) -> None:
        """if the outputs dictionary has key value pairs, create these for the
        cloudformation stack outputs.

        Returns:  None
        """
        if self.outputs:
            for key, value in self.outputs.items():
                logger.debug(
                    f"adding key {key} and value {value} to the stack output.")
                CfnOutput(scope=self, id=key, value=value)
Example n. 12
 def _get_glue_job_dir_and_file_name(path_to_glue_job: str) -> tuple:
     """
      Split the full path into a dir and a filename.
     :param path_to_glue_job: full path to the script
     :return: full path to the dir, name of the script.
     """
     logger.debug(f"splitting path {path_to_glue_job}")
     pathlib_path_to_glue_job = Path(path_to_glue_job)
     glue_job_dir = str(pathlib_path_to_glue_job.parent)
     glue_job_file_name = pathlib_path_to_glue_job.name
     logger.debug(f"splitted into {glue_job_dir} and {glue_job_file_name}")
     return glue_job_dir, glue_job_file_name
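
For reference, a standalone example of the pathlib split used above (the path is made up):

from pathlib import Path

path_to_glue_job = "glue_jobs/process_data/main.py"  # hypothetical path
p = Path(path_to_glue_job)
print(str(p.parent))  # glue_jobs/process_data
print(p.name)         # main.py
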
Example n. 13
 def _get_unique_bucket_name(self):
     """if a stage is specified we use the unique_stack_name, if no stage is
     specified we return some random characters to have a high chance of a
     unique name."""
     if self.stage:
         logger.debug(
             "We have a stage therefore we have a unique name for our bucket."
         )
         return self.unique_stack_name
     logger.debug(
         "We don't have a stage, therefore we generate a random value for the bucketname."
     )
     return f"{self.unique_stack_name}-{uuid.uuid4().hex[:4]}"
 def _build_workflow(self):
     """create a step functions workflow from the chain_of_tasks."""
     logger.debug(
         f"creating a chain from all the different steps. \n {self.chain_of_tasks}"
     )
     workflow_definition = steps.Chain(self.chain_of_tasks)
     logger.debug(f"creating a workflow with name {self.unique_name}")
     self.client = boto3.client("stepfunctions")
     self.workflow = Workflow(
         name=self.unique_name,
         definition=workflow_definition,
         role=self.role.role_arn,
         client=self.client,
     )
Example n. 15
 def _create_s3_url_for_job(context: DataJobContext, glue_job_id: str,
                            glue_job_file_name: str) -> str:
     """
      construct the s3 path where the code of the glue job resides.
      :param context: DataJobContext that contains the name of the deployment bucket.
      :param glue_job_id: the name of the glue job, used as the key prefix on s3.
      :param glue_job_file_name: the file name of the glue job script.
      :return: s3 url to the glue job script.
     """
     s3_url_glue_job = (
         f"s3://{context.deployment_bucket_name}/{glue_job_id}/{glue_job_file_name}"
     )
     logger.debug(f"s3 url for glue job {glue_job_id}: {s3_url_glue_job}")
     return s3_url_glue_job
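
With made-up values, the url produced above looks like this: the glue job name is the key prefix and the script file name is the key.

deployment_bucket_name = "my-project-stage-deployment-bucket"  # hypothetical
glue_job_id = "my-project-stage-process-data"                  # hypothetical
glue_job_file_name = "main.py"

print(f"s3://{deployment_bucket_name}/{glue_job_id}/{glue_job_file_name}")
# s3://my-project-stage-deployment-bucket/my-project-stage-process-data/main.py
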
 def build_workflow(self):
     """create a step functions workflow from the chain_of_tasks."""
     self.chain_of_tasks = self._construct_toposorted_chain_of_tasks()
     logger.debug("creating a chain from all the different steps.")
     self.chain_of_tasks = self._integrate_notification_in_workflow(
         chain_of_tasks=self.chain_of_tasks)
     logger.debug(f"creating a workflow with name {self.unique_name}")
     sfn_client = boto3.client("stepfunctions")
     self.workflow = Workflow(
         name=self.unique_name,
         definition=self.chain_of_tasks,
         role=self.role.role_arn,
         client=sfn_client,
         **self.kwargs,
     )
Example n. 17
    def __exit__(self, exc_type, exc_value, traceback):
        """steps we have to do when exiting the context manager. We execute the
        steps when no exception is present.

        - we will create the resources we have defined.
        - we will create cloudformation stack outputs, if present.

        :param exc_type:
        :param exc_value:
        :param traceback:
        :return: None
        """
        if exc_type is None and exc_value is None and traceback is None:
            logger.debug("creating resources and synthesizing stack.")
            self.create_resources()
def _find_cloudformation_stack_name_for_sfn_workflow(sfn_arn: str) -> str:
    """Find the cloudformation stackname for a stepfunction workflow.

    Args:
        sfn_arn: the AWS ARN of stepfunctions workflow

    Returns: the name of the cloudformation stack.
    """
    logger.debug(f"looking for the stack for step functions arn {sfn_arn}")
    stack_resources = _describe_stack_resources(sfn_arn=sfn_arn)
    stepfunctions_resource = [
        element for element in stack_resources.get("StackResources")
        if element.get("PhysicalResourceId") == sfn_arn
    ]
    stack_name = stepfunctions_resource[0]["StackName"]
    logger.debug(f"found stack {stack_name}")
    return stack_name
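
A sketch of the DescribeStackResources shape this lookup relies on; the field names follow the real CloudFormation response, the values are invented.

sfn_arn = "arn:aws:states:eu-west-1:123456789012:stateMachine:my-workflow"  # made up

stack_resources = {
    "StackResources": [
        {"PhysicalResourceId": "some-other-resource", "StackName": "my-stack"},
        {"PhysicalResourceId": sfn_arn, "StackName": "my-stack"},
    ]
}

stepfunctions_resource = [
    element for element in stack_resources.get("StackResources")
    if element.get("PhysicalResourceId") == sfn_arn
]
print(stepfunctions_resource[0]["StackName"])  # my-stack
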
    def _integrate_notification_in_workflow(self,
                                            chain_of_tasks: Chain) -> Chain:
        """If a notification is defined we configure an SNS with email
        subscription to alert the user if the stepfunctions workflow failed or
        succeeded.

        :param chain_of_tasks: the workflow definition that contains all the steps we want to execute.
        :return: if notification is set, we adapt the workflow to include an SnsPublishStep on failure or on success.
        If notification is not set, we return the workflow as we received it.
        """
        if self.notification:
            logger.debug(
                "A notification is configured, "
                "implementing a notification on Error or when the stepfunctions workflow succeeds."
            )
            failure_notification = SnsPublishStep(
                "FailureNotification",
                parameters={
                    "TopicArn":
                    self.notification.get_topic_arn(),
                    "Message":
                    f"Stepfunctions workflow {self.unique_name} Failed.",
                },
            )
            pass_notification = SnsPublishStep(
                "SuccessNotification",
                parameters={
                    "TopicArn":
                    self.notification.get_topic_arn(),
                    "Message":
                    f"Stepfunctions workflow {self.unique_name} Succeeded.",
                },
            )

            catch_error = Catch(error_equals=["States.ALL"],
                                next_step=failure_notification)
            workflow_with_notification = Parallel(state_id="notification")
            workflow_with_notification.add_branch(chain_of_tasks)
            workflow_with_notification.add_catch(catch_error)
            workflow_with_notification.next(pass_notification)
            return Chain([workflow_with_notification])
        logger.debug(
            "No notification is configured, returning the workflow definition."
        )
        return chain_of_tasks
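
A simplified, hedged sketch of the wrapping pattern above. It uses plain Pass states instead of SnsPublishStep so it runs without an SNS topic; Parallel, Catch and Chain are the same classes used in the snippet, and the import path assumes the aws-step-functions-data-science-sdk.

from stepfunctions.steps import Catch, Chain, Parallel, Pass

# stand-ins for the real glue/sagemaker tasks and the SnsPublishStep notifications
chain_of_tasks = Chain([Pass(state_id="task1"), Pass(state_id="task2")])
failure_notification = Pass(state_id="FailureNotification")
success_notification = Pass(state_id="SuccessNotification")

catch_error = Catch(error_equals=["States.ALL"], next_step=failure_notification)
workflow_with_notification = Parallel(state_id="notification")
workflow_with_notification.add_branch(chain_of_tasks)
workflow_with_notification.add_catch(catch_error)
workflow_with_notification.next(success_notification)
# the Parallel state now wraps the original chain, jumps to the failure branch
# on any error and continues to the success branch when all tasks finish
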
Example n. 20
 def _deploy_local_folder(self, include_folder: str) -> None:
     """
     deploy the contents of a local folder from our project to the deployment bucket.
     :param include_folder: path to the folder
     :return: None
     """
     logger.debug(f"deploying local folder {include_folder}")
     folder_deployment = f"{self.unique_stack_name}-FolderDeployment"
     aws_s3_deployment.BucketDeployment(
         self,
         folder_deployment,
         sources=[
             aws_s3_deployment.Source.asset(
                 str(Path(self.project_root, include_folder)))
         ],
         destination_bucket=self.deployment_bucket,
         destination_key_prefix=include_folder,
     )
Example n. 21
def _execute_packaging_logic(project_root: str, config_file: str, cmd: str) -> None:
    """
    check if the config file exists in the project root and execute the command to
    create a wheel.
    :param project_root: the path to the root of your project.
    :param config_file: the config file used to package the project as a wheel (setup.py or pyproject.toml)
    :param cmd: the command to execute to create a wheel.
    :return: None
    """
    config_file_full_path = Path(project_root, config_file)
    logger.info(f"expecting {config_file_full_path}")
    if not config_file_full_path.is_file():
        raise DatajobPackageWheelError(
            f"no {config_file} file detected in project root {project_root}. "
            f"Hence we cannot create a python wheel for this project"
        )

    logger.debug(f"found a {config_file} file in {project_root}")
    call_subprocess(cmd=cmd)
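
For context, a hedged sketch of what a caller might pass in; the commands below are common choices for each config file, not necessarily the ones datajob uses.

import subprocess
from pathlib import Path

# common wheel-build commands per config file (assumptions, not datajob defaults)
wheel_commands = {
    "setup.py": "python setup.py bdist_wheel",
    "pyproject.toml": "poetry build",
}

project_root = Path.cwd()  # placeholder for the real project root
config_file = "setup.py"

if (project_root / config_file).is_file():
    subprocess.check_call(wheel_commands[config_file], shell=True, cwd=project_root)
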
Example n. 22
 def _create_glue_job(
     self,
     context: DataJobContext,
     glue_job_name: str,
     s3_url_glue_job: str = None,
     arguments: dict = None,
     job_type: str = "pythonshell",
     python_version: str = "3",
     glue_version: str = None,
     max_capacity: int = None,
     worker_type: str = None,
     number_of_workers: str = None,
     *args,
     **kwargs,
 ) -> None:
     """Create a glue job with the necessary configuration like,
     paths to wheel and business logic and arguments"""
     logger.debug(f"creating Glue Job {glue_job_name}")
     default_arguments = None
     if context.s3_url_wheel:
         extra_py_files = {
             # path to the wheel of this project
             "--extra-py-files": context.s3_url_wheel
         }
          # arguments can be None; guard against unpacking it
          default_arguments = {**extra_py_files, **(arguments or {})}
     glue.CfnJob(
         self,
         id=glue_job_name,
         name=glue_job_name,
         role=self.role.role_arn,
         command=glue.CfnJob.JobCommandProperty(
             name=job_type,
             python_version=python_version,
             script_location=s3_url_glue_job,
         ),
         glue_version=glue_version,
         max_capacity=max_capacity,
         default_arguments=default_arguments,
         worker_type=worker_type,
         number_of_workers=number_of_workers,
         *args,
         **kwargs,
     )
Example n. 23
 def _create_deployment_bucket(self, unique_stack_name: str) -> tuple:
     """
     use the unique stack name to create an s3 bucket for deployment purposes.
      We take an EmptyS3Bucket so that we can remove the stack, including the deployment bucket and its contents.
      If we take a regular S3 bucket, the bucket will be orphaned from the stack, leaving
      our account littered with orphaned s3 buckets.
     :param unique_stack_name: the unique stack name of the datajob stack.
     :return: s3 bucket object, name of our bucket
     """
     deployment_bucket_name = f"{unique_stack_name}-deployment-bucket"
     # todo - can we validate the bucket name?
     logger.debug(f"creating deployment bucket {deployment_bucket_name}")
     deployment_bucket = EmptyS3Bucket(
         self,
         deployment_bucket_name,
         bucket_name=deployment_bucket_name,
         removal_policy=core.RemovalPolicy.DESTROY,
     )
     return deployment_bucket, deployment_bucket_name
Example n. 24
 def get_role(self, unique_name: str, service_principal: str) -> iam.Role:
     """
     Get the default role for the datajob. We use administrator access
     as the policy for our default role.
     # todo - we probably want to refine the policies for this role
     :param unique_name: a unique name we can give to our role.
      :param service_principal: the service principal for our service,
      for example glue.amazonaws.com
     :return: iam role object.
     """
     role_name = unique_name + "-role"
     logger.debug(f"creating role {role_name}")
     return iam.Role(
         self,
         role_name,
         assumed_by=iam.ServicePrincipal(service_principal),
         managed_policies=[
             iam.ManagedPolicy.from_aws_managed_policy_name("AdministratorAccess")
         ],
     )
def find_state_machine_arn(state_machine: str) -> str:
    """lookup the state machine arn based on the state machine name."""
    workflows = Workflow.list_workflows()
    state_machine_object = [
        workflow for workflow in workflows
        if workflow.get("name") == state_machine
    ]
    if len(state_machine_object) == 1:
        logger.debug(
            f"we have found one statemachine {state_machine_object[0]}")
        return state_machine_object[0].get("stateMachineArn")
    elif len(state_machine_object) == 0:
        logger.error(f"statemachine {state_machine} not found.")
        raise LookupError("no statemachine found.")
    else:
        logger.error(
            f"more than one statemachine found with name {state_machine}.")
        raise Exception(
            "more than one statemachine found. Something strange is going on ..."
        )
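
The lookup above only reads two fields from Workflow.list_workflows(); a standalone sketch with invented entries:

state_machine = "my-project-stage-workflow"  # hypothetical workflow name

# the shape mirrors what the lookup reads: "name" and "stateMachineArn"
workflows = [
    {"name": "other-workflow",
     "stateMachineArn": "arn:aws:states:eu-west-1:123456789012:stateMachine:other-workflow"},
    {"name": state_machine,
     "stateMachineArn": "arn:aws:states:eu-west-1:123456789012:stateMachine:my-project-stage-workflow"},
]

state_machine_object = [w for w in workflows if w.get("name") == state_machine]
assert len(state_machine_object) == 1
print(state_machine_object[0].get("stateMachineArn"))
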
Example n. 26
 def _create_data_bucket(self, unique_stack_name: str) -> tuple:
     """
     use the unique stack name to create an s3 bucket for your data.
      We take an EmptyS3Bucket so that we can remove the stack, including the data bucket and its contents.
      If we take a regular S3 bucket, the bucket will be orphaned from the stack, leaving
      our account littered with orphaned s3 buckets.
     :param unique_stack_name: the unique stack name of the datajob stack.
     :return: s3 bucket object, name of our bucket
     """
     data_bucket_name = f"{unique_stack_name}"
     # todo - can we validate the bucket name?
     logger.debug(f"creating deployment bucket {data_bucket_name}")
     data_bucket = EmptyS3Bucket(
         self,
         data_bucket_name,
         bucket_name=data_bucket_name,
         # todo - we might want to refine the removal policy.
         #  Might not be wise to destroy it after we destroy the stack.
         removal_policy=core.RemovalPolicy.DESTROY,
     )
     return data_bucket, data_bucket_name
Example n. 27
 def _deploy_wheel(
     self,
     unique_stack_name: str,
     project_root: str,
     deployment_bucket: aws_s3.Bucket,
     deployment_bucket_name: str,
 ) -> str:
     """
     Create a wheel and add the .whl file to the deployment bucket.
     :param unique_stack_name: the unique stack name of the datajob stack.
     :param project_root: the absolute path to the root of a project.
     :param deployment_bucket: s3 deployment bucket object
     :param deployment_bucket_name:  s3 deployment bucket name
     :return: s3 url to the wheel we deployed onto the deployment bucket.
     """
     s3_url_wheel = None
     try:
         wheel_deployment_name = f"{unique_stack_name}-wheel"
         logger.debug(f"deploying wheel {wheel_deployment_name}")
         aws_s3_deployment.BucketDeployment(
             self,
             wheel_deployment_name,
             sources=[
                 aws_s3_deployment.Source.asset(
                     str(Path(project_root, "dist")))
             ],
             destination_bucket=deployment_bucket,
             destination_key_prefix=wheel_deployment_name,
         )
         s3_url_wheel = self._get_wheel_name(deployment_bucket_name,
                                             wheel_deployment_name,
                                             project_root)
         logger.debug(f"wheel will be located at {s3_url_wheel}")
     except DatajobContextWheelError as e:
         logger.warning("something went wrong while creating a wheel."
                        f"{e}")
         s3_url_wheel = None
     finally:
         return s3_url_wheel
Example n. 28
    def get_default_admin_role(datajob_stack: DataJobStack, unique_name: str,
                               service_principal: str) -> iam.Role:
        """Get the default role with admin rights for the datajob. We use
        administrator access as the policy for our default role.

        Args:
            datajob_stack: stack construct for this role.
            unique_name: a unique name we can give to our role.
            service_principal: the service principal for our service.

        Returns: iam role object.
        """
        role_name = unique_name + "-default-role"
        logger.debug(f"creating role {role_name}")
        return iam.Role(
            datajob_stack,
            role_name,
            assumed_by=iam.ServicePrincipal(service_principal),
            managed_policies=[
                iam.ManagedPolicy.from_aws_managed_policy_name(
                    "AdministratorAccess")
            ],
        )
def _connect(job):
    if isinstance(job, list):
        logger.debug(
            "we have a list, so these are jobs orchestrated in parallel.")
        _connect_parallel_job(job)
    elif isinstance(job, type(Ellipsis)):
        logger.debug("we have an ellipsis object, do nothing ...")
        return
    else:
        logger.debug("default action is to connect a single job.")
        _connect_single_job(job)
Example n. 30
 def get_stage(self, stage: str) -> Union[str, None]:
     """get the stage parameter and return a default if not found."""
     if stage:
         logger.debug(
             "a stage parameter is passed directly to the stack object, take this value."
         )
         return stage
     else:
         logger.debug(
             "check cdk context if there is not a stage value provided.")
         try:
             return self.get_context_parameter(DataJobStack.STAGE_NAME)
         except ValueError:
             logger.debug(
                 "no stage is found on the context. Will return None.")
             return None