Example #1
# Imports needed to run this snippet (paths as in the Amazon provider package)
from typing import TYPE_CHECKING, Sequence

from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.providers.amazon.aws.hooks.emr import EmrHook
from airflow.providers.amazon.aws.links.emr import EmrClusterLink

if TYPE_CHECKING:
    from airflow.utils.context import Context

class EmrModifyClusterOperator(BaseOperator):
    """
    An operator that modifies an existing EMR cluster.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:EmrModifyClusterOperator`

    :param cluster_id: cluster identifier
    :param step_concurrency_level: the new step concurrency level of the cluster
    :param aws_conn_id: aws connection to use
    :param do_xcom_push: if True, cluster_id is pushed to XCom with key cluster_id.
    """

    template_fields: Sequence[str] = ('cluster_id', 'step_concurrency_level')
    template_ext: Sequence[str] = ()
    ui_color = '#f9c915'
    operator_extra_links = (EmrClusterLink(), )

    def __init__(self,
                 *,
                 cluster_id: str,
                 step_concurrency_level: int,
                 aws_conn_id: str = 'aws_default',
                 **kwargs):
        if kwargs.get('xcom_push') is not None:
            raise AirflowException(
                "'xcom_push' was deprecated, use 'do_xcom_push' instead")
        super().__init__(**kwargs)
        self.aws_conn_id = aws_conn_id
        self.cluster_id = cluster_id
        self.step_concurrency_level = step_concurrency_level

    def execute(self, context: 'Context') -> int:
        emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)
        emr = emr_hook.get_conn()  # the underlying boto3 EMR client

        if self.do_xcom_push:
            context['ti'].xcom_push(key='cluster_id', value=self.cluster_id)

        # persist cluster details for the "EMR Cluster" extra link shown in the Airflow UI
        EmrClusterLink.persist(
            context=context,
            operator=self,
            region_name=emr_hook.conn_region_name,
            aws_partition=emr_hook.conn_partition,
            job_flow_id=self.cluster_id,
        )

        self.log.info('Modifying cluster %s', self.cluster_id)
        response = emr.modify_cluster(
            ClusterId=self.cluster_id,
            StepConcurrencyLevel=self.step_concurrency_level)

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'Modify cluster failed: {response}')
        else:
            self.log.info('Steps concurrency level %d',
                          response['StepConcurrencyLevel'])
            return response['StepConcurrencyLevel']
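
A minimal usage sketch for this operator; the DAG id, schedule, and cluster id below are hypothetical, not taken from the snippet above:

from datetime import datetime

from airflow import DAG

with DAG(dag_id='emr_modify_example', start_date=datetime(2022, 1, 1), schedule_interval=None) as dag:
    modify_cluster = EmrModifyClusterOperator(
        task_id='modify_cluster',
        cluster_id='j-1234567890ABCDE',  # hypothetical EMR cluster id
        step_concurrency_level=2,        # run up to two steps concurrently
    )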
Example #2
# Imports needed to run this snippet (paths as in the Amazon provider package)
from typing import TYPE_CHECKING, Sequence

from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.providers.amazon.aws.hooks.emr import EmrHook
from airflow.providers.amazon.aws.links.emr import EmrClusterLink

if TYPE_CHECKING:
    from airflow.utils.context import Context

class EmrTerminateJobFlowOperator(BaseOperator):
    """
    Operator to terminate EMR JobFlows.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:EmrTerminateJobFlowOperator`

    :param job_flow_id: id of the JobFlow to terminate. (templated)
    :param aws_conn_id: aws connection to use
    """

    template_fields: Sequence[str] = ('job_flow_id', )
    template_ext: Sequence[str] = ()
    ui_color = '#f9c915'
    operator_extra_links = (EmrClusterLink(), )

    def __init__(self,
                 *,
                 job_flow_id: str,
                 aws_conn_id: str = 'aws_default',
                 **kwargs):
        super().__init__(**kwargs)
        self.job_flow_id = job_flow_id
        self.aws_conn_id = aws_conn_id

    def execute(self, context: 'Context') -> None:
        emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)
        emr = emr_hook.get_conn()

        EmrClusterLink.persist(
            context=context,
            operator=self,
            region_name=emr_hook.conn_region_name,
            aws_partition=emr_hook.conn_partition,
            job_flow_id=self.job_flow_id,
        )

        self.log.info('Terminating JobFlow %s', self.job_flow_id)
        response = emr.terminate_job_flows(JobFlowIds=[self.job_flow_id])

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'JobFlow termination failed: {response}')
        else:
            self.log.info('JobFlow with id %s terminated', self.job_flow_id)
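
As a sketch, a terminate task whose job_flow_id comes from an upstream create task via XCom; the task ids are illustrative, and since job_flow_id is a templated field the Jinja expression is rendered at runtime:

terminate_cluster = EmrTerminateJobFlowOperator(
    task_id='terminate_cluster',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_cluster', key='return_value') }}",
)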
Example #3
# Imports needed to run this snippet (paths as in the Amazon provider package)
import ast
from typing import TYPE_CHECKING, List, Optional, Sequence, Union

from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.providers.amazon.aws.hooks.emr import EmrHook
from airflow.providers.amazon.aws.links.emr import EmrClusterLink

if TYPE_CHECKING:
    from airflow.utils.context import Context

class EmrAddStepsOperator(BaseOperator):
    """
    An operator that adds steps to an existing EMR job_flow.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:EmrAddStepsOperator`

    :param job_flow_id: id of the JobFlow to add steps to. (templated)
    :param job_flow_name: name of the JobFlow to add steps to. Use as an alternative to passing
        job_flow_id. The operator searches for the id of a JobFlow with a matching name in one of
        the states given in cluster_states; exactly one such cluster must exist, otherwise the
        task fails. (templated)
    :param cluster_states: Acceptable cluster states when searching for JobFlow id by job_flow_name.
        (templated)
    :param aws_conn_id: aws connection to use
    :param steps: boto3 style steps or reference to a steps file (must be '.json') to
        be added to the jobflow. (templated)
    :param do_xcom_push: if True, job_flow_id is pushed to XCom with key job_flow_id.
    """

    template_fields: Sequence[str] = ('job_flow_id', 'job_flow_name',
                                      'cluster_states', 'steps')
    template_ext: Sequence[str] = ('.json', )
    template_fields_renderers = {"steps": "json"}
    ui_color = '#f9c915'
    operator_extra_links = (EmrClusterLink(), )

    def __init__(
        self,
        *,
        job_flow_id: Optional[str] = None,
        job_flow_name: Optional[str] = None,
        cluster_states: Optional[List[str]] = None,
        aws_conn_id: str = 'aws_default',
        steps: Optional[Union[List[dict], str]] = None,
        **kwargs,
    ):
        # exactly one of job_flow_id and job_flow_name may be provided (exclusive or)
        if not (job_flow_id is None) ^ (job_flow_name is None):
            raise AirflowException(
                'Exactly one of job_flow_id or job_flow_name must be specified.'
            )
        super().__init__(**kwargs)
        cluster_states = cluster_states or []
        steps = steps or []
        self.aws_conn_id = aws_conn_id
        self.job_flow_id = job_flow_id
        self.job_flow_name = job_flow_name
        self.cluster_states = cluster_states
        self.steps = steps

    def execute(self, context: 'Context') -> List[str]:
        emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)

        emr = emr_hook.get_conn()

        # resolve the cluster id from its name when job_flow_id was not given
        job_flow_id = self.job_flow_id or emr_hook.get_cluster_id_by_name(
            str(self.job_flow_name), self.cluster_states)

        if not job_flow_id:
            raise AirflowException(
                f'No cluster found for name: {self.job_flow_name}')

        if self.do_xcom_push:
            context['ti'].xcom_push(key='job_flow_id', value=job_flow_id)

        EmrClusterLink.persist(
            context=context,
            operator=self,
            region_name=emr_hook.conn_region_name,
            aws_partition=emr_hook.conn_partition,
            job_flow_id=job_flow_id,
        )

        self.log.info('Adding steps to %s', job_flow_id)

        # steps may arrive as a string representing a list
        # e.g. if we used XCom or a file then: steps="[{ step1 }, { step2 }]"
        steps = self.steps
        if isinstance(steps, str):
            steps = ast.literal_eval(steps)

        response = emr.add_job_flow_steps(JobFlowId=job_flow_id, Steps=steps)

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'Adding steps failed: {response}')
        else:
            self.log.info('Steps %s added to JobFlow', response['StepIds'])
            return response['StepIds']
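
A sketch of a boto3-style step handed to this operator; the step definition, cluster name, and states are illustrative:

SPARK_PI_STEP = {
    'Name': 'calculate_pi',
    'ActionOnFailure': 'CONTINUE',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': ['spark-example', 'SparkPi', '10'],
    },
}

add_steps = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_name='my-emr-cluster',         # hypothetical cluster name
    cluster_states=['WAITING', 'RUNNING'],  # states the lookup may match
    steps=[SPARK_PI_STEP],
)

Because steps is a templated field, the same list can also arrive as a rendered string (for example from XCom); the ast.literal_eval branch in execute turns it back into a list.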
Example #4
# Imports needed to run this snippet (paths as in the Amazon provider package)
import ast
from typing import TYPE_CHECKING, Any, Dict, Optional, Sequence, Union

from airflow.exceptions import AirflowException
from airflow.models import BaseOperator
from airflow.providers.amazon.aws.hooks.emr import EmrHook
from airflow.providers.amazon.aws.links.emr import EmrClusterLink

if TYPE_CHECKING:
    from airflow.utils.context import Context

class EmrCreateJobFlowOperator(BaseOperator):
    """
    Creates an EMR JobFlow, reading the config from the EMR connection.
    A dictionary of JobFlow overrides can be passed that override
    the config from the connection.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:EmrCreateJobFlowOperator`

    :param aws_conn_id: The Airflow connection used for AWS credentials.
        If this is None or empty, the default boto3 behaviour is used. When
        running Airflow in a distributed manner with a None or empty
        aws_conn_id, the default boto3 configuration must be maintained on
        each worker node.
    :param emr_conn_id: emr connection to use for the run_job_flow request body.
        This will be overridden by the job_flow_overrides param
    :param job_flow_overrides: boto3 style arguments or reference to an arguments file
        (must be '.json') to override emr_connection extra. (templated)
    :param region_name: Region name passed to EmrHook
    """

    template_fields: Sequence[str] = ('job_flow_overrides', )
    template_ext: Sequence[str] = ('.json', )
    template_fields_renderers = {"job_flow_overrides": "json"}
    ui_color = '#f9c915'
    operator_extra_links = (EmrClusterLink(), )

    def __init__(
        self,
        *,
        aws_conn_id: str = 'aws_default',
        emr_conn_id: str = 'emr_default',
        job_flow_overrides: Optional[Union[str, Dict[str, Any]]] = None,
        region_name: Optional[str] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.aws_conn_id = aws_conn_id
        self.emr_conn_id = emr_conn_id
        if job_flow_overrides is None:
            job_flow_overrides = {}
        self.job_flow_overrides = job_flow_overrides
        self.region_name = region_name

    def execute(self, context: 'Context') -> str:
        emr = EmrHook(aws_conn_id=self.aws_conn_id,
                      emr_conn_id=self.emr_conn_id,
                      region_name=self.region_name)

        self.log.info(
            'Creating JobFlow using aws-conn-id: %s, emr-conn-id: %s',
            self.aws_conn_id, self.emr_conn_id)

        if isinstance(self.job_flow_overrides, str):
            job_flow_overrides: Dict[str, Any] = ast.literal_eval(
                self.job_flow_overrides)
            self.job_flow_overrides = job_flow_overrides
        else:
            job_flow_overrides = self.job_flow_overrides
        response = emr.create_job_flow(job_flow_overrides)

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException(f'JobFlow creation failed: {response}')
        else:
            job_flow_id = response['JobFlowId']
            self.log.info('JobFlow with id %s created', job_flow_id)
            EmrClusterLink.persist(
                context=context,
                operator=self,
                region_name=emr.conn_region_name,
                aws_partition=emr.conn_partition,
                job_flow_id=job_flow_id,
            )
            return job_flow_id
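
A sketch of a job_flow_overrides dictionary for this operator; the name, release label, instance settings, and roles are illustrative, and any argument accepted by boto3's run_job_flow may appear here:

JOB_FLOW_OVERRIDES = {
    'Name': 'PiCalc',
    'ReleaseLabel': 'emr-5.29.0',
    'Instances': {
        'InstanceGroups': [
            {
                'Name': 'Primary node',
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm5.xlarge',
                'InstanceCount': 1,
            },
        ],
        'KeepJobFlowAliveWhenNoSteps': True,
    },
    'JobFlowRole': 'EMR_EC2_DefaultRole',
    'ServiceRole': 'EMR_DefaultRole',
}

create_cluster = EmrCreateJobFlowOperator(
    task_id='create_cluster',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
)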