def get_emr_response(self):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        _log.info('Poking step {0} on cluster {1}'.format(
            self.step_id, self.job_flow_id))
        return emr.describe_step(ClusterId=self.job_flow_id,
                                 StepId=self.step_id)
Beispiel #2
0
    def get_emr_response(self):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        self.log.info('Poking step %s on cluster %s', self.step_id,
                      self.job_flow_id)
        return emr.describe_step(ClusterId=self.job_flow_id,
                                 StepId=self.step_id)
Beispiel #3
0
    def test_create_job_flow_uses_the_emr_config_to_create_a_cluster(self):
        client = boto3.client('emr', region_name='us-east-1')

        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')
        cluster = hook.create_job_flow({'Name': 'test_cluster'})

        self.assertEqual(client.list_clusters()['Clusters'][0]['Id'],
                         cluster['JobFlowId'])
    def test_create_job_flow_uses_the_emr_config_to_create_a_cluster(self):
        client = boto3.client('emr', region_name='us-east-1')

        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')
        cluster = hook.create_job_flow({'Name': 'test_cluster'})

        self.assertEqual(client.list_clusters()['Clusters'][0]['Id'],
                         cluster['JobFlowId'])
Beispiel #5
0
 def describe_step(self, clusterid: str, stepid: str) -> dict:
     """
     Return the transform job info associated with the name
     :param clusterid: EMR Cluster ID
     :type stepid: str: StepID
     :return: A dict contains all the transform job info
     """
     emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)
     emr = emr_hook.get_conn()
     return emr.describe_step(ClusterId=clusterid, StepId=stepid)
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        logging.info('Terminating JobFlow %s', self.job_flow_id)
        response = emr.terminate_job_flows(JobFlowIds=[self.job_flow_id])

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('JobFlow termination failed: %s' % response)
        else:
            logging.info('JobFlow with id %s terminated', self.job_flow_id)
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        self.log.info('Terminating JobFlow %s', self.job_flow_id)
        response = emr.terminate_job_flows(JobFlowIds=[self.job_flow_id])

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('JobFlow termination failed: %s' % response)
        else:
            self.log.info('JobFlow with id %s terminated', self.job_flow_id)
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        self.log.info('Adding steps to %s', self.job_flow_id)
        response = emr.add_job_flow_steps(JobFlowId=self.job_flow_id, Steps=self.steps)

        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
            raise AirflowException('Adding steps failed: %s' % response)
        self.log.info('Steps %s added to JobFlow', response['StepIds'])
        return response['StepIds']
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id, emr_conn_id=self.emr_conn_id)

        logging.info('Creating JobFlow')
        response = emr.create_job_flow(self.job_flow_overrides)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('JobFlow creation failed: %s' % response)
        else:
            logging.info('JobFlow with id %s created', response['JobFlowId'])
            return response['JobFlowId']
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        self.log.info('Adding steps to %s', self.job_flow_id)
        response = emr.add_job_flow_steps(JobFlowId=self.job_flow_id, Steps=self.steps)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('Adding steps failed: %s' % response)
        else:
            self.log.info('Steps %s added to JobFlow', response['StepIds'])
            return response['StepIds']
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        logging.info("Adding steps to %s", self.job_flow_id)
        response = emr.add_job_flow_steps(JobFlowId=self.job_flow_id, Steps=self.steps)

        if not response["ResponseMetadata"]["HTTPStatusCode"] == 200:
            raise AirflowException("Adding steps failed: %s" % response)
        else:
            logging.info("Steps %s added to JobFlow", response["StepIds"])
            return response["StepIds"]
Beispiel #12
0
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id,
                      emr_conn_id=self.emr_conn_id)

        _log.info('Creating JobFlow')
        response = emr.create_job_flow(self.job_flow_overrides)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('JobFlow creation failed: %s' % response)
        else:
            _log.info('JobFlow with id %s created', response['JobFlowId'])
            return response['JobFlowId']
    def execute(self, context):
        attempt = context['ti'].try_number
        logging.info('attempt: {}'.format(attempt))
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        job_flow_id = self.job_flow_id

        if not job_flow_id:
            job_flow_id = emr.get_cluster_id_by_name(self.job_flow_name,
                                                     self.cluster_states)

        if self.do_xcom_push:
            context['ti'].xcom_push(key='job_flow_id', value=job_flow_id)

        step_name = self.step_name if attempt == 1 else "{} (attempt {})".format(
            self.step_name, attempt)

        action_on_failure = self.action_on_failure
        if attempt % 3 == 0:
            action_on_failure = 'TERMINATE_JOB_FLOW'

        spark_conf = self.get_spark_params_config(self.spark_params,
                                                  self.spark_conf)

        steps = self.generate_spark_step(step_name, self.main_class,
                                         self.app_name, spark_conf,
                                         self.application_args, self.jar_path,
                                         action_on_failure)
        logging.info("spark_params: " + str(steps))

        self.log.info('Adding steps to %s', job_flow_id)
        response = emr.add_job_flow_steps(JobFlowId=job_flow_id, Steps=steps)

        logging.info('Running Spark Job {} with JobFlow ID {}'.format(
            self.task_id, self.job_flow_id))
        while True:
            step_id = response['StepIds'][0]
            logging.info('step id - {}'.format(step_id))
            result = self.describe_step(emr, response)
            step_status = result['Step']['Status']['State']
            logging.info('step status - {}'.format(step_status))
            # step state can be 'PENDING'|'CANCEL_PENDING'|'RUNNING'|'COMPLETED'|'CANCELLED'|'FAILED'|'INTERRUPTED'
            if step_status == 'COMPLETED':
                break
            elif step_status != 'COMPLETED' and step_status != 'PENDING' and step_status != 'RUNNING':
                raise AirflowException('Spark job {} has failed'.format(
                    self.task_id))

            logging.info("Spark Job '{}' status is {}".format(
                self.task_id, step_status))
Beispiel #14
0
    def test_get_cluster_id_by_name(self):
        """
        Test that we can resolve cluster id by cluster name.
        """
        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')

        job_flow = hook.create_job_flow({'Name': 'test_cluster',
                                         'Instances': {'KeepJobFlowAliveWhenNoSteps': True}})

        job_flow_id = job_flow['JobFlowId']

        matching_cluster = hook.get_cluster_id_by_name('test_cluster', ['RUNNING', 'WAITING'])

        self.assertEqual(matching_cluster, job_flow_id)

        no_match = hook.get_cluster_id_by_name('foo', ['RUNNING', 'WAITING', 'BOOTSTRAPPING'])

        self.assertIsNone(no_match)
Beispiel #15
0
    def execute(self, context):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        job_flow_id = self.job_flow_id

        if not job_flow_id:
            job_flow_id = emr.get_cluster_id_by_name(self.job_flow_name, self.cluster_states)

        if self.do_xcom_push:
            context['ti'].xcom_push(key='job_flow_id', value=job_flow_id)

        self.log.info('Adding steps to %s', job_flow_id)
        response = emr.add_job_flow_steps(JobFlowId=job_flow_id, Steps=self.steps)

        if not response['ResponseMetadata']['HTTPStatusCode'] == 200:
            raise AirflowException('Adding steps failed: %s' % response)
        else:
            self.log.info('Steps %s added to JobFlow', response['StepIds'])
            return response['StepIds']
Beispiel #16
0
    def test_create_job_flow_extra_args(self):
        """
        Test that we can add extra arguments to the launch call.

        This is useful for when AWS add new options, such as
        "SecurityConfiguration" so that we don't have to change our code
        """
        client = boto3.client('emr', region_name='us-east-1')

        hook = EmrHook(aws_conn_id='aws_default', emr_conn_id='emr_default')
        # AmiVersion is really old and almost no one will use it anymore, but
        # it's one of the "optional" request params that moto supports - it's
        # coverage of EMR isn't 100% it turns out.
        cluster = hook.create_job_flow({'Name': 'test_cluster',
                                        'ReleaseLabel': '',
                                        'AmiVersion': '3.2'})

        cluster = client.describe_cluster(ClusterId=cluster['JobFlowId'])['Cluster']

        # The AmiVersion comes back as {Requested,Running}AmiVersion fields.
        self.assertEqual(cluster['RequestedAmiVersion'], '3.2')
Beispiel #17
0
    def execute(self, context: Dict[str, Any]) -> List[str]:
        emr_hook = EmrHook(aws_conn_id=self.aws_conn_id)

        emr = emr_hook.get_conn()

        job_flow_id = self.job_flow_id or emr_hook.get_cluster_id_by_name(
            str(self.job_flow_name), self.cluster_states
        )

        if not job_flow_id:
            raise AirflowException(f"No cluster found for name: {self.job_flow_name}")

        if self.do_xcom_push:
            context["ti"].xcom_push(key="job_flow_id", value=job_flow_id)

        self.log.info("Adding steps to %s", job_flow_id)

        # steps may arrive as a string representing a list
        # e.g. if we used XCom or a file then: steps="[{ step1 }, { step2 }]"
        steps = self.steps
        if isinstance(steps, str):
            steps = ast.literal_eval(steps)

        response = emr.add_job_flow_steps(JobFlowId=job_flow_id, Steps=steps)

        if not response["ResponseMetadata"]["HTTPStatusCode"] == 200:
            raise AirflowException("Adding steps failed: %s" % response)
        else:
            # Assumption : ONly a single step is submitted each time.
            step_ids = response["StepIds"]
            step_id = step_ids[0]
            if self.wait_for_completion:
                self.check_status(
                    job_flow_id,
                    step_id,
                    self.describe_step,
                    self.check_interval,
                )
            self.log.info("Steps %s added to JobFlow", response["StepIds"])
            return response["StepIds"]
Beispiel #18
0
    def ensure_cluster_exists(**kwargs):
        try:
            response = EmrHook().get_conn().list_clusters(
                ClusterStates=['STARTING', 'RUNNING', 'WAITING'])

            matching_clusters = list(
                filter(lambda cluster: cluster['Name'] == CLUSTER_NAME,
                       response['Clusters']))
            if (len(matching_clusters) >= 1):
                print('cluster is created already!')
                cluster_id = matching_clusters[0]['Id']
                kwargs['ti'].xcom_push(key='clusterid', value=cluster_id)
                return 'parse_id'
            else:
                print('cluster does not exist!')
                return 'cluster_creator'

        except Exception as e:
            print('Error:', e)
            return 'cluster_creator'
    def get_emr_response(self):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        self.logger.info('Poking cluster %s', self.job_flow_id)
        return emr.describe_cluster(ClusterId=self.job_flow_id)
Beispiel #20
0
def client(clientID):
    global emr
    emr = EmrHook.get_connection(clientID)
 def test_get_conn_returns_a_boto3_connection(self):
     hook = EmrHook(aws_conn_id='aws_default')
     self.assertIsNotNone(hook.get_conn().list_clusters())
    def get_emr_response(self):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        self.log.info('Poking step %s on cluster %s', self.step_id, self.job_flow_id)
        return emr.describe_step(ClusterId=self.job_flow_id, StepId=self.step_id)
    def get_emr_response(self):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        _log.info('Poking step {0} on cluster {1}'.format(self.step_id, self.job_flow_id))
        return emr.describe_step(ClusterId=self.job_flow_id, StepId=self.step_id)
    def get_emr_response(self):
        emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()

        logging.info('Poking cluster %s' % self.job_flow_id)
        return emr.describe_cluster(ClusterId=self.job_flow_id)
 def get_emr_response(self):
     emr = EmrHook(aws_conn_id=self.aws_conn_id).get_conn()
     self.log.info('Poking notebook execution %s', self.notebook_execution_id)
     return emr.describe_notebook_execution(NotebookExecutionId=self.notebook_execution_id)