def execute(self, context):
    """Trigger an AWS Glue job run from Airflow and wait for it to finish.

    :param context: Airflow task context (unused here).
    :return: the run id of the completed Glue job run.
    """
    # Upload a local script to S3 first, unless it is already an S3 URI.
    if self.script_location and not self.script_location.startswith(self.s3_protocol):
        upload_hook = S3Hook(aws_conn_id=self.aws_conn_id)
        base_name = os.path.basename(self.script_location)
        # NOTE(review): 's3_artifcats_prefix' looks misspelled, but the
        # attribute is defined outside this method, so it is kept as-is.
        upload_hook.load_file(
            self.script_location,
            self.s3_bucket,
            self.s3_artifcats_prefix + base_name,
        )

    job_hook = AwsGlueJobHook(
        job_name=self.job_name,
        desc=self.job_desc,
        concurrent_run_limit=self.concurrent_run_limit,
        script_location=self.script_location,
        retry_limit=self.retry_limit,
        num_of_dpus=self.num_of_dpus,
        aws_conn_id=self.aws_conn_id,
        region_name=self.region_name,
        s3_bucket=self.s3_bucket,
        iam_role_name=self.iam_role_name,
    )

    self.log.info("Initializing AWS Glue Job: %s", self.job_name)
    run_info = job_hook.initialize_job(self.script_args)
    # Block until the run reaches a terminal state, then report it.
    run_info = job_hook.job_completion(self.job_name, run_info['JobRunId'])
    self.log.info(
        "AWS Glue Job: %s status: %s. Run Id: %s",
        self.job_name,
        run_info['JobRunState'],
        run_info['JobRunId'],
    )
    return run_info['JobRunId']
def test_initialize_job(self, mock_get_conn, mock_get_or_create_glue_job, mock_get_job_state):
    """initialize_job should start a run whose state matches the mocked job state."""
    some_data_path = "s3://glue-datasets/examples/medicare/SampleData.csv"
    some_script_arguments = {"--s3_input_data_path": some_data_path}
    some_script = "s3:/glue-examples/glue-scripts/sample_aws_glue_job.py"
    some_s3_bucket = "my-includes"

    mock_get_or_create_glue_job.Name = mock.Mock(Name='aws_test_glue_job')
    mock_get_conn.return_value.start_job_run()
    mock_job_run_state = mock_get_job_state.return_value

    glue_job_hook = AwsGlueJobHook(
        job_name='aws_test_glue_job',
        desc='This is test case job from Airflow',
        iam_role_name='my_test_role',
        script_location=some_script,
        s3_bucket=some_s3_bucket,
        region_name=self.some_aws_region,
    )
    glue_job_run = glue_job_hook.initialize_job(some_script_arguments)
    glue_job_run_state = glue_job_hook.get_job_state(
        glue_job_run['JobName'], glue_job_run['JobRunId'])
    # Fixed typo in the failure message: "but" -> "must".
    self.assertEqual(glue_job_run_state, mock_job_run_state,
                     msg='Mocks must be equal')
def poke(self, context):
    """Check the current state of the watched Glue job run.

    :param context: Airflow task context (unused here).
    :return: True when the run is in a success state, False while it is
        still in progress.
    :raises AirflowException: when the run is in an errored state.
    """
    hook = AwsGlueJobHook(aws_conn_id=self.aws_conn_id)
    self.log.info("Poking for job run status :for Glue Job %s and ID %s",
                  self.job_name, self.run_id)
    job_state = hook.get_job_state(job_name=self.job_name, run_id=self.run_id)
    if job_state in self.success_states:
        self.log.info("Exiting Job %s Run State: %s", self.run_id, job_state)
        return True
    if job_state in self.errored_states:
        # f-string replaces manual concatenation; the message is unchanged.
        raise AirflowException(
            f"Exiting Job {self.run_id} Run State: {job_state}"
        )
    return False
def test_get_iam_execution_role(self):
    """get_iam_execution_role should resolve the role created through IAM."""
    hook = AwsGlueJobHook(job_name='aws_test_glue_job',
                          s3_bucket='some_bucket',
                          iam_role_name='my_test_role')
    # Create the role the hook is configured to look up.
    trust_policy = {
        "Version": "2012-10-17",
        "Statement": {
            "Effect": "Allow",
            "Principal": {"Service": "glue.amazonaws.com"},
            "Action": "sts:AssumeRole",
        },
    }
    hook.get_client_type('iam').create_role(
        Path="/",
        RoleName='my_test_role',
        AssumeRolePolicyDocument=json.dumps(trust_policy),
    )
    iam_role = hook.get_iam_execution_role()
    self.assertIsNotNone(iam_role)
def test_get_or_create_glue_job(self, mock_get_conn, mock_get_iam_execution_role):
    """get_or_create_glue_job should return the job name reported by the API."""
    mock_get_iam_execution_role.return_value = mock.MagicMock(
        Role={'RoleName': 'my_test_role'})
    some_script = "s3:/glue-examples/glue-scripts/sample_aws_glue_job.py"
    some_s3_bucket = "my-includes"
    # Expected value comes straight from the mocked Glue API response.
    mock_glue_job = mock_get_conn.return_value.get_job()['Job']['Name']

    hook = AwsGlueJobHook(
        job_name='aws_test_glue_job',
        desc='This is test case job from Airflow',
        script_location=some_script,
        iam_role_name='my_test_role',
        s3_bucket=some_s3_bucket,
        region_name=self.some_aws_region,
    )
    glue_job = hook.get_or_create_glue_job()
    self.assertEqual(glue_job, mock_glue_job)