def _get_debug_rule_status(self) -> SageMakerJobStatus:
    """Report the state of the debugging rules attached to the training job.

    Describes the training job named by ``self._training_job_name`` and, when
    the response carries ``DebugRuleEvaluationStatuses``, derives a
    ``DebugRulesStatus`` from it via ``DebugRulesStatus.from_describe``.

    Returns:
        SageMakerJobStatus: Completed without error (and with an empty raw
        status) when the response has no ``DebugRuleEvaluationStatuses`` key;
        not completed while the derived status is ``INPROGRESS``; otherwise
        completed, with ``has_error`` set only when the derived status is
        ``ERRORED``.
    """
    description = self._sm_client.describe_training_job(
        TrainingJobName=self._training_job_name
    )

    # No debugging configured
    if "DebugRuleEvaluationStatuses" not in description:
        return SageMakerJobStatus(
            is_completed=True, has_error=False, raw_status=""
        )

    rules_status = DebugRulesStatus.from_describe(description)

    if rules_status == DebugRulesStatus.INPROGRESS:
        self._print_debug_rule_status(description)
        return SageMakerJobStatus(is_completed=False, raw_status=rules_status)

    # Rules are no longer in progress: log the header, then hand the response
    # to the printing helper with its second positional argument set to True.
    logging.info("Rules have ended with status:\n")
    self._print_debug_rule_status(description, True)
    rules_errored = rules_status == DebugRulesStatus.ERRORED
    return SageMakerJobStatus(
        is_completed=True,
        has_error=rules_errored,
        raw_status=rules_status,
    )
def test_get_job_status(self):
    """Check how ``describe_labeling_job`` responses map to job statuses.

    Covers one in-progress status ("Starting"), the "Completed" status and a
    "Failed" status that carries a ``FailureReason``.
    """
    self.component._sm_client = MagicMock()
    describe = self.component._sm_client.describe_labeling_job

    # Each case pairs a mocked describe response with the expected status.
    cases = [
        (
            {"LabelingJobStatus": "Starting"},
            SageMakerJobStatus(is_completed=False, raw_status="Starting"),
        ),
        (
            {"LabelingJobStatus": "Completed"},
            SageMakerJobStatus(is_completed=True, raw_status="Completed"),
        ),
        (
            {"LabelingJobStatus": "Failed", "FailureReason": "lolidk"},
            SageMakerJobStatus(
                is_completed=True,
                raw_status="Failed",
                has_error=True,
                error_message="lolidk",
            ),
        ),
    ]

    for describe_response, expected in cases:
        describe.return_value = describe_response
        self.assertEqual(self.component._get_job_status(), expected)
def test_get_job_status(self):
    """Check the status derived from ``describe_simulation_application``.

    A response whose ``arn`` is ``None`` must yield a completed status with
    the "No ARN present" error; a response with an ARN must yield a completed
    status whose raw status is that ARN.
    """
    app_arn = "arn:aws:robomaker:us-west-2:111111111111:simulation-application/MyRobotApplication/1551203301792"

    self.component._rm_client = MagicMock()
    self.component._arn = "cool-arn"
    describe = self.component._rm_client.describe_simulation_application

    # Case 1: the describe response carries no ARN.
    describe.return_value = {"arn": None}
    missing_arn_status = SageMakerJobStatus(
        is_completed=True,
        has_error=True,
        error_message="No ARN present",
        raw_status=None,
    )
    self.assertEqual(self.component._get_job_status(), missing_arn_status)

    # Case 2: the describe response carries an ARN.
    describe.return_value = {"arn": app_arn}
    present_arn_status = SageMakerJobStatus(is_completed=True, raw_status=app_arn)
    self.assertEqual(self.component._get_job_status(), present_arn_status)
def _get_job_status(self) -> SageMakerJobStatus:
    """Translate the described simulation job into a ``SageMakerJobStatus``.

    Describes the simulation job identified by ``self._arn`` and maps its
    ``status`` field as follows:

    * ``Completed``: completed, no error.
    * ``Terminating`` / ``Terminated`` / ``Canceled``: completed; an error
      only when the response contains a ``failureCode``. Without one, the
      message notes the missing error code and appends ``failureReason``
      when the response has it.
    * ``Failed`` / ``RunningFailed``: completed with an error whose message
      includes ``failureReason`` and ``failureCode`` when present.
    * anything else: not completed.

    Returns:
        SageMakerJobStatus: The mapped status; ``raw_status`` is always the
        ``status`` value from the response.
    """
    job = self._rm_client.describe_simulation_job(job=self._arn)
    state = job["status"]

    if state == "Completed":
        return SageMakerJobStatus(
            is_completed=True, has_error=False, raw_status=state
        )

    if state in ("Terminating", "Terminated", "Canceled"):
        if "failureCode" in job:
            return SageMakerJobStatus(
                is_completed=True,
                has_error=True,
                error_message=f"Simulation failed with code:{job['failureCode']}",
                raw_status=state,
            )

        # No failure code: the exit is reported as a non-error completion.
        exit_parts = ["Exited without error code.\n"]
        if "failureReason" in job:
            exit_parts.append(
                f"Simulation exited with reason:{job['failureReason']}\n"
            )
        return SageMakerJobStatus(
            is_completed=True,
            has_error=False,
            error_message="".join(exit_parts),
            raw_status=state,
        )

    if state in ("Failed", "RunningFailed"):
        # The reason and error code are concatenated without a separator.
        failure_parts = [f"Simulation job is in status:{state}\n"]
        if "failureReason" in job:
            failure_parts.append(
                f"Simulation failed with reason:{job['failureReason']}"
            )
        if "failureCode" in job:
            failure_parts.append(
                f"Simulation failed with errorCode:{job['failureCode']}"
            )
        return SageMakerJobStatus(
            is_completed=True,
            has_error=True,
            error_message="".join(failure_parts),
            raw_status=state,
        )

    return SageMakerJobStatus(is_completed=False, raw_status=state)
def _get_job_status(self) -> SageMakerJobStatus:
    """Poll the simulation application and report whether it still exists.

    Describes the application identified by ``self._arn``. While the response
    still carries a non-``None`` ``arn`` the status is reported as not
    completed. A ``None`` ARN is reported as completed with raw status
    "Item deleted".

    Any exception raised along the way is also reported as a completed
    status, with the exception text as the raw status.
    NOTE(review): presumably a failing describe call means the application
    is gone; confirm against the service's behaviour.

    Returns:
        SageMakerJobStatus: The status described above; never flagged as an
        error by this method.
    """
    try:
        described = self._rm_client.describe_simulation_application(
            application=self._arn
        )
        current_arn = described["arn"]
        if current_arn is None:
            return SageMakerJobStatus(is_completed=True, raw_status="Item deleted")
        return SageMakerJobStatus(
            is_completed=False,
            raw_status=current_arn,
        )
    except Exception as err:
        return SageMakerJobStatus(is_completed=True, raw_status=str(err))
def test_do_poll_handles_exceptions(self):
    """An exception raised while polling must make ``_do`` report failure.

    The mocked ``_get_job_status`` yields two in-progress statuses and then
    raises. The test expects ``_after_job_complete`` not to run and ``_do``
    to return a falsy value.
    """
    poll_results = [
        SageMakerJobStatus(is_completed=False, raw_status="status1"),
        SageMakerJobStatus(is_completed=False, raw_status="status2"),
        Exception("A random error occurred"),
        # Guard entry after the raised exception.
        SageMakerJobStatus(is_completed=False, raw_status="don't reach"),
    ]
    self.component._get_job_status = MagicMock(side_effect=poll_results)
    self.component._after_job_complete = MagicMock()

    outcome = self.component._do(
        COMMON_INPUTS, DummySpec.OUTPUTS, DummySpec.OUTPUTS
    )

    self.component._after_job_complete.assert_not_called()
    self.assertFalse(outcome)
def _get_job_status(self) -> SageMakerJobStatus:
    """Map the training job's ``TrainingJobStatus`` to a ``SageMakerJobStatus``.

    Describes the training job named by ``self._training_job_name``. A
    "Completed" job defers to ``_get_debug_rule_status`` for the final
    answer. A "Failed" job is reported as a completed error carrying the
    response's ``FailureReason``. Every other status is reported as not
    completed.

    Returns:
        SageMakerJobStatus: The mapped status.
    """
    description = self._sm_client.describe_training_job(
        TrainingJobName=self._training_job_name
    )
    job_state = description["TrainingJobStatus"]

    if job_state == "Completed":
        # The overall result is whatever the debug rule check returns.
        return self._get_debug_rule_status()

    if job_state == "Failed":
        failure_reason = description["FailureReason"]
        return SageMakerJobStatus(
            is_completed=True,
            has_error=True,
            error_message=failure_reason,
            raw_status=job_state,
        )

    return SageMakerJobStatus(is_completed=False, raw_status=job_state)
def _get_job_status(self) -> SageMakerJobStatus:
    """Translate the described simulation job batch into a job status.

    Describes the batch identified by ``self._arn`` and maps its ``status``:

    * ``Completed``: completed, no error.
    * ``TimedOut`` / ``Canceled``: completed. Each entry of
      ``createdRequests`` is described individually, the last path segment
      of its ARN is added to ``self._sim_request_ids``, and any job with a
      ``failureCode`` is listed in the message and marks the status as an
      error.
    * ``Failed``: completed with an error whose message includes the batch's
      ``failureReason`` and ``failureCode`` when present.
    * anything else: not completed.

    Returns:
        SageMakerJobStatus: The mapped status; ``raw_status`` is always the
        batch ``status`` value.
    """
    batch = self._rm_client.describe_simulation_job_batch(batch=self._arn)
    batch_state = batch["status"]

    if batch_state == "Completed":
        return SageMakerJobStatus(
            is_completed=True, has_error=False, raw_status=batch_state
        )

    if batch_state in ("TimedOut", "Canceled"):
        message_parts = ["Simulation jobs are completed\n"]
        any_job_failed = False
        for request in batch["createdRequests"]:
            request_arn = request["arn"]
            # The request id is the final "/"-separated segment of the ARN.
            self._sim_request_ids.add(request_arn.split("/")[-1])
            job = self._rm_client.describe_simulation_job(job=request_arn)
            if "failureCode" in job:
                message_parts.append(
                    f"Simulation job: {job['arn']} failed with errorCode:{job['failureCode']}\n"
                )
                any_job_failed = True
        return SageMakerJobStatus(
            is_completed=True,
            has_error=any_job_failed,
            error_message="".join(message_parts),
            raw_status=batch_state,
        )

    if batch_state == "Failed":
        # The reason and error code are concatenated without a separator.
        failure_parts = [f"Simulation batch job is in status:{batch_state}\n"]
        if "failureReason" in batch:
            failure_parts.append(
                f"Simulation failed with reason:{batch['failureReason']}"
            )
        if "failureCode" in batch:
            failure_parts.append(
                f"Simulation failed with errorCode:{batch['failureCode']}"
            )
        return SageMakerJobStatus(
            is_completed=True,
            has_error=True,
            error_message="".join(failure_parts),
            raw_status=batch_state,
        )

    return SageMakerJobStatus(is_completed=False, raw_status=batch_state)
def test_do_polls_for_status(self):
    """``_do`` must keep polling until a completed status is returned.

    The mocked ``_get_job_status`` yields two in-progress statuses followed
    by a completed one. The test expects ``_after_job_complete`` to have
    been called and ``_do`` to return a truthy value.
    """
    poll_results = [
        SageMakerJobStatus(is_completed=False, raw_status="status1"),
        SageMakerJobStatus(is_completed=False, raw_status="status2"),
        SageMakerJobStatus(is_completed=True, raw_status="status3"),
        # Guard entry after the first completed status.
        SageMakerJobStatus(is_completed=True, raw_status="don't reach"),
    ]
    self.component._get_job_status = MagicMock(side_effect=poll_results)
    self.component._after_job_complete = MagicMock()
    self.component._write_all_outputs = MagicMock()

    outcome = self.component._do(
        COMMON_INPUTS, DummySpec.OUTPUTS, DummySpec.OUTPUTS
    )

    self.component._after_job_complete.assert_called()
    self.assertTrue(outcome)
def test_get_job_status(self):
    """Check how ``describe_simulation_job`` responses map to job statuses.

    Covers two in-progress statuses ("Starting", "Downloading"), the
    "Completed" status and a "Failed" status carrying both a failure code
    and a failure reason.
    """
    self.component._rm_client = MagicMock()
    describe = self.component._rm_client.describe_simulation_job

    # Each case pairs a mocked describe response with the expected status.
    cases = [
        (
            {"status": "Starting"},
            SageMakerJobStatus(is_completed=False, raw_status="Starting"),
        ),
        (
            {"status": "Downloading"},
            SageMakerJobStatus(is_completed=False, raw_status="Downloading"),
        ),
        (
            {"status": "Completed"},
            SageMakerJobStatus(is_completed=True, raw_status="Completed"),
        ),
        (
            {
                "status": "Failed",
                "failureCode": "InternalServiceError",
                "failureReason": "Big Reason",
            },
            SageMakerJobStatus(
                is_completed=True,
                raw_status="Failed",
                has_error=True,
                error_message="Simulation job is in status:Failed\nSimulation failed with reason:Big ReasonSimulation failed with errorCode:InternalServiceError",
            ),
        ),
    ]

    for describe_response, expected in cases:
        describe.return_value = describe_response
        self.assertEqual(self.component._get_job_status(), expected)
def _get_job_status(self) -> SageMakerJobStatus:
    """Map the endpoint's ``EndpointStatus`` to a ``SageMakerJobStatus``.

    Describes the endpoint named by ``self._endpoint_name``. "InService" is
    reported as completed without error. "Failed" is reported as a completed
    error carrying the response's ``FailureReason``. Every other status is
    reported as not completed.

    Returns:
        SageMakerJobStatus: The mapped status.
    """
    # Wait for endpoint creation to complete
    description = self._sm_client.describe_endpoint(
        EndpointName=self._endpoint_name
    )
    endpoint_state = description["EndpointStatus"]

    if endpoint_state == "InService":
        return SageMakerJobStatus(
            is_completed=True, has_error=False, raw_status=endpoint_state
        )

    if endpoint_state == "Failed":
        failure_reason = description["FailureReason"]
        return SageMakerJobStatus(
            is_completed=True,
            has_error=True,
            error_message=failure_reason,
            raw_status=endpoint_state,
        )

    return SageMakerJobStatus(is_completed=False, raw_status=endpoint_state)
def _get_job_status(self) -> SageMakerJobStatus:
    """Map the tuning job's status to a ``SageMakerJobStatus``.

    Describes the hyperparameter tuning job named by
    ``self._tuning_job_name`` and inspects ``HyperParameterTuningJobStatus``.
    "Completed" is reported as completed without error. "Failed" is reported
    as a completed error carrying the response's ``FailureReason``. Every
    other status is reported as not completed.

    Returns:
        SageMakerJobStatus: The mapped status.
    """
    description = self._sm_client.describe_hyper_parameter_tuning_job(
        HyperParameterTuningJobName=self._tuning_job_name
    )
    tuning_state = description["HyperParameterTuningJobStatus"]

    if tuning_state == "Completed":
        return SageMakerJobStatus(
            is_completed=True, has_error=False, raw_status=tuning_state
        )

    if tuning_state == "Failed":
        failure_reason = description["FailureReason"]
        return SageMakerJobStatus(
            is_completed=True,
            has_error=True,
            error_message=failure_reason,
            raw_status=tuning_state,
        )

    return SageMakerJobStatus(is_completed=False, raw_status=tuning_state)
def test_get_job_status(self):
    """Check how ``describe_endpoint`` responses map to job statuses.

    Covers two in-progress statuses ("Creating", "Updating"), the
    "InService" status and a "Failed" status carrying a ``FailureReason``.
    """
    self.component._sm_client = mock_client = MagicMock()

    # Each case pairs a mocked describe response with the expected status.
    cases = [
        (
            {"EndpointStatus": "Creating"},
            SageMakerJobStatus(is_completed=False, raw_status="Creating"),
        ),
        (
            {"EndpointStatus": "Updating"},
            SageMakerJobStatus(is_completed=False, raw_status="Updating"),
        ),
        (
            {"EndpointStatus": "InService"},
            SageMakerJobStatus(is_completed=True, raw_status="InService"),
        ),
        (
            {"EndpointStatus": "Failed", "FailureReason": "lolidk"},
            SageMakerJobStatus(
                is_completed=True,
                raw_status="Failed",
                has_error=True,
                error_message="lolidk",
            ),
        ),
    ]

    for describe_response, expected in cases:
        # mock_client is the same object as self.component._sm_client.
        mock_client.describe_endpoint.return_value = describe_response
        self.assertEqual(self.component._get_job_status(), expected)
def test_do_polls_for_status_catches_errors(self, mock_logging):
    """A completed status flagged as an error must make ``_do`` fail.

    The mocked ``_get_job_status`` yields two in-progress statuses and then
    a completed status with ``has_error=True``. The test expects the error
    message to be passed to ``mock_logging.error``, ``_after_job_complete``
    not to run, and ``_do`` to return a falsy value.

    Args:
        mock_logging: Mock standing in for the logging module; supplied by
            a patch outside this method.
    """
    poll_results = [
        SageMakerJobStatus(is_completed=False, raw_status="status1"),
        SageMakerJobStatus(is_completed=False, raw_status="status2"),
        SageMakerJobStatus(
            is_completed=True,
            raw_status="status3",
            has_error=True,
            error_message="abc123",
        ),
    ]
    self.component._get_job_status = MagicMock(side_effect=poll_results)
    self.component._after_job_complete = MagicMock()
    self.component._write_all_outputs = MagicMock()

    outcome = self.component._do(
        COMMON_INPUTS, DummySpec.OUTPUTS, DummySpec.OUTPUTS
    )

    mock_logging.error.assert_any_call("abc123")
    self.component._after_job_complete.assert_not_called()
    self.assertFalse(outcome)
def _get_job_status(self) -> SageMakerJobStatus:
    """Report whether the simulation application can be described by ARN.

    Describes the application identified by ``self._arn``. The result is
    always a completed status:

    * a non-``None`` ``arn`` in the response: no error, raw status is that
      ARN;
    * a ``None`` ``arn``: error with message "No ARN present";
    * any exception raised along the way: error whose message and raw
      status are both the exception text.

    Returns:
        SageMakerJobStatus: The completed status described above.
    """
    try:
        described = self._rm_client.describe_simulation_application(
            application=self._arn
        )
        app_arn = described["arn"]
        if app_arn is None:
            return SageMakerJobStatus(
                is_completed=True,
                has_error=True,
                error_message="No ARN present",
                raw_status=app_arn,
            )
        return SageMakerJobStatus(is_completed=True, raw_status=app_arn)
    except Exception as err:
        return SageMakerJobStatus(
            is_completed=True,
            has_error=True,
            error_message=str(err),
            raw_status=str(err),
        )
def test_wait_for_errored_rule(self):
    """Debug rule status must stay in progress until every rule has finished.

    Three successive describe responses are served: both rules in progress,
    then rule1 errored while rule2 is still in progress, then rule1 errored
    and rule2 finished with no issues. The first two polls are expected to
    report ``INPROGRESS`` and the third a completed ``ERRORED`` status.
    """

    def describe_response(rule1_status, rule2_status):
        # Build a describe_training_job response for the two named rules.
        return {
            "DebugRuleEvaluationStatuses": [
                {
                    "RuleConfigurationName": "rule1",
                    "RuleEvaluationStatus": rule1_status,
                },
                {
                    "RuleConfigurationName": "rule2",
                    "RuleEvaluationStatus": rule2_status,
                },
            ]
        }

    self.component._sm_client = mock_client = MagicMock()
    mock_client.describe_training_job.side_effect = [
        describe_response("InProgress", "InProgress"),
        describe_response("Error", "InProgress"),
        describe_response("Error", "NoIssuesFound"),
    ]

    still_running = SageMakerJobStatus(
        is_completed=False,
        has_error=False,
        raw_status=DebugRulesStatus.INPROGRESS,
    )
    finished_with_error = SageMakerJobStatus(
        is_completed=True,
        has_error=True,
        raw_status=DebugRulesStatus.ERRORED,
    )

    self.assertEqual(self.component._get_debug_rule_status(), still_running)
    self.assertEqual(self.component._get_debug_rule_status(), still_running)
    self.assertEqual(
        self.component._get_debug_rule_status(), finished_with_error
    )
def _get_job_status(self) -> SageMakerJobStatus:
    """Return a completed status immediately, with an empty raw status.

    No client call is made here; the result is the same on every call.

    Returns:
        SageMakerJobStatus: ``is_completed=True`` with ``raw_status=""``.
    """
    completed = SageMakerJobStatus(is_completed=True, raw_status="")
    return completed
def test_get_job_status(self):
    """``_get_job_status`` must return a completed status with an empty raw status."""
    expected = SageMakerJobStatus(is_completed=True, raw_status="")
    actual = self.component._get_job_status()
    self.assertEqual(actual, expected)