def test_rg_largecluster(self, rg_largecluster_input, rg_largecluster):
    """Check node-group and member-cluster counts after creation and scale-in.

    The resource is first expected to report the sizes given in
    ``rg_largecluster_input``; it is then patched to have 10 fewer node
    groups and the same checks are repeated against the reduced size.
    """
    reference, _ = rg_largecluster
    assert k8s.wait_on_condition(
        reference, "ACK.ResourceSynced", "True", wait_periods=240
    )

    # Expected sizes right after the initial creation.
    node_group_count = int(rg_largecluster_input["NUM_NODE_GROUPS"])
    replicas_per_group = int(rg_largecluster_input["REPLICAS_PER_NODE_GROUP"])
    # Each node group contributes one node plus its replicas.
    nodes_per_group = 1 + replicas_per_group

    status = k8s.get_resource(reference)["status"]
    assert status["status"] == "available"
    assert len(status["nodeGroups"]) == node_group_count
    assert len(status["memberClusters"]) == node_group_count * nodes_per_group

    # Scale in by 10 node groups, then wait for the resource to sync again.
    node_group_count -= 10
    scale_in_patch = {
        "spec": {
            "numNodeGroups": node_group_count,
            "nodeGroupConfiguration": provide_node_group_configuration(
                node_group_count
            ),
        }
    }
    k8s.patch_custom_resource(reference, scale_in_patch)
    # Required because the controller has likely not yet placed the
    # resource in modifying when the patch call returns.
    sleep(DEFAULT_WAIT_SECS)
    assert k8s.wait_on_condition(
        reference, "ACK.ResourceSynced", "True", wait_periods=240
    )

    # Expected sizes after scaling in.
    status = k8s.get_resource(reference)["status"]
    assert status["status"] == "available"
    assert len(status["nodeGroups"]) == node_group_count
    assert len(status["memberClusters"]) == node_group_count * nodes_per_group
def test_model_has_correct_arn(self, sagemaker_client, xgboost_model):
    """Compare the ARN taken from the k8s resource with the ARN looked up by model name."""
    reference, _ = xgboost_model
    resource = k8s.get_resource(reference)

    model_name = resource["spec"].get("modelName")
    assert model_name is not None

    # Resource-side value is read first, then the SageMaker-side lookup.
    arn_from_resource = self._get_resource_model_arn(resource)
    arn_from_sagemaker = self._get_sagemaker_model_arn(
        sagemaker_client, model_name
    )
    assert arn_from_resource == arn_from_sagemaker
def test_config_is_deleted(self, sagemaker_client, single_variant_config):
    """Delete the custom resource and check the endpoint-config ARN lookup returns None.

    The endpoint config name is read from the resource spec before the
    resource is deleted, because the spec is no longer retrievable afterwards.
    """
    (reference, _) = single_variant_config
    resource = k8s.get_resource(reference)
    config_name = resource["spec"].get("endpointConfigName", None)
    # Fail fast when the spec carries no name: the final lookup would
    # otherwise not be checking the config this resource refers to.
    assert config_name is not None

    # Delete the k8s resource.
    _, deleted = k8s.delete_custom_resource(reference)
    assert deleted is True

    assert (
        self._get_sagemaker_endpoint_config_arn(sagemaker_client, config_name)
        is None
    )
def test_config_has_correct_arn(self, sagemaker_client, single_variant_config):
    """Compare the resource's endpoint-config ARN with the ARN looked up by config name."""
    reference, _ = single_variant_config
    resource = k8s.get_resource(reference)

    config_name = resource["spec"].get("endpointConfigName")
    assert config_name is not None

    # Resource-side value is read first, then the SageMaker-side lookup.
    arn_from_resource = self._get_resource_endpoint_config_arn(resource)
    arn_from_sagemaker = self._get_sagemaker_endpoint_config_arn(
        sagemaker_client, config_name
    )
    assert arn_from_resource == arn_from_sagemaker
def test_model_is_deleted(self, sagemaker_client, xgboost_model):
    """Delete the custom resource and check the model ARN lookup returns None.

    The model name is read from the resource spec before the resource is
    deleted, because the spec is no longer retrievable afterwards.
    """
    (reference, _) = xgboost_model
    resource = k8s.get_resource(reference)
    model_name = resource["spec"].get("modelName", None)
    # Fail fast when the spec carries no name: the final lookup would
    # otherwise not be checking the model this resource refers to.
    assert model_name is not None

    # Delete the k8s resource.
    _, deleted = k8s.delete_custom_resource(reference)
    assert deleted is True

    assert self._get_sagemaker_model_arn(sagemaker_client, model_name) is None
def test_delete_endpoint(self, sagemaker_client, single_variant_xgboost_endpoint):
    """Delete the custom resource and check the endpoint describe helper returns None.

    The endpoint name is read from the resource spec before the resource is
    deleted, because the spec is no longer retrievable afterwards.
    """
    (reference, _, _, _) = single_variant_xgboost_endpoint
    resource = k8s.get_resource(reference)
    endpoint_name = resource["spec"].get("endpointName", None)
    # Fail fast when the spec carries no name: the final lookup would
    # otherwise not be checking the endpoint this resource refers to.
    assert endpoint_name is not None

    # Delete the k8s resource.
    _, deleted = k8s.delete_custom_resource(reference)
    assert deleted is True

    assert (
        self._describe_sagemaker_endpoint(sagemaker_client, endpoint_name)
        is None
    )
def test_model_has_correct_arn(self, sagemaker_client, xgboost_model):
    """Check that the resource's model ARN equals the ARN looked up by model name."""
    reference, _ = xgboost_model
    resource = k8s.get_resource(reference)

    model_name = resource["spec"].get("modelName")
    assert model_name is not None

    # Left operand (resource side) is evaluated before the SageMaker lookup.
    assert self._get_resource_model_arn(
        resource
    ) == self._get_sagemaker_model_arn(sagemaker_client, model_name)
def test_trainingjob_has_created_status(self, sagemaker_client, xgboost_trainingjob):
    """Check the training job's current status is in the 'created' status list."""
    reference, _ = xgboost_trainingjob
    resource = k8s.get_resource(reference)

    job_name = resource["spec"].get("trainingJobName")
    assert job_name is not None

    observed_status = self._get_sagemaker_trainingjob_status(
        sagemaker_client, job_name
    )
    acceptable_statuses = self._get_created_trainingjob_status_list()
    assert observed_status in acceptable_statuses
def test_trainingjob_has_correct_arn(self, sagemaker_client, xgboost_trainingjob):
    """Check that the resource's training-job ARN equals the ARN looked up by job name."""
    reference, _ = xgboost_trainingjob
    resource = k8s.get_resource(reference)

    job_name = resource["spec"].get("trainingJobName")
    assert job_name is not None

    # Left operand (resource side) is evaluated before the SageMaker lookup.
    assert self._get_resource_trainingjob_arn(
        resource
    ) == self._get_sagemaker_trainingjob_arn(sagemaker_client, job_name)
def test_processing_job_has_created_status(self, sagemaker_client, kmeans_processing_job):
    """Check the processing job's current status is in the 'created' status list."""
    reference, _ = kmeans_processing_job
    resource = k8s.get_resource(reference)

    job_name = resource["spec"].get("processingJobName")
    assert job_name is not None

    observed_status = self._get_sagemaker_processing_job_status(
        sagemaker_client, job_name
    )
    acceptable_statuses = self._get_created_processing_job_status_list()
    assert observed_status in acceptable_statuses
def test_processing_job_has_correct_arn(self, sagemaker_client, kmeans_processing_job):
    """Check that the resource's ARN equals the processing-job ARN looked up by job name."""
    reference, _ = kmeans_processing_job
    resource = k8s.get_resource(reference)

    job_name = resource["spec"].get("processingJobName")
    assert job_name is not None

    # Left operand (resource side) is evaluated before the SageMaker lookup.
    assert k8s.get_resource_arn(
        resource
    ) == self._get_sagemaker_processing_job_arn(sagemaker_client, job_name)
def test_processing_job_has_stopped_status(self, sagemaker_client, kmeans_processing_job):
    """Delete the custom resource and check the job status is in the 'stopped' status list."""
    reference, _ = kmeans_processing_job
    resource = k8s.get_resource(reference)

    job_name = resource["spec"].get("processingJobName")
    assert job_name is not None

    # Remove the k8s resource; the helper reports whether deletion succeeded.
    _, was_deleted = k8s.delete_custom_resource(reference)
    assert was_deleted is True

    observed_status = self._get_sagemaker_processing_job_status(
        sagemaker_client, job_name
    )
    acceptable_statuses = self._get_stopped_processing_job_status_list()
    assert observed_status in acceptable_statuses
def test_trainingjob_has_stopped_status(self, sagemaker_client, xgboost_trainingjob):
    """Delete the custom resource and check the job status is in the 'stopped' status list.

    The result of the delete call is not inspected here; the test pauses for
    a fixed five seconds before querying the status.
    """
    reference, _ = xgboost_trainingjob
    resource = k8s.get_resource(reference)

    job_name = resource["spec"].get("trainingJobName")
    assert job_name is not None

    # Remove the k8s resource.
    k8s.delete_custom_resource(reference)

    # TODO: This sleep could be replaced by a wait loop but this is sufficient for now.
    time.sleep(5)

    observed_status = self._get_sagemaker_trainingjob_status(
        sagemaker_client, job_name
    )
    acceptable_statuses = self._get_stopped_trainingjob_status_list()
    assert observed_status in acceptable_statuses
def test_trainingjob_has_stopped_status(self, sagemaker_client, xgboost_trainingjob):
    """Delete the custom resource and check the job status is in the 'stopped' status list."""
    reference, _ = xgboost_trainingjob
    resource = k8s.get_resource(reference)

    job_name = resource["spec"].get("trainingJobName")
    assert job_name is not None

    # Remove the k8s resource; the helper reports whether deletion succeeded.
    _, was_deleted = k8s.delete_custom_resource(reference)
    assert was_deleted is True

    observed_status = self._get_sagemaker_trainingjob_status(
        sagemaker_client, job_name
    )
    acceptable_statuses = self._get_stopped_trainingjob_status_list()
    assert observed_status in acceptable_statuses
def _wait_resource_endpoint_status(
    self,
    reference: k8s.CustomResourceReference,
    expected_status: str,
    wait_periods: int = 18,
):
    """Poll the resource's ``endpointStatus`` until it equals ``expected_status``.

    Each attempt sleeps 30 seconds and then re-reads the resource, so the
    first check happens only after the first sleep.

    Args:
        reference: Reference to the custom resource to poll.
        expected_status: Status value that ends the wait.
        wait_periods: Maximum number of 30-second polling attempts.

    Returns:
        The last ``endpointStatus`` observed, or ``None`` if no attempt was
        made. On timeout an error is logged and the last value is returned
        rather than raising.
    """
    resource_status = None
    for _attempt in range(wait_periods):
        time.sleep(30)
        status_section = k8s.get_resource(reference)["status"]
        assert "endpointStatus" in status_section
        resource_status = status_section["endpointStatus"]
        if resource_status == expected_status:
            # Reached the target state; skip the timeout logging below.
            return resource_status

    # Only reached when every attempt finished without a match.
    logging.error(
        f"Wait for endpoint resource status: {expected_status} timed out. Actual status: {resource_status}"
    )
    return resource_status
def test_endpoint_has_correct_arn_and_status(self, sagemaker_client, single_variant_xgboost_endpoint):
    """Check the endpoint ARN matches SageMaker's, then check status sync twice.

    The status sync helper is called first with ``self.status_creating`` and
    then with ``self.status_inservice``.
    """
    reference, _, _, _ = single_variant_xgboost_endpoint
    resource = k8s.get_resource(reference)

    endpoint_name = resource["spec"].get("endpointName")
    assert endpoint_name is not None

    # Resource-side ARN is read first, then the SageMaker description.
    arn_from_resource = self._get_resource_endpoint_arn(resource)
    endpoint_description = self._describe_sagemaker_endpoint(
        sagemaker_client, endpoint_name
    )
    assert arn_from_resource == endpoint_description["EndpointArn"]

    self._assert_endpoint_status_in_sync(
        sagemaker_client, endpoint_name, reference, self.status_creating
    )
    self._assert_endpoint_status_in_sync(
        sagemaker_client, endpoint_name, reference, self.status_inservice
    )