def resize_pool_and_check_for_resize_errors(
        self, pool: batchmodels.CloudPool,
        batch_service_client: batch.BatchExtensionsClient,
        timeout: datetime.timedelta = None) -> bool:
    """
    Double the pool's dedicated node target and wait for the resize to settle.

    After requesting the resize the pool is polled every 10 seconds while its
    allocation state is "resizing". If the pool then reports a resize error,
    ``self.status`` is set to POOL_FAILED and False is returned.

    :param pool: The pool to attempt to resize; only its
        ``target_dedicated_nodes`` is read before the pool is re-fetched.
    :type pool: `azure.batch.models.CloudPool`
    :param batch_service_client: Client used to resize and re-read the pool.
    :type batch_service_client: `azure.batch.BatchExtensionsClient`
    :param timedelta timeout: Value handed to ``check_time_has_expired`` to
        bound the polling loop. When omitted (None) the loop is bounded only
        by the pool leaving the "resizing" state.
    :return: True if the pool settled without a resize error, otherwise False.
    :rtype: bool
    """
    new_node_count = pool.target_dedicated_nodes * 2
    logger.info("Resizing pool [{}] to node count {}".format(
        self.pool_id, new_node_count))
    # NOTE(review): azure-batch documents PoolOperations.resize as taking a
    # `pool_resize_parameter` (PoolResizeParameter) argument - confirm that
    # this client really accepts `target_dedicated_nodes` as a keyword.
    batch_service_client.pool.resize(self.pool_id,
                                     target_dedicated_nodes=new_node_count)

    pool = batch_service_client.pool.get(self.pool_id)
    # `timeout` used to be referenced here without being defined anywhere in
    # this method, which raised NameError as soon as the pool was resizing.
    # NOTE(review): despite its name, check_time_has_expired is used as
    # "still within the timeout" by every loop in this file - confirm.
    while pool.allocation_state.value == "resizing" and (
            timeout is None or self.check_time_has_expired(timeout)):
        time.sleep(10)
        pool = batch_service_client.pool.get(self.pool_id)

    if self.check_for_pool_resize_error(pool):
        self.status = utils.JobStatus(
            utils.JobState.POOL_FAILED,
            "Job failed to start since the pool [{}] failed to allocate any TVMs due to "
            "error [Code: {}, message {}]. ".format(
                self.pool_id, pool.resize_errors[0].code,
                pool.resize_errors[0].message))
        return False
    return True


def __init__(self,
             template_file: str,
             pool_template_file: str,
             parameters_file: str,
             keyvault_client_with_url: tuple,
             expected_output: str,
             application_licenses: str = None,
             repository_branch_name: str = None):
    """
    Record the job's inputs and initialise its bookkeeping fields.

    The job id and pool id are read from ``parameters_file`` through ``ctm``
    and prefixed with the module-level ``_time`` value; every other argument
    is stored unchanged on the instance under the same name.

    NOTE(review): this file also contains a second ``__init__`` without
    ``keyvault_client_with_url`` / ``repository_branch_name``. If both live in
    the same class body the one defined last replaces the other - confirm
    which signature is intended.

    :param str template_file: Stored as ``self.template_file``.
    :param str pool_template_file: Stored as ``self.pool_template_file``.
    :param str parameters_file: Stored, and used to look up the job/pool ids.
    :param tuple keyvault_client_with_url: Stored as
        ``self.keyvault_client_with_url``.
    :param str expected_output: Stored as ``self.expected_output``.
    :param str application_licenses: Stored as ``self.application_licenses``.
    :param str repository_branch_name: Stored as
        ``self.repository_branch_name``.
    """
    super(JobManager, self).__init__()

    # Identifiers: the raw job id comes from the parameters file; job and
    # pool ids carry the `_time` prefix.
    self.raw_job_id = ctm.get_job_id(parameters_file)
    self.job_id = _time + "-" + self.raw_job_id
    self.pool_id = _time + "-" + ctm.get_pool_id(parameters_file)

    # Caller-supplied inputs, kept as given.
    self.template_file = template_file
    self.parameters_file = parameters_file
    self.keyvault_client_with_url = keyvault_client_with_url
    self.application_licenses = application_licenses
    self.repository_branch_name = repository_branch_name
    self.expected_output = expected_output
    self.pool_template_file = pool_template_file

    # Run-time state, filled in later.
    self.storage_info = None  # utils.StorageInfo once assigned
    self.status = utils.JobStatus(utils.JobState.NOT_STARTED,
                                  "Job hasn't started yet.")
    self.duration = None  # timedelta once assigned
    self.pool_start_duration = None  # timedelta, set by wait_for_steady_tvm
    self.start_time = datetime.datetime.now()


def wait_for_steady_tvm(self,
                        batch_service_client: batch.BatchExtensionsClient,
                        timeout: datetime.timedelta) -> bool:
    """
    Wait until the pool has a TVM available to run the job.

    Steps: wait for the pool to leave the "resizing" state; if it settled
    with a resize error, retry once with a doubled node count; then poll the
    pool's compute nodes every 10 seconds while any of them is not idle.

    On success ``self.pool_start_duration`` is set to the time between pool
    creation and the idle node's last state transition. On failure
    ``self.status`` is set to POOL_FAILED.

    :param batch_service_client: A Batch service client.
    :type batch_service_client: `azure.batch.BatchExtensionsClient`
    :param timedelta timeout: Value handed to ``check_time_has_expired`` to
        bound every polling loop.
    :return: True when a node in the idle state was found, otherwise False.
    :rtype: bool
    """
    idle = batchmodels.ComputeNodeState.idle

    # Wait for the pool to come up.
    pool = batch_service_client.pool.get(self.pool_id)
    while pool.allocation_state.value == "resizing" and self.check_time_has_expired(
            timeout):
        time.sleep(10)
        pool = batch_service_client.pool.get(self.pool_id)

    # If the pool allocated with resize errors, retry with a larger pool and
    # give up when that fails as well. The same timeout bounds the retry.
    if self.check_for_pool_resize_error(pool):
        if not self.resize_pool_and_check_for_resize_errors(
                pool, batch_service_client, timeout):
            return False

    # Wait for TVMs to become available.
    # compute_node.list returns an object that contains a list, hence list().
    nodes = list(batch_service_client.compute_node.list(self.pool_id))
    logger.info("Waiting for a TVM to allocate in pool: [{}]".format(
        self.pool_id))
    while any(n.state != idle for n in nodes) and self.check_time_has_expired(
            timeout):
        time.sleep(10)
        nodes = list(batch_service_client.compute_node.list(self.pool_id))

    # Pool startup duration is the time between pool creation and the first
    # node reported as idle.
    for n in nodes:
        if n.state == idle:
            self.pool_start_duration = n.state_transition_time - pool.creation_time
            logger.info("Job [{}] is starting to run on a TVM".format(
                self.job_id))
            return True

    # Reaching this point means no node went idle before the timeout. The
    # failure is written to `self.status` (previously `self.job_status`,
    # which no other method in this file reads or initialises).
    self.status = utils.JobStatus(
        utils.JobState.POOL_FAILED,
        "Failed to start the pool [{}] before [{}], you may want to increase your timeout]."
        .format(self.pool_id, timeout))
    logger.error("POOL [{}] FAILED TO ALLOCATE IN TIME".format(
        self.pool_id))
    return False
def check_for_pool_resize_error(self, pool: "batchmodels.CloudPool") -> bool:
    """
    Report whether the pool finished allocating with at least one resize error.

    When an error is found, ``self.status`` is set to POOL_FAILED with the
    code and message of the first resize error, and an error is logged.

    :param pool: The pool to inspect; its ``allocation_state`` and
        ``resize_errors`` are read.
    :type pool: `azure.batch.models.CloudPool`
    :return: True if the pool is steady and reports a resize error,
        otherwise False.
    :rtype: bool
    """
    # Truthiness covers both None and an empty list; indexing [0] on an
    # empty list used to raise IndexError here.
    if pool.allocation_state.value != "steady" or not pool.resize_errors:
        return False

    first_error = pool.resize_errors[0]
    self.status = utils.JobStatus(
        utils.JobState.POOL_FAILED,
        "Job failed to start since the pool [{}] failed to allocate any TVMs due to "
        "error [Code: {}, message {}]. ".format(
            self.pool_id, first_error.code, first_error.message))
    logger.error("POOL {} FAILED TO ALLOCATE".format(self.pool_id))
    return True
def __init__(self,
             template_file: str,
             pool_template_file: str,
             parameters_file: str,
             expected_output: str,
             application_licenses: str = None):
    """
    Record the job's inputs and reset its run-time state.

    The job id and pool id are read from ``parameters_file`` through ``ctm``
    and prefixed with the module-level ``_time`` value; the remaining
    arguments are stored unchanged under the same attribute names.

    NOTE(review): this file also contains another ``__init__`` that
    additionally takes ``keyvault_client_with_url`` and
    ``repository_branch_name``. If both live in the same class body the one
    defined last replaces the other - confirm which signature is intended.

    :param str template_file: Stored as ``self.template_file``.
    :param str pool_template_file: Stored as ``self.pool_template_file``.
    :param str parameters_file: Stored, and used to look up the job/pool ids.
    :param str expected_output: Stored as ``self.expected_output``.
    :param str application_licenses: Stored as ``self.application_licenses``.
    """
    super(JobManager, self).__init__()

    # Identifiers derived from the parameters file.
    raw_id = ctm.get_job_id(parameters_file)
    self.raw_job_id = raw_id
    self.job_id = "-".join((_time, raw_id))
    self.pool_id = "-".join((_time, ctm.get_pool_id(parameters_file)))

    # Inputs kept exactly as supplied by the caller.
    self.template_file = template_file
    self.pool_template_file = pool_template_file
    self.parameters_file = parameters_file
    self.expected_output = expected_output
    self.application_licenses = application_licenses

    # Run-time state; nothing has been measured or uploaded yet.
    self.storage_info = None  # utils.StorageInfo once assigned
    initial_status = utils.JobStatus(utils.JobState.NOT_STARTED,
                                     "Job hasn't started yet.")
    self.status = initial_status
    self.duration = None  # the original comment gives the type as 'int'