def start_schedule(self, deployment_path: str):
    """Register a schedule and queue each of its jobs as a pending job.

    The YAML file at ``deployment_path`` is loaded and passed through
    ``_completed_local_job_deployment``. The schedule's total request is
    then checked against the cluster's master resource, the schedule
    details are stored in Redis, and one pending job is pushed for every
    entry in the schedule's ``job_names``.

    Args:
        deployment_path: Path of the YAML file describing the schedule.

    Raises:
        BadRequestError: If the allocation check against the master
            resource is not satisfied.
    """
    with open(deployment_path, "r") as deployment_file:
        raw_deployment = yaml.safe_load(deployment_file)

    # The name is read from the raw deployment, before it is completed.
    schedule_name = raw_deployment["name"]
    schedule_detail = self._completed_local_job_deployment(raw_deployment)

    # Only the satisfied flag matters here; the updated resource is discarded.
    fits_in_cluster, _ = resource_op(
        self.cluster_details["master"]["resource"],
        schedule_detail["total_request_resource"],
        ResourceOperation.ALLOCATION,
    )
    if not fits_in_cluster:
        raise BadRequestError(f"No enough resource to start schedule {schedule_name} in {self.cluster_name}.")

    # Store the schedule itself under its own name in the job-details hash.
    serialized_schedule = json.dumps(schedule_detail)
    self._redis_connection.hset(
        f"{self.cluster_name}:job_details",
        schedule_name,
        serialized_schedule,
    )

    # Turn the schedule details into a job template: same content minus
    # the list of job names. The same dict object is renamed and pushed
    # for every job.
    names_to_queue = schedule_detail["job_names"]
    job_template = copy.deepcopy(schedule_detail)
    del job_template["job_names"]
    for queued_name in names_to_queue:
        job_template["name"] = queued_name
        self._push_pending_job(job_template)
def delete(self):
    """Delete the cluster and return its resources to the shared pool.

    Steps, in order: remove the local cluster directory, stop the
    agents, add the cluster's master resource back to the available
    resource kept in the resource Redis, drop this cluster from the
    resource Redis, and clear the local Redis data.
    """
    logger.info(f"Deleting cluster {self.cluster_name}")

    # Remove the local cluster directory; errors during removal are ignored.
    cluster_dir = f"{GlobalPaths.ABS_MARO_CLUSTERS}/{self.cluster_name}"
    shutil.rmtree(cluster_dir, ignore_errors=True)

    # Stop the cluster agents before touching the resource bookkeeping.
    self._agents_stop()

    # Release the cluster's resource back into the available pool.
    pool_before_release = self._resource_redis.get_available_resource()
    released_amount = self.cluster_details["master"]["resource"]
    release_result = resource_op(
        pool_before_release,
        released_amount,
        ResourceOperation.RELEASE,
    )
    _, pool_after_release = release_result
    self._resource_redis.set_available_resource(pool_after_release)

    # Remove this cluster's connection from the resource Redis.
    self._resource_redis.sub_cluster()

    # Clear the local Redis data.
    self._redis_clear()

    logger.info(f"{self.cluster_name} is deleted.")
def _check_pending_ticket(self):
    """Start every pending job whose resource request can be allocated.

    Each ticket in the ``pending_job_tickets`` list is looked up in the
    ``job_details`` hash. The cluster's available resource is re-read
    from Redis for every ticket. If the allocation is satisfied, the job
    is started, its ticket is removed, and the reduced available
    resource is written back. Otherwise the ticket is left in place.
    """
    tickets_key = f"{self.cluster_name}:pending_job_tickets"
    details_key = f"{self.cluster_name}:job_details"
    runtime_key = f"{self.cluster_name}:runtime_detail"

    for ticket in self.redis_connection.lrange(tickets_key, 0, -1):
        ticket_detail = json.loads(self.redis_connection.hget(details_key, ticket))

        # Re-read on every iteration: a job started earlier in this loop
        # has already lowered the stored value.
        stored_resource = self.redis_connection.hget(runtime_key, "available_resource")
        allocation_ok, remaining_resource = resource_op(
            json.loads(stored_resource),
            ticket_detail["total_request_resource"],
            ResourceOperation.ALLOCATION,
        )

        if allocation_ok:
            # Start first, then drop the ticket and record the new balance.
            self._start_job(ticket_detail)
            self.redis_connection.lrem(tickets_key, 0, ticket)
            self.redis_connection.hset(
                runtime_key,
                "available_resource",
                json.dumps(remaining_resource),
            )
def _job_clear(self, job_name: str, release_resource: dict):
    """Add ``release_resource`` back to the cluster's available resource.

    The current value is read from the ``runtime_detail`` hash, combined
    with ``release_resource`` through a RELEASE operation, and written
    back to the same field.

    Args:
        job_name: Name of the job; not referenced in this method body.
        release_resource: Resource to give back to the cluster.
    """
    runtime_key = f"{self.cluster_name}:runtime_detail"

    stored_resource = self.redis_connection.hget(runtime_key, "available_resource")
    current_resource = json.loads(stored_resource)

    # The first element of the result is not needed for a release.
    release_result = resource_op(
        current_resource,
        release_resource,
        ResourceOperation.RELEASE,
    )
    replenished_resource = release_result[1]

    self.redis_connection.hset(
        runtime_key,
        "available_resource",
        json.dumps(replenished_resource),
    )
def start_job(self, deployment_path: str):
    """Queue a single job described by a YAML deployment file.

    The deployment is loaded and passed through
    ``_completed_local_job_deployment``. Its total request is checked
    against the cluster's master resource, and the job is then pushed
    as a pending job.

    Args:
        deployment_path: Path of the YAML file describing the job.

    Raises:
        BadRequestError: If the allocation check against the master
            resource is not satisfied.
    """
    with open(deployment_path, "r") as deployment_file:
        raw_deployment = yaml.safe_load(deployment_file)
    job_deployment = self._completed_local_job_deployment(raw_deployment)

    # Only the satisfied flag is used; the updated resource is discarded.
    fits_in_cluster, _ = resource_op(
        self.cluster_details["master"]["resource"],
        job_deployment["total_request_resource"],
        ResourceOperation.ALLOCATION,
    )
    if fits_in_cluster:
        self._push_pending_job(job_deployment)
        return

    raise BadRequestError(f"No enough resource to start job {job_deployment['name']}.")
def create(self):
    """Create the cluster: reserve its resource, start agents, save details.

    Steps, in order: refuse to continue if the cluster directory already
    exists, register the cluster with the resource Redis, allocate the
    master resource out of the shared available resource, start the
    agents, record the cluster's own available resource in Redis, and
    save the cluster details locally.

    Raises:
        BadRequestError: If the cluster directory already exists, or if
            the allocation is not satisfied. In the latter case the
            cluster is removed from the resource Redis before raising.
    """
    logger.info("Creating cluster")

    # A cluster directory that is already present means the name is taken.
    cluster_dir = f"{GlobalPaths.ABS_MARO_CLUSTERS}/{self.cluster_name}"
    if os.path.isdir(cluster_dir):
        raise BadRequestError(f"Cluster '{self.cluster_name}' is exist.")

    # Build the connection with the resource Redis.
    self._resource_redis.add_cluster()

    # Try to take the cluster's resource out of the shared pool.
    requested_resource = self.cluster_details["master"]["resource"]
    pool_before_allocation = self._resource_redis.get_available_resource()
    allocation_ok, pool_after_allocation = resource_op(
        pool_before_allocation,
        requested_resource,
        ResourceOperation.ALLOCATION,
    )
    if not allocation_ok:
        # Undo the registration made above before reporting the failure.
        self._resource_redis.sub_cluster()
        raise BadRequestError("No enough resource for this cluster.")
    self._resource_redis.set_available_resource(pool_after_allocation)

    # Start agents.
    self._agents_start()

    # The cluster's own available resource starts as the full master resource.
    serialized_resource = json.dumps(requested_resource)
    self._redis_connection.hset(
        f"{self.cluster_name}:runtime_detail",
        "available_resource",
        serialized_resource,
    )

    # Save cluster config locally.
    DetailsWriter.save_cluster_details(
        cluster_name=self.cluster_name,
        cluster_details=self.cluster_details,
    )

    logger.info(f"{self.cluster_name} is created.")