def _update_node_details(self) -> None:
    """Refresh this node's details record and write it back to Redis.

    Fetches the current record via ``get_node_details``, lets the container
    and system-resource updaters modify it, marks the node as ``'Running'``,
    stamps it with the first element of the Redis server's ``time()`` reply,
    and persists the result via ``set_node_details``.
    """
    # Fetch the current record for this node.
    details = get_node_details(
        redis=self._redis,
        cluster_name=self._cluster_name,
        node_name=self._node_name,
    )

    # Main update: both updaters receive the same dict.
    self._update_container_details(node_details=details)
    self._update_system_resources_details(node_details=details)

    # Bookkeeping fields: the timestamp comes from the Redis server rather
    # than the local clock.
    details['state'] = 'Running'
    details['check_time'] = self._redis.time()[0]

    # Persist the refreshed record.
    set_node_details(
        redis=self._redis,
        cluster_name=self._cluster_name,
        node_name=self._node_name,
        node_details=details,
    )
parser.add_argument('node_name')
parser.add_argument('parallels', type=int)
args = parser.parse_args()

# Load cluster details to locate the master's Redis endpoint.
cluster_details = load_cluster_details(cluster_name=args.cluster_name)
master_hostname = cluster_details['master']['hostname']
redis_port = cluster_details['master']['redis']['port']

# Load node and master details.
# `encoding` is the supported spelling of the deprecated `charset` keyword,
# which newer redis-py releases no longer accept.
redis = Redis(
    host=master_hostname,
    port=redis_port,
    encoding="utf-8",
    decode_responses=True,
)
node_details = get_node_details(
    redis=redis,
    cluster_name=args.cluster_name,
    node_name=args.node_name,
)
master_details = get_master_details(redis=redis, cluster_name=args.cluster_name)
master_image_files_details = master_details['image_files']
node_image_files_details = node_details['image_files']

# Get unloaded images: an image file counts as unloaded when the node has no
# record of it, or when the recorded modify time or size differs from the
# master's record.
unloaded_images = []
for image_file, image_file_details in master_image_files_details.items():
    if (
        image_file not in node_image_files_details
        or image_file_details['modify_time'] != node_image_files_details[image_file]['modify_time']
        or image_file_details['size'] != node_image_files_details[image_file]['size']
    ):
        unloaded_images.append(image_file)
sys.stdout.write(f"Unloaded_images: {unloaded_images}\n")
def start_container(self, container_name: str, node_name: str, job_details: dict):
    """Start one job container on the given node by running a shell command.

    The component id and index are taken from the last two ``-``-separated
    fields of ``container_name``. The start command template (GPU or non-GPU
    variant, chosen by the component's ``gpu`` resource value) is filled in
    with cluster, node and job values, logged, and executed through the shell.

    Args:
        container_name: Container name whose last two ``-``-separated fields
            are the component id and the component index.
        node_name: Name of the node whose details (hostname) are looked up.
        job_details: Job description; ``name``, ``id`` and
            ``components[<component_type>]`` (``resources``, ``mount``,
            ``command``, ``image``) are read.

    Raises:
        AllocationFailed: If the command exits with a non-zero return code;
            carries the command's stderr output.
    """
    # Load details and vars
    cluster_name = self._cluster_name
    type_by_component_id = self._get_component_id_to_component_type(
        job_details=job_details
    )
    name_fields = container_name.split('-')
    component_id = name_fields[-2]
    component_index = name_fields[-1]
    component_type = type_by_component_id[component_id]
    job_name = job_details['name']
    job_id = job_details['id']
    cluster_id = self._cluster_id
    admin_username = self._admin_username
    node_hostname = get_node_details(
        redis=self._redis,
        cluster_name=self._cluster_name,
        node_name=node_name,
    )['hostname']
    master_hostname = self._master_hostname

    # Environment parameters handed to the container, space-separated.
    environment_parameters = " ".join([
        f"-e COMPONENT_TYPE={component_type}",
        f"-e COMPONENT_ID={component_id}",
        f"-e COMPONENT_INDEX={component_index}",
        f"-e JOB_NAME={job_name}",
        f"-e JOB_ID={job_id}",
        f"-e CLUSTER_NAME={cluster_name}",
        f"-e CLUSTER_ID={cluster_id}",
        "-e PYTHONUNBUFFERED=0",
    ])

    # Pick the command template: the GPU variant for any non-zero gpu value.
    component_spec = job_details['components'][component_type]
    resources = component_spec['resources']
    if resources['gpu'] == 0:
        template = START_CONTAINER_COMMAND
    else:
        template = START_CONTAINER_WITH_GPU_COMMAND

    command = template.format(
        # cluster related
        admin_username=admin_username,
        master_hostname=master_hostname,
        node_hostname=node_hostname,
        fluentd_port=self._fluentd_port,
        # job related (user)
        cpu=resources['cpu'],
        memory=resources['memory'],
        gpu=resources['gpu'],
        mount_target=component_spec['mount']['target'],
        command=component_spec['command'],
        image_name=component_spec['image'],
        # job related (system)
        container_name=container_name,
        job_id=job_id,
        mount_source=f"~/.maro/clusters/{cluster_name}/data/",
        environment_parameters=environment_parameters,
    )

    # Exec command (runs through the shell; output is captured as text)
    logger.info(command)
    result = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding='utf8',
    )
    if result.returncode != 0:
        raise AllocationFailed(result.stderr)