def __get_container_log(self, container_name):
    """
    Return the logs for the specified container name in the currently
    running job.

    Args:
        container_name (str): Name of the container whose log is wanted.

    Returns:
        str: The container's log text, or "" when the pod or its log
        could not be retrieved.
    """
    response = api_request(self.core_api.list_namespaced_pod,
                           namespace=self.namespace,
                           label_selector=self.inst_name,
                           watch=False,
                           pretty='true')

    if response.get("error"):
        logging.warning(
            f"Failed to retrieve logs for container {container_name} in job {self.inst_name}"
        )
        return ""

    # Loop through all the pods to find the pods for the job.
    # BUG FIX: default to [] so a missing/None "items" key cannot raise a
    # TypeError while iterating.
    for pod in response.get("items", []):
        if pod.get("metadata", {}).get("labels", {}).get("job-name") == self.inst_name:
            pod_name = pod.get("metadata", {}).get("name", '')
            if pod_name:
                response = api_request(
                    self.core_api.read_namespaced_pod_log,
                    pod_name,
                    self.namespace,
                    container=container_name,
                    follow=False,
                    pretty='true')
                return response

    logging.warning(
        f"Failed to retrieve logs for container {container_name} in job {self.inst_name}"
    )
    return ""
def __launch_job(self, log_yaml=True):
    """
    Create the Kubernetes job on the cluster and begin monitoring it.

    Args:
        log_yaml (bool): When True, dump the creation response as YAML at
            debug level, with null-valued, status, and self_link lines
            stripped out.

    Raises:
        RuntimeError: If the job could not be created on the cluster
            (any underlying exception is wrapped with its reason).
    """
    try:
        creation_response = api_request(
            self.batch_api.create_namespaced_job, self.namespace,
            self.job_def)
        creation_status = creation_response.get("status", None)
        if creation_status and isinstance(
                creation_status, dict) and creation_status != 'Failure':
            if log_yaml:
                job_yaml = yaml.dump(creation_response).split("\n")
                stripped_yaml = []
                for line in job_yaml:
                    # BUG FIX: the original tested the bare string
                    # "self_link" (always truthy) instead of a membership
                    # check, so self_link lines were never stripped.
                    if ": null" not in line and "status:" not in line \
                            and "self_link" not in line:
                        stripped_yaml.append(line)
                job_yaml = "\n".join(stripped_yaml)
                # BUG FIX: corrected "KUBERENETES" typo in the debug message
                logging.debug(
                    f"({self.name}) KUBERNETES JOB YAML : \n\n{job_yaml}")
            logging.debug(
                f"({self.name}) Kubernetes job successfully created! Begin monitoring."
            )
        else:
            # Wrapped by the except below, which appends the reason
            raise RuntimeError(
                f"({self.name}) Failure to create the job on the cluster")
    except Exception as e:
        raise RuntimeError(
            f"({self.name}) Failure to create the job on the cluster Reason: {str(e)}"
        )
def __cleanup_pods(self, job_name):
    """
    Delete every pod on the cluster that belongs to the given job.

    Args:
        job_name (str): Name of the job whose pods should be removed.

    Returns:
        str | None: "" when the pod listing failed, otherwise None.
    """
    response = api_request(self.core_api.list_namespaced_pod,
                           namespace=self.namespace,
                           label_selector=job_name,
                           watch=False,
                           pretty='true')

    if response.get("error"):
        logging.warning(f"Failed to delete pods for job {job_name}")
        return ""

    # Loop through all the pods to delete them from the cluster.
    # BUG FIX: default to [] so a missing/None "items" key cannot raise a
    # TypeError while iterating.
    for pod in response.get("items", []):
        if pod.get("metadata", {}).get("labels", {}).get("job-name") == job_name:
            pod_name = pod.get("metadata", {}).get("name", '')
            if pod_name:
                response = api_request(self.core_api.delete_namespaced_pod,
                                       pod_name, self.namespace)
                logging.debug(f"({self.name}) Pod removed successfully")
def validate(self):
    """
    Validate connectivity to the Kubernetes platform by listing the jobs
    in the 'cloud-conductor' namespace.

    Raises:
        RuntimeError: If the job list could not be retrieved.
    """
    try:
        namespace_list = api_request(self.batch_api.list_namespaced_job,
                                     namespace='cloud-conductor')
        # BUG FIX: api_request returns a dict (see the .get() usage on its
        # result throughout this file), so the original
        # `namespace_list.items` referenced the always-truthy dict.items
        # bound method and the failure branch could never trigger.
        if not namespace_list or not namespace_list.get("items"):
            logging.error("Failed to validate Kubernetes platform.")
            raise RuntimeError("Failed to validate Kubernetes platform.")
    except BaseException:
        # Log every failure mode, then propagate to the caller
        logging.error("Failed to validate Kubernetes platform.")
        raise
def update_statuses(self):
    """
    Refresh ``self.job_list`` with the jobs currently present in the
    'cloud-conductor' namespace, keyed by job name.

    Failures are logged and swallowed so that status polling never raises.
    """
    try:
        response = api_request(self.batch_api.list_namespaced_job,
                               namespace='cloud-conductor')
        items = response.get("items") if response else None
        if items:
            # Map each job's metadata.name to its full job record
            self.job_list = {
                job.get("metadata", {}).get("name"): job
                for job in items
            }
    except Exception:
        logging.error(
            "Error with updating the job list to check statuses.")
def __rebuild_job(self, preemptible=False):
    """
    Tear down the current job and relaunch it on a freshly selected
    nodepool.

    Args:
        preemptible (bool): Whether the relaunched job should target a
            preemptible nodepool.
    """
    self.__cleanup_job()

    self.preemptible = preemptible
    self.node_label, self.nodepool_info = self.get_nodepool_info()
    selector = {'poolName': str(self.node_label)}

    # Poll until the old job has actually disappeared from the cluster;
    # the API reports 'not found' once deletion has completed.
    while True:
        response = api_request(self.batch_api.read_namespaced_job,
                               name=self.inst_name,
                               namespace=self.namespace)
        if response and 'not found' in response.get('message', ''):
            break
        time.sleep(30)

    self.job_def.spec.template.spec.node_selector = selector
    self.__launch_job(log_yaml=False)
def __create_volume_claim(self):
    """
    Create the persistent volume claim (PVC) for the task.

    Retries up to 10 times with backoff when the API response indicates a
    connection issue; any other failure raises immediately.

    Raises:
        RuntimeError: If the PVC could not be created, or if all retries
            were exhausted on connection issues.
    """
    # create the persistent volume claim for the task
    self.pvc_name = self.name + '-vc'

    pvc_meta = client.V1ObjectMeta(name=self.pvc_name,
                                   namespace=self.namespace)
    pvc_resources = client.V1ResourceRequirements(
        requests={'storage': str(self.disk_space) + 'Gi'})
    pvc_spec = client.V1PersistentVolumeClaimSpec(
        access_modes=['ReadWriteOnce'],
        resources=pvc_resources,
        storage_class_name='standard')
    self.task_pvc = client.V1PersistentVolumeClaim(metadata=pvc_meta,
                                                   spec=pvc_spec)

    for i in range(10):
        try:
            pvc_response = api_request(
                self.core_api.create_namespaced_persistent_volume_claim,
                self.namespace, self.task_pvc)
        except Exception as e:
            raise RuntimeError(
                f"({self.name}) Failure to create the Persistent Volume Claim on the cluster. Reason: {str(e)}"
            )

        # Save the status if the job is no longer active
        pvc_status = pvc_response.get("status", None)
        if pvc_status and isinstance(pvc_status, dict):
            logging.debug(
                f"({self.name}) Persistent Volume Claim created.")
            break

        if 'Connection aborted' in str(pvc_response) \
                or 'Connection reset' in str(pvc_response):
            # Transient network problem — back off and retry
            sleep_time = get_api_sleep(i + 1)
            logging.debug(
                f"({self.name}) Connection issue when creating Persistent Volume Claim. Sleeping for: {sleep_time}"
            )
            time.sleep(sleep_time)
            continue

        raise RuntimeError(
            f"({self.name}) Failure to create a Persistent Volume Claim on the cluster. Response: {str(pvc_response)}"
        )
    else:
        # BUG FIX: previously the method returned silently (as if the PVC
        # had been created) after exhausting all retries on connection
        # issues; fail loudly instead.
        raise RuntimeError(
            f"({self.name}) Failure to create a Persistent Volume Claim on the cluster: retries exhausted."
        )
def __cleanup_volume_claim(self):
    """
    Delete the task's persistent volume claim, if one was created.

    A 'not found' deletion response is treated as success; any other
    failure is logged as a warning.
    """
    if self.pvc_name:
        # Destroy the persistent volume claim
        pvc_response = api_request(
            self.core_api.delete_namespaced_persistent_volume_claim,
            self.pvc_name, self.namespace)

        # Save the status if the job is no longer active
        pvc_status = pvc_response.get("status", None)
        if pvc_status and pvc_status == 'Failure':
            # The claim having already been removed ('not found') is not
            # an error worth reporting
            if 'not found' not in pvc_response.get('message', ''):
                logging.warning(
                    f"({self.name}) Failed to destroy Persistent Volume Claim. Message: {pvc_response.get('message', '')}"
                )
        elif pvc_status and not isinstance(pvc_status, dict):
            # NOTE(review): the parsed value is never used after this
            # assignment — looks like dead code or a missing follow-up
            # check; confirm intent before removing (literal_eval can
            # itself raise on a malformed status string).
            pvc_status = ast.literal_eval(pvc_status)
        else:
            logging.debug(
                f"({self.name}) Persistent Volume Claim successfully destroyed."
            )
def __cleanup_job(self):
    """
    Delete every Kubernetes job tracked in ``self.job_names``, along with
    the pods each job spawned.

    A 'not found' deletion response is treated as success; any other
    failure is logged as a warning.
    """
    # Destroy the job
    if self.job_names:
        for job_name in self.job_names:
            delete_response = api_request(
                self.batch_api.delete_namespaced_job, job_name,
                self.namespace)

            # Save the status if the job is no longer active
            delete_status = delete_response.get("status", None)
            if delete_status and delete_status == 'Failure':
                # The job having already been removed ('not found') is
                # not an error worth reporting
                if 'not found' not in delete_response.get('message', ''):
                    logging.warning(
                        f"({self.name}) Failed to destroy Kubernetes Job. Message: {delete_response.get('message', '')}"
                    )
            elif delete_status and not isinstance(delete_status, dict):
                # NOTE(review): the parsed value is never used after this
                # assignment — looks like dead code or a missing
                # follow-up check; confirm intent before removing.
                delete_status = ast.literal_eval(delete_status)
            else:
                logging.debug(
                    f"({job_name}) Kubernetes job successfully destroyed.")

            # Destroy all pods associated with the job as well
            self.__cleanup_pods(job_name)
def __get_failed_container(self):
    """
    Find the first non-ready container in the job's most recently listed
    pod and return its spec, augmented with its own log and the
    accumulated logs of the containers that ran before it.

    Returns:
        dict | None: The failed container's spec with 'log' and
        'module_log' keys added, or None when no failed container was
        found (a warning is logged in that case).
    """
    response = api_request(self.core_api.list_namespaced_pod,
                           namespace=self.namespace,
                           label_selector=self.inst_name,
                           watch=False,
                           pretty='true')

    if response.get("items"):
        # Inspect the most recently listed pod for the job
        # (idiom fix: [-1] instead of [len(items) - 1])
        pod = response["items"][-1]
        pod_name = pod.get("metadata", {}).get("name", '')

        def _read_log(container):
            # Fetch the full log of one container in the pod
            return api_request(self.core_api.read_namespaced_pod_log,
                               pod_name,
                               self.namespace,
                               container=container,
                               follow=False,
                               pretty='true')

        module_log = ''
        # Init containers run before the app containers, so scan them
        # first. The two scans were previously near-identical duplicated
        # loops; they are unified here. Logs of every ready container are
        # accumulated into module_log for context on the failure.
        for statuses_key, containers_key in (
                ('init_container_statuses', 'init_containers'),
                ('container_statuses', 'containers')):
            statuses = pod['status'][statuses_key]
            if statuses:
                for index, status in enumerate(statuses):
                    log_text = _read_log(status['name'])
                    if not status['ready']:
                        failed_container = pod['spec'][containers_key][index]
                        failed_container['log'] = log_text
                        failed_container['module_log'] = module_log
                        return failed_container
                    module_log += '\n' + log_text

    logging.warning(
        f"Failed to retrieve failed container in job {self.inst_name}")
    return None