Esempio n. 1
0
 def __get_container_log(self, container_name):
     """Return the log of `container_name` from the first pod of the current job.

     Pods are listed in `self.namespace` using `self.inst_name` as the label
     selector; the first pod whose `job-name` label equals `self.inst_name`
     and that has a name is queried for the container's log.

     Returns:
         Whatever `api_request` returns for the pod-log read, or "" (after
         logging a warning) when the pod listing failed or no pod matched.
     """
     response = api_request(self.core_api.list_namespaced_pod,
                            namespace=self.namespace,
                            label_selector=self.inst_name,
                            watch=False,
                            pretty='true')

     # An empty/error response simply yields no pods, so both failure paths
     # share the single warning at the bottom.
     pods = []
     if response and not response.get("error"):
         pods = response.get("items") or []

     for pod in pods:
         # `metadata`/`labels` may be present but None, so `or {}` is used
         # instead of relying on the `.get` default.
         metadata = pod.get("metadata") or {}
         labels = metadata.get("labels") or {}
         if labels.get("job-name") != self.inst_name:
             continue
         pod_name = metadata.get("name")
         if pod_name:
             return api_request(self.core_api.read_namespaced_pod_log,
                                pod_name,
                                self.namespace,
                                container=container_name,
                                follow=False,
                                pretty='true')

     logging.warning(
         f"Failed to retrieve logs for container {container_name} in job {self.inst_name}"
     )
     return ""
Esempio n. 2
0
 def __launch_job(self, log_yaml=True):
     """Create the Kubernetes job described by `self.job_def` in `self.namespace`.

     Args:
         log_yaml: When true, debug-log a YAML dump of the creation response
             (with null, status and self_link lines removed) followed by a
             success message.

     Raises:
         RuntimeError: If the creation response has no dict-valued `status`,
             or if any exception occurs while creating or logging the job.
     """
     try:
         creation_response = api_request(
             self.batch_api.create_namespaced_job, self.namespace,
             self.job_def)
         creation_status = creation_response.get("status", None)
         # A created job reports its status as a non-empty dict; anything
         # else (missing, empty, or a plain string) is treated as a failure.
         if not (creation_status and isinstance(creation_status, dict)):
             raise RuntimeError(
                 f"({self.name}) Failure to create the job on the cluster")

         if log_yaml:
             # Drop noisy lines from the dump before logging it.
             job_yaml = "\n".join(
                 line for line in yaml.dump(creation_response).split("\n")
                 if ": null" not in line and "status:" not in line
                 and "self_link" not in line)
             logging.debug(
                 f"({self.name}) KUBERENETES JOB YAML : \n\n{job_yaml}")
             logging.debug(
                 f"({self.name}) Kubernetes job successfully created! Begin monitoring."
             )
     except Exception as e:
         # Every failure (including the RuntimeError above) is reported as a
         # RuntimeError carrying the underlying reason.
         raise RuntimeError(
             f"({self.name}) Failure to create the job on the cluster Reason: {str(e)}"
         ) from e
Esempio n. 3
0
 def __cleanup_pods(self, job_name):
     """Delete every pod in `self.namespace` whose `job-name` label is `job_name`.

     Pods are listed using `job_name` as the label selector. Each matching,
     named pod is deleted and the outcome of every deletion is logged.

     Returns:
         "" when the pod listing failed (a warning is logged), otherwise None.
     """
     response = api_request(self.core_api.list_namespaced_pod,
                            namespace=self.namespace,
                            label_selector=job_name,
                            watch=False,
                            pretty='true')
     if not response or response.get("error"):
         logging.warning(f"Failed to delete pods for job {job_name}")
         return ""

     # Loop through all the pods to delete them from the cluster
     for pod in response.get("items") or []:
         # `metadata`/`labels` may be present but None.
         metadata = pod.get("metadata") or {}
         labels = metadata.get("labels") or {}
         if labels.get("job-name") != job_name:
             continue
         pod_name = metadata.get("name")
         if not pod_name:
             continue

         delete_response = api_request(self.core_api.delete_namespaced_pod,
                                       pod_name, self.namespace)
         # Same failure convention as the job / volume-claim cleanup: a
         # 'Failure' status counts unless the object was already gone.
         delete_failed = False
         delete_message = ''
         if isinstance(delete_response, dict):
             delete_message = delete_response.get('message') or ''
             delete_failed = (delete_response.get("status") == 'Failure'
                              and 'not found' not in delete_message)
         if delete_failed:
             logging.warning(
                 f"({self.name}) Failed to remove pod {pod_name}. Message: {delete_message}"
             )
         else:
             logging.debug(f"({self.name}) Pod removed successfully")
Esempio n. 4
0
 def validate(self):
     """Check that the Kubernetes platform answers a job-listing request.

     Lists the jobs of the 'cloud-conductor' namespace and treats an empty
     response, or a response without an "items" entry, as a failure.

     Raises:
         RuntimeError: If the listing response is empty or has no "items".
         Exception: Any exception raised by the request is logged and
             re-raised unchanged.
     """
     try:
         job_listing = api_request(self.batch_api.list_namespaced_job,
                                   namespace='cloud-conductor')
         # The response is dict-like (see the `.get("items")` use on the same
         # call in update_statuses), so `.items` would be the always-truthy
         # dict method. An empty list is still a valid listing; only a
         # missing "items" entry signals a bad response.
         if not job_listing or job_listing.get("items") is None:
             raise RuntimeError("Failed to validate Kubernetes platform.")
     except Exception:
         logging.error("Failed to validate Kubernetes platform.")
         raise
Esempio n. 5
0
 def update_statuses(self):
     """Refresh `self.job_list` from the jobs in the 'cloud-conductor' namespace.

     On a non-empty listing, `self.job_list` becomes a dict mapping each
     job's metadata name to the job entry. An empty listing leaves
     `self.job_list` untouched. Any exception is logged and suppressed.
     """
     try:
         listing = api_request(self.batch_api.list_namespaced_job,
                               namespace='cloud-conductor')
         if not listing:
             return
         if not listing.get("items"):
             return

         jobs_by_name = {}
         for entry in listing.get("items"):
             entry_name = entry.get("metadata", {}).get("name")
             jobs_by_name[entry_name] = entry
         # Assign only once the whole mapping has been built.
         self.job_list = jobs_by_name
     except Exception:
         logging.error(
             "Error with updating the job list to check statuses.")
Esempio n. 6
0
    def __rebuild_job(self, preemptible=False):
        """Tear down the current job and relaunch it on a freshly selected node pool.

        Steps: clean up the existing job, store the `preemptible` flag,
        refresh `self.node_label` / `self.nodepool_info` from
        `get_nodepool_info()`, poll every 30 seconds until reading the job
        reports 'not found', then set the new node selector on the job
        definition and launch it without YAML logging.
        """
        self.__cleanup_job()
        self.preemptible = preemptible
        self.node_label, self.nodepool_info = self.get_nodepool_info()
        selector = {'poolName': str(self.node_label)}

        # Wait for the old job to disappear before re-creating it.
        # NOTE(review): this poll has no upper bound - confirm that is intended.
        while True:
            lookup = api_request(self.batch_api.read_namespaced_job,
                                 name=self.inst_name,
                                 namespace=self.namespace)
            if lookup and 'not found' in lookup.get('message', ''):
                break
            time.sleep(30)

        self.job_def.spec.template.spec.node_selector = selector
        self.__launch_job(log_yaml=False)
Esempio n. 7
0
    def __create_volume_claim(self):
        """Create the task's Persistent Volume Claim, retrying on connection errors.

        The claim is named `<self.name>-vc`, requests `self.disk_space` Gi of
        'standard' storage with ReadWriteOnce access, and is stored on
        `self.task_pvc`. Creation is attempted up to 10 times; only responses
        mentioning 'Connection aborted' or 'Connection reset' are retried,
        sleeping `get_api_sleep(attempt)` between attempts.

        Raises:
            RuntimeError: If the request raises, if the response is a
                non-retryable failure, or if every attempt hit a connection
                error.
        """
        # create the persistent volume claim for the task
        self.pvc_name = self.name + '-vc'
        pvc_meta = client.V1ObjectMeta(name=self.pvc_name,
                                       namespace=self.namespace)
        pvc_resources = client.V1ResourceRequirements(
            requests={'storage': str(self.disk_space) + 'Gi'})
        pvc_spec = client.V1PersistentVolumeClaimSpec(
            access_modes=['ReadWriteOnce'],
            resources=pvc_resources,
            storage_class_name='standard')
        self.task_pvc = client.V1PersistentVolumeClaim(metadata=pvc_meta,
                                                       spec=pvc_spec)

        max_attempts = 10
        for attempt in range(1, max_attempts + 1):
            try:
                pvc_response = api_request(
                    self.core_api.create_namespaced_persistent_volume_claim,
                    self.namespace, self.task_pvc)
            except Exception as e:
                raise RuntimeError(
                    f"({self.name}) Failure to create the Persistent Volume Claim on the cluster. Reason: {str(e)}"
                ) from e

            # A successfully created claim reports a non-empty dict status.
            pvc_status = pvc_response.get("status", None)
            if pvc_status and isinstance(pvc_status, dict):
                logging.debug(
                    f"({self.name}) Persistent Volume Claim created.")
                return

            response_text = str(pvc_response)
            if ('Connection aborted' not in response_text
                    and 'Connection reset' not in response_text):
                raise RuntimeError(
                    f"({self.name}) Failure to create a Persistent Volume Claim on the cluster. Response: {response_text}"
                )

            # Transient connection problem: back off, unless this was the
            # final attempt (sleeping then would only delay the error).
            if attempt < max_attempts:
                sleep_time = get_api_sleep(attempt)
                logging.debug(
                    f"({self.name}) Connection issue when creating Persistent Volume Claim. Sleeping for: {sleep_time}"
                )
                time.sleep(sleep_time)
        else:
            # Every attempt failed with a connection error; do not return as
            # though the claim had been created.
            raise RuntimeError(
                f"({self.name}) Failure to create a Persistent Volume Claim on the cluster after {max_attempts} attempts. Response: {str(pvc_response)}"
            )
Esempio n. 8
0
    def __cleanup_volume_claim(self):
        """Delete the task's Persistent Volume Claim, if one was named.

        Does nothing when `self.pvc_name` is empty. A 'Failure' response is
        logged as a warning unless its message says the claim was not found;
        any other response is logged as a successful deletion.
        """
        if not self.pvc_name:
            return

        # Destroy the persistent volume claim
        pvc_response = api_request(
            self.core_api.delete_namespaced_persistent_volume_claim,
            self.pvc_name, self.namespace)

        pvc_status = pvc_response.get("status", None)
        if pvc_status == 'Failure':
            # An already-missing claim is not worth a warning.
            failure_message = pvc_response.get('message', '') or ''
            if 'not found' not in failure_message:
                logging.warning(
                    f"({self.name}) Failed to destroy Persistent Volume Claim. Message: {failure_message}"
                )
        else:
            # Any non-failure status (dict, stringified dict, plain string or
            # empty) means the delete was accepted. The status value itself
            # is not used, so it is not parsed.
            logging.debug(
                f"({self.name}) Persistent Volume Claim successfully destroyed."
            )
Esempio n. 9
0
    def __cleanup_job(self):
        """Delete every job in `self.job_names` together with its pods.

        For each job name the job is deleted in `self.namespace`. A
        'Failure' response is logged as a warning unless its message says
        the job was not found; any other response is logged as a successful
        deletion. The job's pods are then removed regardless of the outcome.
        """
        # Destroy the job
        for job_name in self.job_names or []:
            delete_response = api_request(
                self.batch_api.delete_namespaced_job, job_name,
                self.namespace)

            delete_status = delete_response.get("status", None)
            if delete_status == 'Failure':
                # An already-missing job is not worth a warning.
                failure_message = delete_response.get('message', '') or ''
                if 'not found' not in failure_message:
                    logging.warning(
                        f"({self.name}) Failed to destroy Kubernetes Job. Message: {failure_message}"
                    )
            else:
                # Any non-failure status means the delete was accepted. The
                # status value itself is not used, so it is not parsed - a
                # parse error here must not prevent the pod cleanup below.
                logging.debug(
                    f"({job_name}) Kubernetes job successfully destroyed.")

            # Destroy all pods associated with the job as well
            self.__cleanup_pods(job_name)
Esempio n. 10
0
 def __get_failed_container(self):
     """Return the spec of the first non-ready container in the job's last listed pod.

     Pods are listed in `self.namespace` using `self.inst_name` as the label
     selector and the last pod of the listing is inspected. Init containers
     are checked first, then regular containers. The logs of every ready
     container seen before the failure are accumulated.

     Returns:
         The failed container's spec dict, extended with:
           - 'log': the result of reading that container's log;
           - 'module_log': the concatenated logs of the ready containers
             that preceded it.
         None (after logging a warning) when no pod is listed or every
         container is ready.
     """
     response = api_request(self.core_api.list_namespaced_pod,
                            namespace=self.namespace,
                            label_selector=self.inst_name,
                            watch=False,
                            pretty='true')
     pods = response.get("items") if response else None
     if pods:
         pod = pods[-1]
         pod_name = pod.get("metadata", {}).get("name", '')
         pod_status = pod.get('status') or {}
         pod_spec = pod.get('spec') or {}
         module_log = ''

         # Init containers and regular containers are handled identically;
         # only the status/spec keys differ.
         for status_key, spec_key in (('init_container_statuses',
                                       'init_containers'),
                                      ('container_statuses', 'containers')):
             container_specs = pod_spec.get(spec_key) or []
             statuses = pod_status.get(status_key) or []
             for index, status in enumerate(statuses):
                 container_log = api_request(
                     self.core_api.read_namespaced_pod_log,
                     pod_name,
                     self.namespace,
                     container=status['name'],
                     follow=False,
                     pretty='true')

                 if status['ready']:
                     # Only text can be appended; a non-string (e.g. an
                     # error response) is skipped rather than crashing.
                     if isinstance(container_log, str):
                         module_log += '\n' + container_log
                     continue

                 # Status entries are not guaranteed to be in spec order, so
                 # match the spec by container name, falling back to the
                 # positional entry and finally to a minimal name-only dict.
                 failed_container = next(
                     (spec for spec in container_specs
                      if spec.get('name') == status['name']), None)
                 if failed_container is None:
                     if index < len(container_specs):
                         failed_container = container_specs[index]
                     else:
                         failed_container = {'name': status['name']}

                 failed_container['log'] = container_log
                 failed_container['module_log'] = module_log
                 return failed_container

     logging.warning(
         f"Failed to retrieve failed container in job {self.inst_name}")
     return None