def _delete_workflow_engine_pod(workflow):
    """Delete the workflow engine pod belonging to ``workflow``.

    Scans pods in the REANA runtime namespace for the one whose name
    contains the workflow UUID, appends that pod's ``workflow-engine``
    container logs to the workflow's stored logs, and then deletes the
    Kubernetes job that owns the pod.

    :param workflow: Workflow model whose engine pod should be removed.
    :raises REANAWorkflowControllerError: if the Kubernetes API call fails.
    """
    try:
        pods = current_k8s_corev1_api_client.list_namespaced_pod(
            namespace=REANA_RUNTIME_KUBERNETES_NAMESPACE,
        )
        for pod in pods.items:
            if str(workflow.id_) in pod.metadata.name:
                # Capture the engine logs before the pod disappears.
                workflow_engine_logs = (
                    current_k8s_corev1_api_client.read_namespaced_pod_log(
                        namespace=pod.metadata.namespace,
                        name=pod.metadata.name,
                        container="workflow-engine",
                    )
                )
                workflow.logs = (workflow.logs or "") + workflow_engine_logs + "\n"
                # Delete the owning job; "Background" propagation also
                # removes the job's pods asynchronously.
                current_k8s_batchv1_api_client.delete_namespaced_job(
                    namespace=pod.metadata.namespace,
                    propagation_policy="Background",
                    name=pod.metadata.labels["job-name"],
                )
                break
    except ApiException as e:
        raise REANAWorkflowControllerError(
            "Workflow engine pod could not be deleted {}.".format(e)
        )
    except Exception as e:
        # Best-effort cleanup: log unexpected failures instead of crashing.
        logging.error(traceback.format_exc())
        logging.error("Unexpected error: {}".format(e))
def _delete_workflow_engine_pod(workflow):
    """Delete the workflow engine pod belonging to ``workflow``.

    Scans pods in the ``default`` namespace for the one whose name contains
    the workflow UUID, appends that pod's ``workflow-engine`` container logs
    to the workflow's stored logs, and then deletes the Kubernetes job that
    owns the pod.

    :param workflow: Workflow model whose engine pod should be removed.
    :raises REANAWorkflowControllerError: if the Kubernetes API call fails.
    """
    try:
        pods = current_k8s_corev1_api_client.list_namespaced_pod(
            namespace='default',
        )
        for pod in pods.items:
            if str(workflow.id_) in pod.metadata.name:
                # Capture the engine logs before the pod disappears.
                workflow_engine_logs = \
                    current_k8s_corev1_api_client.read_namespaced_pod_log(
                        namespace=pod.metadata.namespace,
                        name=pod.metadata.name,
                        container='workflow-engine')
                workflow.logs = \
                    (workflow.logs or '') + workflow_engine_logs + '\n'
                # Delete the owning job; "Background" propagation also
                # removes the job's pods asynchronously.
                current_k8s_batchv1_api_client.delete_namespaced_job(
                    namespace='default',
                    propagation_policy="Background",
                    name=pod.metadata.labels['job-name'])
                break
    except ApiException as e:
        raise REANAWorkflowControllerError(
            "Workflow engine pod could not be deleted {}.".format(e))
    except Exception as e:
        # Best-effort cleanup: log unexpected failures instead of crashing.
        logging.error(traceback.format_exc())
        logging.error("Unexpected error: {}".format(e))
def get_k8s_jobs_by_status(self, status):
    """Get from k8s API jobs in ``status`` status."""
    # Query pods in the runtime namespace filtered by their phase.
    pod_list = current_k8s_corev1_api_client.list_namespaced_pod(
        REANA_RUNTIME_KUBERNETES_NAMESPACE,
        field_selector=f"status.phase={status}",
    )
    # Keep only the run-job pods created by REANA.
    run_job_prefix = f"{REANA_COMPONENT_PREFIX}-run-job"
    matching_names = []
    for pod in pod_list.items:
        pod_name = pod.metadata.name
        if pod_name.startswith(run_job_prefix):
            matching_names.append(pod_name)
    return matching_names
def _get_workflow_engine_pod_logs(workflow: Workflow) -> str:
    """Return the ``workflow-engine`` container logs for ``workflow``.

    Looks up the run-batch pod labelled with the workflow UUID and fetches
    its workflow-engine container logs.

    :param workflow: Workflow model whose engine logs should be fetched.
    :return: The pod logs, or an empty string if no matching pod exists.
    :raises REANAWorkflowControllerError: if the Kubernetes API call fails.
    """
    try:
        pods = current_k8s_corev1_api_client.list_namespaced_pod(
            namespace=REANA_RUNTIME_KUBERNETES_NAMESPACE,
            label_selector=f"reana-run-batch-workflow-uuid={str(workflow.id_)}",
        )
        for pod in pods.items:
            if str(workflow.id_) in pod.metadata.name:
                return current_k8s_corev1_api_client.read_namespaced_pod_log(
                    namespace=pod.metadata.namespace,
                    name=pod.metadata.name,
                    container="workflow-engine",
                )
    except ApiException as e:
        raise REANAWorkflowControllerError(
            f"Workflow engine pod logs could not be fetched. Error: {e}")
    # No matching pod found: return empty logs explicitly instead of the
    # implicit ``None``, honouring the declared ``-> str`` return type.
    return ""
def k8s_watch_jobs(job_db):
    """Open stream connection to k8s apiserver to watch all jobs status.

    :param job_db: Dictionary which contains all current jobs.
    :param config: configuration to connect to k8s apiserver.
    """
    # Loop forever: the watch stream ends periodically, so a new stream
    # request is opened each time the inner ``for`` finishes or fails.
    while True:
        logging.debug('Starting a new stream request to watch Jobs')
        try:
            w = watch.Watch()
            for event in w.stream(current_k8s_batchv1_api_client.
                                  list_job_for_all_namespaces):
                logging.info('New Job event received: {0}'.format(
                    event['type']))
                job = event['object']
                # Taking note of the remaining jobs since deletion might not
                # happend straight away.
                remaining_jobs = [
                    j for j in job_db.keys()
                    if not job_db[j]['deleted']
                ]
                if (not job_db.get(job.metadata.name) or
                        job.metadata.name not in remaining_jobs):
                    # Ignore jobs not created by this specific instance
                    # or already deleted jobs.
                    continue
                elif job.status.succeeded:
                    logging.info('Job {} succeeded.'.format(
                        job.metadata.name))
                    job_db[job.metadata.name]['status'] = 'succeeded'
                elif (job.status.failed and
                        job.status.failed >= config.MAX_JOB_RESTARTS):
                    # Only mark failed once the restart budget is exhausted.
                    logging.info('Job {} failed.'.format(job.metadata.name))
                    job_db[job.metadata.name]['status'] = 'failed'
                else:
                    # Job still running / retrying: nothing to do yet.
                    continue
                # Grab logs when job either succeeds or fails.
                logging.info('Getting last spawned pod for job {}'.format(
                    job.metadata.name))
                # The job may have spawned several pods (retries); take the
                # last one in the listing.
                last_spawned_pod = \
                    current_k8s_corev1_api_client.list_namespaced_pod(
                        job.metadata.namespace,
                        label_selector='job-name={job_name}'.format(
                            job_name=job.metadata.name)).items[-1]
                logging.info('Grabbing pod {} logs...'.format(
                    last_spawned_pod.metadata.name))
                job_db[job.metadata.name]['log'] = \
                    current_k8s_corev1_api_client.read_namespaced_pod_log(
                        namespace=last_spawned_pod.metadata.namespace,
                        name=last_spawned_pod.metadata.name)
                # Store job logs
                try:
                    logging.info('Storing job logs: {}'.format(
                        job_db[job.metadata.name]['log']))
                    # NOTE(review): assumes job rows are keyed by the
                    # Kubernetes job name (``id_``) — confirm against the
                    # Job model.
                    Session.query(Job).filter_by(id_=job.metadata.name).\
                        update(dict(logs=job_db[job.metadata.name]['log']))
                    Session.commit()
                except Exception as e:
                    # Best-effort persistence: log and continue so the
                    # watch loop is not interrupted.
                    logging.debug(
                        'Could not retrieve'
                        ' logs for object: {}'.format(last_spawned_pod))
                    logging.debug('Exception: {}'.format(str(e)))
                logging.info('Cleaning job {} ...'.format(job.metadata.name))
                # Remove the finished job from the cluster and mark it
                # deleted locally so later events for it are ignored.
                k8s_delete_job(job)
                job_db[job.metadata.name]['deleted'] = True
        except client.rest.ApiException as e:
            # Stream/API errors are expected occasionally; the outer
            # ``while True`` reopens the watch.
            logging.debug(
                "Error while connecting to Kubernetes API: {}".format(e))
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.debug("Unexpected error: {}".format(e))
def watch_jobs_kubernetes(job_db):
    """Open stream connection to k8s apiserver to watch all jobs status.

    :param job_db: Dictionary which contains all current jobs.
    """
    # Loop forever: the watch stream ends periodically, so a new stream
    # request is opened each time the inner ``for`` finishes or fails.
    while True:
        logging.debug('Starting a new stream request to watch Jobs')
        try:
            w = watch.Watch()
            for event in w.stream(current_k8s_batchv1_api_client.
                                  list_job_for_all_namespaces):
                logging.info('New Job event received: {0}'.format(
                    event['type']))
                job = event['object']
                # Taking note of the remaining jobs since deletion might not
                # happen straight away.
                # Maps backend (Kubernetes) job name -> internal job id for
                # every job not yet deleted.
                remaining_jobs = dict()
                for job_id, job_dict in job_db.items():
                    if not job_db[job_id]['deleted']:
                        remaining_jobs[job_dict['backend_job_id']] = job_id
                if (not job_db.get(remaining_jobs.get(job.metadata.name)) or
                        job.metadata.name not in remaining_jobs):
                    # Ignore jobs not created by this specific instance
                    # or already deleted jobs.
                    continue
                job_id = remaining_jobs[job.metadata.name]
                kubernetes_job_id = job.metadata.name
                if job.status.succeeded:
                    logging.info('Job job_id: {}, kubernetes_job_id: {}'
                                 ' succeeded.'.format(job_id,
                                                      kubernetes_job_id))
                    job_db[job_id]['status'] = 'succeeded'
                elif (job.status.failed and
                        job.status.failed >= config.MAX_JOB_RESTARTS):
                    # Only mark failed once the restart budget is exhausted.
                    logging.info(
                        'Job job_id: {}, kubernetes_job_id: {}'
                        ' failed.'.format(job_id, kubernetes_job_id))
                    job_db[job_id]['status'] = 'failed'
                else:
                    # Job still running / retrying: nothing to do yet.
                    continue
                # Grab logs when job either succeeds or fails.
                logging.info('Getting last spawned pod for kubernetes'
                             ' job {}'.format(kubernetes_job_id))
                # The job may have spawned several pods (retries); take the
                # last one in the listing.
                last_spawned_pod = \
                    current_k8s_corev1_api_client.list_namespaced_pod(
                        namespace=job.metadata.namespace,
                        label_selector='job-name={job_name}'.format(
                            job_name=kubernetes_job_id)).items[-1]
                logging.info('Grabbing pod {} logs...'.format(
                    last_spawned_pod.metadata.name))
                job_db[job_id]['log'] = \
                    current_k8s_corev1_api_client.read_namespaced_pod_log(
                        namespace=last_spawned_pod.metadata.namespace,
                        name=last_spawned_pod.metadata.name)
                # Persist the captured logs for this job.
                store_logs(job_id=job_id, logs=job_db[job_id]['log'])
                logging.info(
                    'Cleaning Kubernetes job {} ...'.format(
                        kubernetes_job_id))
                # Remove the finished job from the cluster and mark it
                # deleted locally so later events for it are ignored.
                KubernetesJobManager.stop(kubernetes_job_id)
                job_db[job_id]['deleted'] = True
        except client.rest.ApiException as e:
            # Stream/API errors are expected occasionally; the outer
            # ``while True`` reopens the watch.
            logging.debug(
                "Error while connecting to Kubernetes API: {}".format(e))
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.debug("Unexpected error: {}".format(e))
def get_pods_by_status(self, status, namespace):
    """Get pod name list by status."""
    # Ask the API server only for pods in the requested phase.
    phase_selector = f"status.phase={status}"
    pod_list = current_k8s_corev1_api_client.list_namespaced_pod(
        namespace,
        field_selector=phase_selector,
    )
    names = []
    for pod in pod_list.items:
        names.append(pod.metadata.name)
    return names