def get_job_logs(self, job_pod):
    """Get job pod's containers' logs.

    Concatenates the logs of every terminated container (regular and
    init) of the given pod; for containers stuck in the waiting state,
    records the waiting error message instead of fetching logs.

    :param job_pod: Pod object whose ``status`` and ``metadata`` are read.
    :returns: String with all containers' logs, or ``None`` on error.
    """
    try:
        pod_logs = ""
        # Both ``container_statuses`` and ``init_container_statuses`` can
        # be ``None`` (e.g. while the pod is still being scheduled), so
        # default each to an empty list before concatenating. The original
        # code only guarded the init container statuses and crashed with
        # ``TypeError`` when the regular statuses were ``None``.
        container_statuses = (job_pod.status.container_statuses or []) + (
            job_pod.status.init_container_statuses or [])
        logging.info("Grabbing pod {} logs ...".format(
            job_pod.metadata.name))
        for container in container_statuses:
            if container.state.terminated:
                container_log = current_k8s_corev1_api_client.read_namespaced_pod_log(
                    namespace="default",
                    name=job_pod.metadata.name,
                    container=container.name,
                )
                pod_logs += "{}: :\n {}\n".format(
                    container.name, container_log)
            elif container.state.waiting:
                # A waiting container never produced logs; keep the
                # waiting message (e.g. image pull errors) instead.
                pod_logs += "Container {} failed, error: {}".format(
                    container.name, container.state.waiting.message)
        return pod_logs
    except client.rest.ApiException as e:
        logging.error(
            "Error while connecting to Kubernetes API: {}".format(e))
        return None
    except Exception as e:
        logging.error(traceback.format_exc())
        logging.error("Unexpected error: {}".format(e))
        return None
def get_container_logs(self, job_id):
    """Get job pod's containers' logs."""
    try:
        pod = current_k8s_corev1_api_client.read_namespaced_pod(
            namespace='default', name=job_id)
        # Init containers run first, so list them first when present.
        if pod.spec.init_containers:
            containers = pod.spec.init_containers + pod.spec.containers
        else:
            containers = pod.spec.containers
        log_chunks = []
        for container in containers:
            container_log = \
                current_k8s_corev1_api_client.read_namespaced_pod_log(
                    namespace='default', name=job_id,
                    container=container.name)
            log_chunks.append('{}: \n {} \n'.format(container.name,
                                                    container_log))
        return ''.join(log_chunks)
    except client.rest.ApiException as e:
        logging.error(
            "Error while connecting to Kubernetes API: {}".format(e))
        return None
    except Exception as e:
        logging.error(traceback.format_exc())
        logging.error("Unexpected error: {}".format(e))
        return None
def _delete_workflow_engine_pod(workflow):
    """Delete workflow engine pod.

    Saves the ``workflow-engine`` container logs into ``workflow.logs``
    before deleting the Kubernetes job owning the pod.

    :param workflow: Workflow whose engine pod should be deleted.
    :raises REANAWorkflowControllerError: If the Kubernetes API call fails.
    """
    try:
        jobs = current_k8s_corev1_api_client.list_namespaced_pod(
            namespace=REANA_RUNTIME_KUBERNETES_NAMESPACE,
        )
        for job in jobs.items:
            # The engine pod name embeds the workflow UUID.
            if str(workflow.id_) in job.metadata.name:
                workflow_engine_logs = current_k8s_corev1_api_client.read_namespaced_pod_log(
                    namespace=job.metadata.namespace,
                    name=job.metadata.name,
                    container="workflow-engine",
                )
                workflow.logs = (workflow.logs or "") + workflow_engine_logs + "\n"
                # "Background" propagation lets Kubernetes garbage-collect
                # dependent pods asynchronously.
                current_k8s_batchv1_api_client.delete_namespaced_job(
                    namespace=job.metadata.namespace,
                    propagation_policy="Background",
                    name=job.metadata.labels["job-name"],
                )
                break
    except ApiException as e:
        # Typo fixed in the error message ("cound" -> "could").
        raise REANAWorkflowControllerError(
            "Workflow engine pod could not be deleted {}.".format(e)
        )
    except Exception as e:
        logging.error(traceback.format_exc())
        logging.error("Unexpected error: {}".format(e))
def _delete_workflow_engine_pod(workflow):
    """Delete workflow engine pod.

    Saves the ``workflow-engine`` container logs into ``workflow.logs``
    before deleting the Kubernetes job owning the pod.

    :param workflow: Workflow whose engine pod should be deleted.
    :raises REANAWorkflowControllerError: If the Kubernetes API call fails.
    """
    try:
        jobs = current_k8s_corev1_api_client.list_namespaced_pod(
            namespace='default',
        )
        for job in jobs.items:
            # The engine pod name embeds the workflow UUID.
            if str(workflow.id_) in job.metadata.name:
                workflow_engine_logs = \
                    current_k8s_corev1_api_client.read_namespaced_pod_log(
                        namespace=job.metadata.namespace,
                        name=job.metadata.name,
                        container='workflow-engine')
                workflow.logs = \
                    (workflow.logs or '') + workflow_engine_logs + '\n'
                # "Background" propagation lets Kubernetes garbage-collect
                # dependent pods asynchronously.
                current_k8s_batchv1_api_client.delete_namespaced_job(
                    namespace='default',
                    propagation_policy="Background",
                    name=job.metadata.labels['job-name'])
                break
    except ApiException as e:
        # Typo fixed in the error message ("cound" -> "could").
        raise REANAWorkflowControllerError(
            "Workflow engine pod could not be deleted {}.".format(e))
    except Exception as e:
        logging.error(traceback.format_exc())
        logging.error("Unexpected error: {}".format(e))
def get_job_logs(self, job_pod):
    """Get job pod's containers' logs.

    Concatenates the logs of every terminated container (regular and
    init) of the given pod, including the termination reason when one is
    set; for containers stuck in the waiting state, records the waiting
    error message instead of fetching logs.

    :param job_pod: Pod object whose ``status`` and ``metadata`` are read.
    :returns: String with all containers' logs, or ``None`` on error.
    """
    try:
        pod_logs = ""
        container_statuses = (job_pod.status.container_statuses or []) + (
            job_pod.status.init_container_statuses or [])
        logging.info("Grabbing pod {} logs ...".format(
            job_pod.metadata.name))
        for container in container_statuses:
            if container.state.terminated:
                container_log = current_k8s_corev1_api_client.read_namespaced_pod_log(
                    namespace=REANA_RUNTIME_KUBERNETES_NAMESPACE,
                    name=job_pod.metadata.name,
                    container=container.name,
                )
                pod_logs += "{}: :\n {}\n".format(
                    container.name, container_log)
                # ``hasattr`` was always true for the generated client
                # model (the attribute exists even when unset), which
                # appended a spurious "None" line; only record the
                # termination reason when it is actually set.
                if container.state.terminated.reason:
                    pod_logs += "\n{}\n".format(
                        container.state.terminated.reason)
            elif container.state.waiting:
                # A waiting container never produced logs; keep the
                # waiting message (e.g. image pull errors) instead.
                pod_logs += "Container {} failed, error: {}".format(
                    container.name, container.state.waiting.message)
        return pod_logs
    except client.rest.ApiException as e:
        logging.error(
            "Error while connecting to Kubernetes API: {}".format(e))
        return None
    except Exception as e:
        logging.error(traceback.format_exc())
        logging.error("Unexpected error: {}".format(e))
        return None
def _get_workflow_engine_pod_logs(workflow: Workflow) -> str:
    """Fetch the ``workflow-engine`` container logs of the given workflow.

    Lists pods labelled with the workflow's UUID and returns the logs of
    the first pod whose name contains that UUID.

    NOTE(review): annotated ``-> str`` but falls through and implicitly
    returns ``None`` when no matching pod is found — confirm callers
    handle ``None``.

    :param workflow: Workflow whose engine pod logs should be fetched.
    :raises REANAWorkflowControllerError: If the Kubernetes API call fails.
    """
    try:
        pods = current_k8s_corev1_api_client.list_namespaced_pod(
            namespace=REANA_RUNTIME_KUBERNETES_NAMESPACE,
            label_selector=f"reana-run-batch-workflow-uuid={str(workflow.id_)}",
        )
        for pod in pods.items:
            # The engine pod name embeds the workflow UUID.
            if str(workflow.id_) in pod.metadata.name:
                return current_k8s_corev1_api_client.read_namespaced_pod_log(
                    namespace=pod.metadata.namespace,
                    name=pod.metadata.name,
                    container="workflow-engine",
                )
    except ApiException as e:
        raise REANAWorkflowControllerError(
            f"Workflow engine pod logs could not be fetched. Error: {e}")
def k8s_watch_jobs(job_db):
    """Open stream connection to k8s apiserver to watch all jobs status.

    Runs forever: on every job event, records success/failure in
    ``job_db``, grabs the last spawned pod's logs, stores them in the
    database (best effort) and deletes the finished job.

    :param job_db: Dictionary which contains all current jobs.
    :param config: configuration to connect to k8s apiserver.
    """
    while True:
        logging.debug('Starting a new stream request to watch Jobs')
        try:
            w = watch.Watch()
            for event in w.stream(current_k8s_batchv1_api_client.
                                  list_job_for_all_namespaces):
                logging.info('New Job event received: {0}'.format(
                    event['type']))
                job = event['object']
                # Taking note of the remaining jobs since deletion might
                # not happen straight away.
                remaining_jobs = [
                    j for j in job_db.keys()
                    if not job_db[j]['deleted']
                ]
                if (not job_db.get(job.metadata.name) or
                        job.metadata.name not in remaining_jobs):
                    # Ignore jobs not created by this specific instance
                    # or already deleted jobs.
                    continue
                elif job.status.succeeded:
                    logging.info('Job {} succeeded.'.format(
                        job.metadata.name))
                    job_db[job.metadata.name]['status'] = 'succeeded'
                elif (job.status.failed and
                        job.status.failed >= config.MAX_JOB_RESTARTS):
                    logging.info('Job {} failed.'.format(job.metadata.name))
                    job_db[job.metadata.name]['status'] = 'failed'
                else:
                    continue
                # Grab logs when job either succeeds or fails.
                logging.info('Getting last spawned pod for job {}'.format(
                    job.metadata.name))
                last_spawned_pod = \
                    current_k8s_corev1_api_client.list_namespaced_pod(
                        job.metadata.namespace,
                        label_selector='job-name={job_name}'.format(
                            job_name=job.metadata.name)).items[-1]
                logging.info('Grabbing pod {} logs...'.format(
                    last_spawned_pod.metadata.name))
                job_db[job.metadata.name]['log'] = \
                    current_k8s_corev1_api_client.read_namespaced_pod_log(
                        namespace=last_spawned_pod.metadata.namespace,
                        name=last_spawned_pod.metadata.name)
                # Store job logs; best effort — a DB failure must not stop
                # the watch loop.
                try:
                    logging.info('Storing job logs: {}'.format(
                        job_db[job.metadata.name]['log']))
                    Session.query(Job).filter_by(
                        id_=job.metadata.name).update(
                            dict(logs=job_db[job.metadata.name]['log']))
                    Session.commit()
                except Exception as e:
                    # Message fixed: the failure happens while *storing*
                    # the logs, not while retrieving them.
                    logging.debug(
                        'Could not store'
                        ' logs for object: {}'.format(last_spawned_pod))
                    logging.debug('Exception: {}'.format(str(e)))
                logging.info('Cleaning job {} ...'.format(
                    job.metadata.name))
                k8s_delete_job(job)
                job_db[job.metadata.name]['deleted'] = True
        except client.rest.ApiException as e:
            logging.debug(
                "Error while connecting to Kubernetes API: {}".format(e))
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.debug("Unexpected error: {}".format(e))
def watch_jobs_kubernetes(job_db):
    """Open stream connection to k8s apiserver to watch all jobs status.

    :param job_db: Dictionary which contains all current jobs.
    """
    while True:
        logging.debug('Starting a new stream request to watch Jobs')
        try:
            w = watch.Watch()
            for event in w.stream(current_k8s_batchv1_api_client.
                                  list_job_for_all_namespaces):
                logging.info('New Job event received: {0}'.format(
                    event['type']))
                job = event['object']
                # Taking note of the remaining jobs since deletion might not
                # happen straight away. Maps Kubernetes backend job ids to
                # internal job ids for jobs not yet deleted.
                remaining_jobs = dict()
                for job_id, job_dict in job_db.items():
                    if not job_db[job_id]['deleted']:
                        remaining_jobs[job_dict['backend_job_id']] = job_id
                if (not job_db.get(remaining_jobs.get(job.metadata.name)) or
                        job.metadata.name not in remaining_jobs):
                    # Ignore jobs not created by this specific instance
                    # or already deleted jobs.
                    continue
                job_id = remaining_jobs[job.metadata.name]
                kubernetes_job_id = job.metadata.name
                if job.status.succeeded:
                    logging.info('Job job_id: {}, kubernetes_job_id: {}'
                                 ' succeeded.'.format(job_id,
                                                      kubernetes_job_id))
                    job_db[job_id]['status'] = 'succeeded'
                elif (job.status.failed and
                        job.status.failed >= config.MAX_JOB_RESTARTS):
                    logging.info(
                        'Job job_id: {}, kubernetes_job_id: {} failed.'.format(
                            job_id, kubernetes_job_id))
                    job_db[job_id]['status'] = 'failed'
                else:
                    # Job still running; wait for a terminal event.
                    continue
                # Grab logs when job either succeeds or fails.
                logging.info('Getting last spawned pod for kubernetes'
                             ' job {}'.format(kubernetes_job_id))
                # A restarted job may have spawned several pods; take the
                # most recent one.
                last_spawned_pod = \
                    current_k8s_corev1_api_client.list_namespaced_pod(
                        namespace=job.metadata.namespace,
                        label_selector='job-name={job_name}'.format(
                            job_name=kubernetes_job_id)).items[-1]
                logging.info('Grabbing pod {} logs...'.format(
                    last_spawned_pod.metadata.name))
                job_db[job_id]['log'] = \
                    current_k8s_corev1_api_client.read_namespaced_pod_log(
                        namespace=last_spawned_pod.metadata.namespace,
                        name=last_spawned_pod.metadata.name)
                store_logs(job_id=job_id, logs=job_db[job_id]['log'])
                logging.info(
                    'Cleaning Kubernetes job {} ...'.format(
                        kubernetes_job_id))
                KubernetesJobManager.stop(kubernetes_job_id)
                job_db[job_id]['deleted'] = True
        except client.rest.ApiException as e:
            logging.debug(
                "Error while connecting to Kubernetes API: {}".format(e))
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.debug("Unexpected error: {}".format(e))