def handle(self, *args, **options):
    """Run the statuses monitor in an endless loop.

    Reads ``log_sleep_interval`` from ``options``, writes a startup message
    to ``self.stdout`` and then repeatedly calls ``statuses.run`` with a
    ``role=...,type=...`` label selector built from the settings.

    An ``ApiException`` is logged and followed by a sleep of
    ``log_sleep_interval``; any other exception is logged with its traceback
    and the loop resumes immediately. The function never returns normally.
    """
    log_sleep_interval = options['log_sleep_interval']
    self.stdout.write(
        "Started a new statuses monitor with, "
        "log sleep interval: `{}`.".format(log_sleep_interval),
        ending='\n')
    k8s_manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    while True:
        try:
            role_label = settings.ROLE_LABELS_WORKER
            type_label = settings.TYPE_LABELS_EXPERIMENT
            label_selector = 'role={},type={}'.format(role_label, type_label)
            statuses.run(k8s_manager,
                         job_container_name=settings.JOB_CONTAINER_NAME,
                         experiment_type_label=type_label,
                         label_selector=label_selector)
        except ApiException as e:
            # Pass the exception as a logger argument so the message is only
            # formatted when the record is actually emitted.
            statuses.logger.error(
                "Exception when calling CoreV1Api->list_namespaced_pod: %s\n", e)
            time.sleep(log_sleep_interval)
        except Exception as e:
            # Top-level boundary: keep the monitor alive, but record the
            # traceback.
            statuses.logger.exception("Unhandled exception occurred %s\n", e)
def handle(self, *args, **options):
    """Start the namespace monitor and keep it running.

    Writes a startup message, then asks ``get_cluster_or_wait`` for a
    cluster and returns early when it yields nothing. Otherwise
    ``monitor.run`` is called in an endless loop:

    * ``ApiException`` is logged, followed by a sleep of the configured
      interval;
    * ``ValueError`` is logged and the loop resumes without sleeping;
    * any other exception is logged with its traceback.
    """
    interval = options['log_sleep_interval']
    banner = (
        "Started a new namespace monitor with, "
        "log sleep interval: `{}`.".format(interval)
    )
    self.stdout.write(banner, ending='\n')

    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    cluster = self.get_cluster_or_wait(interval)
    if not cluster:
        # End process
        return

    api_failure = "Exception when calling CoreV1Api->list_event_for_all_namespaces: %s\n"
    while True:
        try:
            monitor.run(manager, cluster)
        except ApiException as err:
            monitor.logger.error(api_failure, err)
            time.sleep(interval)
        except ValueError as err:
            monitor.logger.error(api_failure, err)
        except Exception as err:
            monitor.logger.exception("Unhandled exception occurred: %s\n", err)
def handle(self, *args, **options) -> None:
    """Run the monitor loop, recovering from API and database failures.

    Sleeps once for ``options['sleep_interval']`` before doing anything
    else, writes a startup message, then calls ``monitor.run`` forever.
    API and retry errors and a dropped database connection are each logged
    as warnings and followed by a sleep. Any other exception is logged with
    its traceback and the loop resumes immediately.
    """
    pause = options['sleep_interval']
    time.sleep(pause)

    started = (
        "Started a new statuses monitor with, "
        "log sleep interval: `{}`.".format(pause)
    )
    self.stdout.write(started, ending='\n')

    namespace = conf.get(K8S_NAMESPACE)
    manager = K8SManager(namespace=namespace, in_cluster=True)
    while True:
        try:
            monitor.run(manager)
        except (ApiException, ValueError, MaxRetryError) as err:
            monitor.logger.warning(
                "Exception when calling CoreV1Api->list_namespaced_pod: %s\n", err)
            time.sleep(pause)
        except InterfaceError:
            # The remote peer may close the connection (for example on a
            # timeout or a database restart) and Django does not recover
            # from that by itself. Dropping the dead connection here makes
            # Django reconnect on the next query.
            connection.close()
            monitor.logger.warning(
                "Database connection is already closed by peer, discard old connection\n")
            time.sleep(pause)
        except Exception as err:
            monitor.logger.exception("Unhandled exception occurred %s\n", err)
def process_logs(build: 'BuildJob', temp: bool = True) -> None:
    """Collect the log lines of a build job's pod and pass them to ``safe_log_job``.

    Args:
        build: build job; its ``pod_id`` selects the pod and its
            ``unique_name`` is used as the job name.
        temp: forwarded unchanged to ``safe_log_job``.
    """
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    lines = base.process_logs(
        k8s_manager=manager,
        pod_id=build.pod_id,
        container_job_name=conf.get('CONTAINER_NAME_DOCKERIZER_JOB'),
    )
    safe_log_job(
        job_name=build.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )
def update_system_info():
    """Store the Kubernetes API version on the cluster record when it changed.

    Asks the in-cluster ``K8SManager`` for the version, loads the cluster
    through ``Cluster.load()`` and saves it only when a version was
    reported and it differs from the stored ``version_api``.
    """
    k8s_manager = K8SManager(in_cluster=True)
    version_api = k8s_manager.get_version()
    cluster = Cluster.load()
    # Skip the update when no version was reported, so an empty result does
    # not overwrite the previously stored value.
    if version_api and cluster.version_api != version_api:
        cluster.version_api = version_api
        cluster.save()
def stream_logs(build: 'BuildJob') -> Iterable[str]:
    """Return the log stream of the pod that runs the given build job.

    Args:
        build: build job whose ``pod_id`` identifies the pod.

    Returns:
        Whatever ``base.stream_logs`` returns for that pod and the build
        jobs container.
    """
    namespace = conf.get(K8S_NAMESPACE)
    manager = K8SManager(namespace=namespace, in_cluster=True)
    pod_id = build.pod_id
    container_name = conf.get(CONTAINER_NAME_BUILD_JOBS)
    return base.stream_logs(k8s_manager=manager,
                            pod_id=pod_id,
                            container_job_name=container_name)
def handle(self, *args, **options):
    """Run the logs sidecar monitor for a single experiment job pod.

    Reads ``pod_id`` and ``log_sleep_interval`` from ``options``, writes a
    startup message, then asks ``monitor.can_log`` whether the job is
    running. When it is not, an info message is logged and the function
    returns. Otherwise ``monitor.run_for_experiment_job`` is called with
    the identifiers taken from the labels that ``can_log`` returned.
    """
    pod_id = options['pod_id']
    log_sleep_interval = options['log_sleep_interval']
    # The first literal ends with a space so that the two concatenated
    # parts do not run together in the printed message.
    self.stdout.write(
        "Started a new jobs logs / sidecar monitor with, pod_id: `{}` container_job_name: `{}` "
        "log sleep interval: `{}`".format(
            pod_id, settings.CONTAINER_NAME_EXPERIMENT_JOB, log_sleep_interval),
        ending='\n')
    k8s_manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    is_running, labels = monitor.can_log(k8s_manager, pod_id, log_sleep_interval)
    if not is_running:
        monitor.logger.info('Job is not running anymore.')
        return

    monitor.run_for_experiment_job(
        k8s_manager=k8s_manager,
        pod_id=pod_id,
        experiment_uuid=labels.experiment_uuid.hex,
        experiment_name=labels.experiment_name,
        job_uuid=labels.job_uuid.hex,
        task_type=labels.task_type,
        task_idx=labels.task_idx,
        container_job_name=settings.CONTAINER_NAME_EXPERIMENT_JOB)
    monitor.logger.info('Finished logging')
def stream_logs(pod_id, task_type, task_id):
    """Return the log stream of one experiment job pod.

    Args:
        pod_id: identifier of the pod to read from.
        task_type: forwarded to ``base.stream_logs`` as ``task_type``.
        task_id: forwarded to ``base.stream_logs`` as ``task_idx``.

    Returns:
        Whatever ``base.stream_logs`` returns.
    """
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    container_name = settings.CONTAINER_NAME_EXPERIMENT_JOB
    return base.stream_logs(
        k8s_manager=manager,
        pod_id=pod_id,
        container_job_name=container_name,
        task_type=task_type,
        task_idx=task_id,
    )
def process_logs(build, temp=True):
    """Collect the log lines of a build's pod and pass them to ``safe_log_job``.

    Args:
        build: object providing ``pod_id`` and ``unique_name``.
        temp: forwarded unchanged to ``safe_log_job``.
    """
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    lines = base.process_logs(
        k8s_manager=manager,
        pod_id=build.pod_id,
        container_job_name=settings.CONTAINER_NAME_DOCKERIZER_JOB,
    )
    safe_log_job(
        job_name=build.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )
def update_system_nodes():
    """Synchronise the stored cluster nodes with the nodes reported by Kubernetes.

    The nodes returned by ``K8SManager.list_nodes()`` are matched by name
    against ``cluster.nodes``:

    * a stored node that is no longer reported and is still flagged as
      current is saved with ``is_current = False``;
    * a reported node with no stored counterpart is created;
    * a stored node that is still reported gets the fields from
      ``ClusterNode.from_node_item`` and is flagged as current again.
    """
    k8s_manager = K8SManager(in_cluster=True)
    nodes = k8s_manager.list_nodes()
    cluster = Cluster.load()
    nodes_to_update = {}
    # Starts with every reported node; matched entries are popped below, so
    # what remains are the nodes that still have to be created.
    nodes_to_create = {node.metadata.name: node for node in nodes}
    deprecated_nodes = []
    for node in cluster.nodes.all():
        if node.name in nodes_to_create:
            nodes_to_update[node.name] = (node, nodes_to_create.pop(node.name))
        elif node.is_current:
            # Nodes already flagged as not current need no further write.
            deprecated_nodes.append(node)

    for node in deprecated_nodes:
        node.is_current = False
        node.save()

    for node in nodes_to_create.values():
        node_dict = ClusterNode.from_node_item(node)
        node_dict['cluster'] = cluster
        ClusterNode.objects.create(**node_dict)

    for current_node, new_node in nodes_to_update.values():
        node_dict = ClusterNode.from_node_item(new_node)
        for k, v in node_dict.items():
            setattr(current_node, k, v)
        # A node that was flagged as gone and is reported again must become
        # current again; otherwise it stays hidden.
        current_node.is_current = True
        current_node.save()
def process_experiment_jobs_logs(experiment, temp=True):
    """Process the logs of every job of an experiment with one shared manager.

    Args:
        experiment: object whose ``jobs.all()`` yields the experiment jobs.
        temp: forwarded unchanged to ``process_experiment_job_logs``.
    """
    namespace = conf.get('K8S_NAMESPACE')
    # One manager is built up front and reused for every job.
    shared_manager = K8SManager(namespace=namespace, in_cluster=True)
    for job in experiment.jobs.all():
        process_experiment_job_logs(
            experiment_job=job,
            temp=temp,
            k8s_manager=shared_manager,
        )
def get_registry_host():
    """Return ``settings.REGISTRY_HOST``, resolving it on first use.

    When the setting does not exist yet, the registry service is looked up
    through Kubernetes and ``<cluster ip>:<REGISTRY_PORT>`` is stored on
    ``settings`` so later calls skip the lookup.
    """
    if hasattr(settings, 'REGISTRY_HOST'):
        return settings.REGISTRY_HOST

    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    service = manager.get_service(name=settings.REGISTRY_HOST_NAME)
    cluster_ip = service.spec.cluster_ip
    settings.REGISTRY_HOST = '{}:{}'.format(cluster_ip, settings.REGISTRY_PORT)
    return settings.REGISTRY_HOST
def stream_logs(build):
    """Return the log stream of the dockerizer pod that belongs to ``build``.

    The pod id is derived from ``JOB_NAME_FORMAT`` using the dockerizer job
    name and the hex form of the build's uuid.
    """
    build_uuid = build.uuid.hex
    pod_id = JOB_NAME_FORMAT.format(name=DOCKERIZER_JOB_NAME, job_uuid=build_uuid)
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    container_name = settings.CONTAINER_NAME_DOCKERIZER_JOB
    return base.stream_logs(k8s_manager=manager,
                            pod_id=pod_id,
                            container_job_name=container_name)
def process_logs(job, temp=True):
    """Collect the log lines of a job's pod and pass them to ``safe_log_job``.

    The pod id is derived from ``JOB_NAME_FORMAT`` using ``JOB_NAME`` and
    the hex form of the job's uuid.

    Args:
        job: object providing ``uuid`` and ``unique_name``.
        temp: forwarded unchanged to ``safe_log_job``.
    """
    job_uuid = job.uuid.hex
    pod_id = JOB_NAME_FORMAT.format(name=JOB_NAME, job_uuid=job_uuid)
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    lines = base.process_logs(
        k8s_manager=manager,
        pod_id=pod_id,
        container_job_name=settings.CONTAINER_NAME_JOB,
    )
    safe_log_job(
        job_name=job.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )
def stream_logs(pod_id: str, task_type: str, task_id: int) -> Iterable[str]:
    """Return the log stream of one experiment job pod.

    Args:
        pod_id: identifier of the pod to read from.
        task_type: forwarded to ``base.stream_logs`` as ``task_type``.
        task_id: forwarded to ``base.stream_logs`` as ``task_idx``.

    Returns:
        Whatever ``base.stream_logs`` returns.
    """
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    container_name = conf.get('CONTAINER_NAME_EXPERIMENT_JOB')
    return base.stream_logs(k8s_manager=manager,
                            pod_id=pod_id,
                            container_job_name=container_name,
                            task_type=task_type,
                            task_idx=task_id)
def update_system_info():
    """Store a changed Kubernetes API version on the cluster and audit it.

    Nothing is written when no version is reported or when the reported
    version equals the stored one. Otherwise the cluster is saved and a
    ``CLUSTER_UPDATED`` event is recorded.
    """
    manager = K8SManager(in_cluster=True)
    reported_version = manager.get_version()
    cluster = Cluster.load()

    if not reported_version:
        return
    if cluster.version_api != reported_version:
        cluster.version_api = reported_version
        cluster.save()
        auditor.record(
            event_type=CLUSTER_UPDATED,
            instance=cluster,
            is_upgrade=settings.CHART_IS_UPGRADE,
        )
def main():
    """Entry point of the logs sidecar.

    Reads the pod id, job id and routing key from the environment, calls
    ``can_log`` for the pod, then hands a plain-text ``Publisher`` to
    ``run``. A missing environment variable raises ``KeyError``.
    """
    env = os.environ
    pod_id = env['POLYAXON_POD_ID']
    job_id = env['POLYAXON_JOB_ID']
    manager = K8SManager(namespace=settings.NAMESPACE, in_cluster=True)
    can_log(manager, pod_id)

    # TODO: add experiment id and job id to the routing key
    routing_key = env['POLYAXON_ROUTING_KEYS_LOGS_SIDECARS']
    publisher = Publisher(routing_key, content_type='text/plain')
    run(manager, publisher, pod_id, job_id)
    logger.debug('Finished logging')
def stream_logs(experiment):
    """Return the log stream of the experiment's master pod (task index 0).

    Args:
        experiment: object whose ``uuid`` is used to build the pod id.

    Returns:
        Whatever ``base.stream_logs`` returns.
    """
    name_parts = {
        'task_type': TaskType.MASTER,  # We default to master
        'task_idx': 0,
        'experiment_uuid': experiment.uuid.hex,
    }
    pod_id = EXPERIMENT_JOB_NAME_FORMAT.format(**name_parts)
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    return base.stream_logs(k8s_manager=manager,
                            pod_id=pod_id,
                            container_job_name=settings.CONTAINER_NAME_EXPERIMENT_JOB)
def stream_logs(experiment: 'Experiment') -> Iterable[str]:
    """Return the log stream of the experiment's default-role pod (task index 0).

    The container name is resolved from the experiment's backend and
    framework through ``get_experiment_job_container_name``.

    Args:
        experiment: experiment whose ``default_job_role`` and ``uuid`` are
            used to build the pod id.

    Returns:
        Whatever ``base.stream_logs`` returns.
    """
    name_parts = {
        'task_type': experiment.default_job_role,
        'task_idx': 0,
        'experiment_uuid': experiment.uuid.hex,
    }
    pod_id = EXPERIMENT_JOB_NAME_FORMAT.format(**name_parts)
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    container_name = get_experiment_job_container_name(
        backend=experiment.backend,
        framework=experiment.framework,
    )
    return base.stream_logs(
        k8s_manager=manager,
        pod_id=pod_id,
        container_job_name=container_name,
    )
def stream_logs(experiment: 'Experiment') -> Iterable[str]:
    """Return the log stream of the experiment's master pod (task index 0).

    Args:
        experiment: experiment whose ``uuid`` is used to build the pod id.

    Returns:
        Whatever ``base.stream_logs`` returns.
    """
    name_parts = {
        'task_type': TaskType.MASTER,  # We default to master
        'task_idx': 0,
        'experiment_uuid': experiment.uuid.hex,
    }
    pod_id = EXPERIMENT_JOB_NAME_FORMAT.format(**name_parts)
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    container_name = conf.get('CONTAINER_NAME_EXPERIMENT_JOB')
    return base.stream_logs(k8s_manager=manager,
                            pod_id=pod_id,
                            container_job_name=container_name)
def process_logs(job: 'Job', temp: bool = True) -> None:
    """Collect the log lines of a job's pod and pass them to ``safe_log_job``.

    Args:
        job: job providing ``pod_id`` and ``unique_name``.
        temp: forwarded unchanged to ``safe_log_job``.
    """
    namespace = conf.get(K8S_NAMESPACE)
    manager = K8SManager(namespace=namespace, in_cluster=True)
    pod_id = job.pod_id
    container_name = conf.get(CONTAINER_NAME_JOBS)
    lines = base.process_logs(k8s_manager=manager,
                              pod_id=pod_id,
                              container_job_name=container_name)
    safe_log_job(
        job_name=job.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )
def process_logs(job, temp=True):
    """Collect the log lines of a job's pod and pass them to ``safe_log_job``.

    Args:
        job: object providing ``pod_id`` and ``unique_name``.
        temp: forwarded unchanged to ``safe_log_job``.
    """
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    pod_id = job.pod_id
    container_name = conf.get('CONTAINER_NAME_JOB')
    lines = base.process_logs(k8s_manager=manager,
                              pod_id=pod_id,
                              container_job_name=container_name)
    safe_log_job(
        job_name=job.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )
def main():
    """Entry point of the namespace events monitor.

    Builds an in-cluster ``K8SManager`` and a ``Publisher`` whose routing
    key comes from ``POLYAXON_ROUTING_KEYS_EVENTS_NAMESPACE``, then calls
    ``run`` forever. An ``ApiException`` is logged and followed by a sleep
    of ``settings.LOG_SLEEP_INTERVAL``; any other exception is logged with
    its traceback and the loop resumes immediately.

    Raises:
        KeyError: when the routing key environment variable is not set.
    """
    k8s_manager = K8SManager(namespace=settings.NAMESPACE, in_cluster=True)
    publisher = Publisher(os.environ['POLYAXON_ROUTING_KEYS_EVENTS_NAMESPACE'])
    while True:
        try:
            run(k8s_manager, publisher)
        except ApiException as e:
            # Pass the exception as a logger argument so the message is only
            # formatted when the record is actually emitted.
            logger.error(
                "Exception when calling CoreV1Api->list_event_for_all_namespaces: %s\n", e)
            time.sleep(settings.LOG_SLEEP_INTERVAL)
        except Exception as e:
            # Top-level boundary: keep the monitor alive, but record the
            # traceback.
            logger.exception("Unhandled exception occurred: %s\n", e)
def update_system_nodes() -> None:
    """Synchronise stored cluster nodes with Kubernetes and audit every change.

    Stored nodes are matched by name against the nodes reported by
    ``K8SManager.list_nodes()``:

    * stored, still current, but no longer reported: flagged as not
      current and recorded as ``CLUSTER_NODE_DELETED``;
    * reported but not stored: created and recorded as
      ``CLUSTER_NODE_CREATED``;
    * stored and reported: saved and recorded as ``CLUSTER_NODE_UPDATED``
      only when a field differs or the node was not flagged as current.

    When any of the above wrote something, the cluster resources are
    reloaded and a ``CLUSTER_RESOURCES_UPDATED`` event is recorded.
    """
    manager = K8SManager(in_cluster=True)
    reported = manager.list_nodes()
    cluster = Cluster.load()

    matched = {}
    # Starts with every reported node; entries matched to a stored node are
    # popped, so what remains are the nodes that still have to be created.
    unmatched = {item.metadata.name: item for item in reported}
    vanished = []
    for stored in cluster.nodes.all():
        if stored.name in unmatched:
            matched[stored.name] = (stored, unmatched.pop(stored.name))
        elif stored.is_current:
            vanished.append(stored)

    anything_changed = False

    for stored in vanished:
        stored.is_current = False
        stored.save()
        anything_changed = True
        auditor.record(event_type=CLUSTER_NODE_DELETED, instance=stored)

    for item in unmatched.values():
        fields = ClusterNode.from_node_item(item)
        fields['cluster'] = cluster
        created = ClusterNode.objects.create(**fields)
        anything_changed = True
        auditor.record(event_type=CLUSTER_NODE_CREATED, instance=created)

    for stored, item in matched.values():
        fields = ClusterNode.from_node_item(item)
        dirty = False
        for attr, value in fields.items():
            if value != getattr(stored, attr):
                setattr(stored, attr, value)
                dirty = True
        if not stored.is_current:
            # The node was flagged as gone earlier and is reported again.
            stored.is_current = True
            dirty = True
        if not dirty:
            continue
        stored.save()
        anything_changed = True
        auditor.record(event_type=CLUSTER_NODE_UPDATED, instance=stored)

    if not anything_changed:
        return

    cluster = get_cluster_resources()
    auditor.record(
        event_type=CLUSTER_RESOURCES_UPDATED,
        instance=cluster,
        n_nodes=cluster.n_nodes,
        memory=round(cluster.memory / (1000**3), 2),
        n_cpus=cluster.n_cpus,
        n_gpus=cluster.n_gpus,
    )
def process_logs(experiment, temp=True):
    """Collect the master pod's log lines and pass them to ``safe_log_experiment``.

    Args:
        experiment: object providing ``uuid`` and ``unique_name``.
        temp: forwarded unchanged to ``safe_log_experiment``.
    """
    name_parts = {
        'task_type': TaskType.MASTER,  # We default to master
        'task_idx': 0,
        'experiment_uuid': experiment.uuid.hex,
    }
    pod_id = EXPERIMENT_JOB_NAME_FORMAT.format(**name_parts)
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    lines = base.process_logs(k8s_manager=manager,
                              pod_id=pod_id,
                              container_job_name=settings.CONTAINER_NAME_EXPERIMENT_JOB)
    safe_log_experiment(
        experiment_name=experiment.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )
def process_logs(experiment: 'Experiment', temp: bool = True) -> None:
    """Collect the default-role pod's log lines and pass them to ``safe_log_experiment``.

    The container name is resolved from the experiment's backend and
    framework through ``get_experiment_job_container_name``.

    Args:
        experiment: experiment providing ``default_job_role``, ``uuid``,
            ``backend``, ``framework`` and ``unique_name``.
        temp: forwarded unchanged to ``safe_log_experiment``.
    """
    name_parts = {
        'task_type': experiment.default_job_role,
        'task_idx': 0,
        'experiment_uuid': experiment.uuid.hex,
    }
    pod_id = EXPERIMENT_JOB_NAME_FORMAT.format(**name_parts)
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    container_name = get_experiment_job_container_name(
        backend=experiment.backend,
        framework=experiment.framework,
    )
    lines = base.process_logs(
        k8s_manager=manager,
        pod_id=pod_id,
        container_job_name=container_name,
    )
    safe_log_experiment(
        experiment_name=experiment.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )
def handle(self, *args, **options):
    """Run the statuses monitor in an endless loop.

    Writes a startup message and then calls ``statuses.run`` forever. An
    ``ApiException`` is logged and followed by a sleep of
    ``options['log_sleep_interval']``; any other exception is logged with
    its traceback and the loop resumes immediately.
    """
    interval = options['log_sleep_interval']
    banner = (
        "Started a new statuses monitor with, "
        "log sleep interval: `{}`.".format(interval)
    )
    self.stdout.write(banner, ending='\n')

    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    while True:
        try:
            statuses.run(manager)
        except ApiException as err:
            statuses.logger.error(
                "Exception when calling CoreV1Api->list_namespaced_pod: %s\n", err)
            time.sleep(interval)
        except Exception as err:
            statuses.logger.exception("Unhandled exception occurred %s\n", err)
def process_logs(experiment_job, temp=True, k8s_manager=None):
    """Collect one experiment job's log lines and pass them to ``safe_log_experiment_job``.

    Args:
        experiment_job: object providing ``role``, ``sequence``, ``pod_id``
            and ``unique_name``.
        temp: forwarded unchanged to ``safe_log_experiment_job``.
        k8s_manager: manager to reuse; a new in-cluster one is created when
            a falsy value is given.
    """
    role = experiment_job.role
    sequence = experiment_job.sequence
    if not k8s_manager:
        k8s_manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    lines = base.process_logs(k8s_manager=k8s_manager,
                              pod_id=experiment_job.pod_id,
                              container_job_name=settings.CONTAINER_NAME_EXPERIMENT_JOB,
                              task_type=role,
                              task_idx=sequence)
    safe_log_experiment_job(
        experiment_job_name=experiment_job.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )
def process_logs(experiment_job: 'ExperimentJob',
                 temp: bool = True,
                 k8s_manager: 'K8SManager' = None) -> None:
    """Collect one experiment job's log lines and pass them to ``safe_log_experiment_job``.

    Args:
        experiment_job: job providing ``role``, ``sequence``, ``pod_id``
            and ``unique_name``.
        temp: forwarded unchanged to ``safe_log_experiment_job``.
        k8s_manager: manager to reuse; a new in-cluster one is created when
            a falsy value is given.
    """
    role = experiment_job.role
    sequence = experiment_job.sequence
    if not k8s_manager:
        namespace = conf.get('K8S_NAMESPACE')
        k8s_manager = K8SManager(namespace=namespace, in_cluster=True)
    lines = base.process_logs(k8s_manager=k8s_manager,
                              pod_id=experiment_job.pod_id,
                              container_job_name=conf.get('CONTAINER_NAME_EXPERIMENT_JOB'),
                              task_type=role,
                              task_idx=sequence)
    safe_log_experiment_job(
        experiment_job_name=experiment_job.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )
def process_logs(experiment_job: 'ExperimentJob',
                 temp: bool = True,
                 k8s_manager: 'K8SManager' = None) -> None:
    """Collect one experiment job's log lines and pass them to ``safe_log_experiment_job``.

    The container name is resolved from the backend and framework of the
    job's experiment through ``get_experiment_job_container_name``.

    Args:
        experiment_job: job providing ``role``, ``sequence``, ``pod_id``,
            ``experiment`` and ``unique_name``.
        temp: forwarded unchanged to ``safe_log_experiment_job``.
        k8s_manager: manager to reuse; a new in-cluster one is created when
            a falsy value is given.
    """
    role = experiment_job.role
    sequence = experiment_job.sequence
    if not k8s_manager:
        namespace = conf.get(K8S_NAMESPACE)
        k8s_manager = K8SManager(namespace=namespace, in_cluster=True)
    container_name = get_experiment_job_container_name(
        backend=experiment_job.experiment.backend,
        framework=experiment_job.experiment.framework,
    )
    lines = base.process_logs(
        k8s_manager=k8s_manager,
        pod_id=experiment_job.pod_id,
        container_job_name=container_name,
        task_type=role,
        task_idx=sequence,
    )
    safe_log_experiment_job(
        experiment_job_name=experiment_job.unique_name,
        log_lines=lines,
        temp=temp,
        append=False,
    )