Beispiel #1
0
 def handle(self, *args, **options):
     """Run the statuses monitor for experiment worker pods until killed.

     Reads ``log_sleep_interval`` from ``options``, announces the start on
     stdout, builds an in-cluster ``K8SManager`` for
     ``settings.K8S_NAMESPACE`` and then calls ``statuses.run`` in an
     endless loop with a ``role=...,type=...`` label selector built from
     ``settings.ROLE_LABELS_WORKER`` and ``settings.TYPE_LABELS_EXPERIMENT``.

     An ``ApiException`` is logged as an error and followed by
     ``time.sleep(log_sleep_interval)``; any other exception is logged with
     its traceback and the loop retries immediately.
     """
     log_sleep_interval = options['log_sleep_interval']
     self.stdout.write(
         "Started a new statuses monitor with, "
         "log sleep interval: `{}`.".format(log_sleep_interval),
         ending='\n')
     k8s_manager = K8SManager(namespace=settings.K8S_NAMESPACE,
                              in_cluster=True)
     while True:
         try:
             role_label = settings.ROLE_LABELS_WORKER
             type_label = settings.TYPE_LABELS_EXPERIMENT
             label_selector = 'role={},type={}'.format(
                 role_label, type_label)
             statuses.run(k8s_manager,
                          job_container_name=settings.JOB_CONTAINER_NAME,
                          experiment_type_label=type_label,
                          label_selector=label_selector)
         except ApiException as e:
             # Pass the exception as a logger argument so formatting is
             # deferred to the logging framework.
             statuses.logger.error(
                 "Exception when calling CoreV1Api->list_namespaced_pod: %s\n",
                 e)
             time.sleep(log_sleep_interval)
         except Exception as e:
             statuses.logger.exception("Unhandled exception occurred %s\n",
                                       e)
Beispiel #2
0
    def handle(self, *args, **options):
        """Start the namespace monitor and keep it running.

        Reads ``log_sleep_interval`` from ``options``, prints a start-up
        line, creates an in-cluster ``K8SManager`` for the configured
        ``K8S_NAMESPACE`` and obtains the cluster through
        ``self.get_cluster_or_wait``. If no cluster is returned the command
        ends; otherwise ``monitor.run`` is called in an endless loop.

        ``ApiException`` and ``ValueError`` are logged as errors (only the
        former is followed by a sleep of ``log_sleep_interval``); any other
        exception is logged with its traceback.
        """
        interval = options['log_sleep_interval']
        banner = ("Started a new namespace monitor with, "
                  "log sleep interval: `{}`.").format(interval)
        self.stdout.write(banner, ending='\n')

        manager = K8SManager(namespace=conf.get('K8S_NAMESPACE'),
                             in_cluster=True)
        cluster = self.get_cluster_or_wait(interval)
        if not cluster:
            # End process
            return

        api_error = "Exception when calling CoreV1Api->list_event_for_all_namespaces: %s\n"
        while True:
            try:
                monitor.run(manager, cluster)
            except (ApiException, ValueError) as err:
                monitor.logger.error(api_error, err)
                # Only API failures are followed by a pause.
                if isinstance(err, ApiException):
                    time.sleep(interval)
            except Exception as err:
                monitor.logger.exception("Unhandled exception occurred: %s\n",
                                         err)
Beispiel #3
0
 def handle(self, *args, **options) -> None:
     """Run the statuses monitor loop after an initial delay.

     Sleeps once for ``options['sleep_interval']``, prints a start-up line,
     creates an in-cluster ``K8SManager`` and then calls ``monitor.run``
     forever. ``ApiException``, ``ValueError`` and ``MaxRetryError`` are
     logged as warnings; an ``InterfaceError`` closes the database
     connection and logs a warning. Both cases are followed by a sleep of
     ``sleep_interval``. Any other exception is logged with its traceback
     and the loop retries immediately.
     """
     pause = options['sleep_interval']
     time.sleep(pause)
     startup_line = ("Started a new statuses monitor with, "
                     "log sleep interval: `{}`.").format(pause)
     self.stdout.write(startup_line, ending='\n')
     manager = K8SManager(namespace=conf.get(K8S_NAMESPACE), in_cluster=True)
     while True:
         try:
             monitor.run(manager)
         except (ApiException, ValueError, MaxRetryError) as err:
             monitor.logger.warning(
                 "Exception when calling CoreV1Api->list_namespaced_pod: %s\n", err)
         except InterfaceError:
             # In some cases such as timeout or database restart, the
             # connection is closed by the remote peer and Django cannot
             # recover on its own. Closing the dead connection here makes
             # Django reconnect on the next query.
             connection.close()
             monitor.logger.warning(
                 "Database connection is already closed by peer, discard old connection\n")
         except Exception as err:
             monitor.logger.exception("Unhandled exception occurred %s\n", err)
             continue
         else:
             continue
         # Reached only from the two recoverable branches above.
         time.sleep(pause)
Beispiel #4
0
def process_logs(build: 'BuildJob', temp: bool = True) -> None:
    """Fetch the log lines of a build job's pod and hand them to ``safe_log_job``.

    The lines come from ``base.process_logs`` using an in-cluster
    ``K8SManager`` for the configured ``K8S_NAMESPACE`` and the container
    name configured under ``CONTAINER_NAME_DOCKERIZER_JOB``. They are passed
    on with ``append=False`` and the caller's ``temp`` flag.
    """
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    lines = base.process_logs(
        k8s_manager=manager,
        pod_id=build.pod_id,
        container_job_name=conf.get('CONTAINER_NAME_DOCKERIZER_JOB'))

    safe_log_job(job_name=build.unique_name,
                 log_lines=lines,
                 temp=temp,
                 append=False)
Beispiel #5
0
def update_system_info():
    """Store the Kubernetes API version on the cluster record when it changes.

    Asks an in-cluster ``K8SManager`` for the version, loads the cluster via
    ``Cluster.load()`` and saves it only when a version was actually
    returned and it differs from the stored ``version_api``.
    """
    k8s_manager = K8SManager(in_cluster=True)
    version_api = k8s_manager.get_version()
    cluster = Cluster.load()
    # A falsy result (e.g. None) must not overwrite the stored version.
    if version_api and cluster.version_api != version_api:
        cluster.version_api = version_api
        cluster.save()
Beispiel #6
0
def stream_logs(build: 'BuildJob') -> Iterable[str]:
    """Return the result of ``base.stream_logs`` for the pod of ``build``.

    Uses an in-cluster ``K8SManager`` for the configured namespace and the
    container name configured under ``CONTAINER_NAME_BUILD_JOBS``.
    """
    namespace = conf.get(K8S_NAMESPACE)
    manager = K8SManager(namespace=namespace, in_cluster=True)
    pod_id = build.pod_id
    container_name = conf.get(CONTAINER_NAME_BUILD_JOBS)
    return base.stream_logs(k8s_manager=manager,
                            pod_id=pod_id,
                            container_job_name=container_name)
Beispiel #7
0
    def handle(self, *args, **options):
        """Run the logs / sidecar monitor for a single experiment job pod.

        Reads ``pod_id`` and ``log_sleep_interval`` from ``options``, prints
        a start-up line and creates an in-cluster ``K8SManager`` for
        ``settings.K8S_NAMESPACE``. ``monitor.can_log`` returns a running
        flag and the pod labels; when the job is not running the command
        logs that and returns. Otherwise ``monitor.run_for_experiment_job``
        is called with the identifiers taken from the labels, and a final
        message is logged once it returns.
        """
        pod_id = options['pod_id']
        log_sleep_interval = options['log_sleep_interval']
        # The first literal ends with a space so the container name and the
        # "log sleep interval" label do not run together in the output.
        self.stdout.write(
            "Started a new jobs logs / sidecar monitor with, pod_id: `{}` container_job_name: `{}` "
            "log sleep interval: `{}`".format(
                pod_id, settings.CONTAINER_NAME_EXPERIMENT_JOB,
                log_sleep_interval),
            ending='\n')
        k8s_manager = K8SManager(namespace=settings.K8S_NAMESPACE,
                                 in_cluster=True)
        is_running, labels = monitor.can_log(k8s_manager, pod_id,
                                             log_sleep_interval)
        if not is_running:
            monitor.logger.info('Job is not running anymore.')
            return

        monitor.run_for_experiment_job(
            k8s_manager=k8s_manager,
            pod_id=pod_id,
            experiment_uuid=labels.experiment_uuid.hex,
            experiment_name=labels.experiment_name,
            job_uuid=labels.job_uuid.hex,
            task_type=labels.task_type,
            task_idx=labels.task_idx,
            container_job_name=settings.CONTAINER_NAME_EXPERIMENT_JOB)
        monitor.logger.info('Finished logging')
Beispiel #8
0
def stream_logs(pod_id, task_type, task_id):
    """Return the result of ``base.stream_logs`` for one experiment job pod.

    An in-cluster ``K8SManager`` for ``settings.K8S_NAMESPACE`` is created
    for the call; ``task_id`` is forwarded as ``task_idx`` and the container
    name is ``settings.CONTAINER_NAME_EXPERIMENT_JOB``.
    """
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    container_name = settings.CONTAINER_NAME_EXPERIMENT_JOB
    return base.stream_logs(
        k8s_manager=manager,
        pod_id=pod_id,
        container_job_name=container_name,
        task_type=task_type,
        task_idx=task_id)
Beispiel #9
0
def process_logs(build, temp=True):
    """Fetch the log lines of a build's pod and hand them to ``safe_log_job``.

    The lines come from ``base.process_logs`` using an in-cluster
    ``K8SManager`` for ``settings.K8S_NAMESPACE`` and the container name
    ``settings.CONTAINER_NAME_DOCKERIZER_JOB``. They are passed on with
    ``append=False`` and the caller's ``temp`` flag.
    """
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    lines = base.process_logs(
        k8s_manager=manager,
        pod_id=build.pod_id,
        container_job_name=settings.CONTAINER_NAME_DOCKERIZER_JOB)

    safe_log_job(job_name=build.unique_name,
                 log_lines=lines,
                 temp=temp,
                 append=False)
Beispiel #10
0
def update_system_nodes():
    """Synchronise the stored cluster nodes with the nodes Kubernetes lists.

    Nodes returned by ``K8SManager.list_nodes`` are matched by name against
    ``cluster.nodes``:

    * stored nodes that are no longer listed get ``is_current = False``;
    * listed nodes that are not stored yet are created from
      ``ClusterNode.from_node_item`` and attached to the cluster;
    * nodes present on both sides get every field from
      ``ClusterNode.from_node_item`` copied onto the stored row.
    """
    k8s_manager = K8SManager(in_cluster=True)
    nodes = k8s_manager.list_nodes()
    cluster = Cluster.load()
    nodes_to_update = {}
    nodes_to_create = {node.metadata.name: node for node in nodes}
    deprecated_nodes = []
    for node in cluster.nodes.all():
        if node.name in nodes_to_create:
            # Already stored: pair the row with the fresh item and remove it
            # from the creation candidates.
            nodes_to_update[node.name] = (node, nodes_to_create.pop(node.name))
        else:
            deprecated_nodes.append(node)

    for node in deprecated_nodes:
        node.is_current = False
        node.save()

    for node in nodes_to_create.values():
        node_dict = ClusterNode.from_node_item(node)
        node_dict['cluster'] = cluster
        ClusterNode.objects.create(**node_dict)

    for current_node, new_node in nodes_to_update.values():
        node_dict = ClusterNode.from_node_item(new_node)
        for k, v in node_dict.items():
            setattr(current_node, k, v)
        # Save once per node after all fields are applied, instead of once
        # per field; nothing is saved when there were no fields to apply.
        if node_dict:
            current_node.save()
Beispiel #11
0
def process_experiment_jobs_logs(experiment, temp=True):
    """Run ``process_experiment_job_logs`` for every job of ``experiment``.

    One in-cluster ``K8SManager`` for the configured ``K8S_NAMESPACE`` is
    created up front and shared by all calls; ``temp`` is forwarded as is.
    """
    namespace = conf.get('K8S_NAMESPACE')
    shared_manager = K8SManager(namespace=namespace, in_cluster=True)
    for job in experiment.jobs.all():
        process_experiment_job_logs(
            experiment_job=job,
            temp=temp,
            k8s_manager=shared_manager)
Beispiel #12
0
def get_registry_host():
    """Return the registry ``host:port`` string, resolving it on first use.

    When ``settings`` has no ``REGISTRY_HOST`` attribute, the cluster IP of
    the service named ``settings.REGISTRY_HOST_NAME`` is looked up through
    an in-cluster ``K8SManager`` and combined with
    ``settings.REGISTRY_PORT``. The result is stored on
    ``settings.REGISTRY_HOST`` so later calls skip the lookup.
    """
    if hasattr(settings, 'REGISTRY_HOST'):
        return settings.REGISTRY_HOST

    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    service = manager.get_service(name=settings.REGISTRY_HOST_NAME)
    settings.REGISTRY_HOST = '{}:{}'.format(service.spec.cluster_ip,
                                            settings.REGISTRY_PORT)
    return settings.REGISTRY_HOST
Beispiel #13
0
def stream_logs(build):
    """Return the result of ``base.stream_logs`` for the dockerizer pod of ``build``.

    The pod id is built from ``JOB_NAME_FORMAT`` with ``DOCKERIZER_JOB_NAME``
    and the hex form of ``build.uuid``; the container name is
    ``settings.CONTAINER_NAME_DOCKERIZER_JOB``.
    """
    build_pod = JOB_NAME_FORMAT.format(name=DOCKERIZER_JOB_NAME,
                                       job_uuid=build.uuid.hex)
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    container_name = settings.CONTAINER_NAME_DOCKERIZER_JOB
    return base.stream_logs(k8s_manager=manager,
                            pod_id=build_pod,
                            container_job_name=container_name)
Beispiel #14
0
def process_logs(job, temp=True):
    """Fetch the log lines of a job's pod and hand them to ``safe_log_job``.

    The pod id is built from ``JOB_NAME_FORMAT`` with ``JOB_NAME`` and the
    hex form of ``job.uuid``. The lines are read through
    ``base.process_logs`` from the ``settings.CONTAINER_NAME_JOB`` container
    and passed on with ``append=False`` and the caller's ``temp`` flag.
    """
    job_pod = JOB_NAME_FORMAT.format(name=JOB_NAME, job_uuid=job.uuid.hex)
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    lines = base.process_logs(
        k8s_manager=manager,
        pod_id=job_pod,
        container_job_name=settings.CONTAINER_NAME_JOB)

    safe_log_job(job_name=job.unique_name,
                 log_lines=lines,
                 temp=temp,
                 append=False)
Beispiel #15
0
def stream_logs(pod_id: str, task_type: str, task_id: int) -> Iterable[str]:
    """Return the result of ``base.stream_logs`` for one experiment job pod.

    An in-cluster ``K8SManager`` for the configured ``K8S_NAMESPACE`` is
    created for the call; ``task_id`` is forwarded as ``task_idx`` and the
    container name is read from ``CONTAINER_NAME_EXPERIMENT_JOB``.
    """
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    container_name = conf.get('CONTAINER_NAME_EXPERIMENT_JOB')
    return base.stream_logs(k8s_manager=manager,
                            pod_id=pod_id,
                            container_job_name=container_name,
                            task_type=task_type,
                            task_idx=task_id)
Beispiel #16
0
def update_system_info():
    """Store a changed Kubernetes API version on the cluster and audit it.

    The version is read from an in-cluster ``K8SManager``. Nothing happens
    when it is falsy or equal to the stored ``cluster.version_api``;
    otherwise the cluster is saved and a ``CLUSTER_UPDATED`` event is
    recorded with ``settings.CHART_IS_UPGRADE``.
    """
    manager = K8SManager(in_cluster=True)
    reported_version = manager.get_version()
    cluster = Cluster.load()

    has_changed = bool(reported_version) and cluster.version_api != reported_version
    if not has_changed:
        return

    cluster.version_api = reported_version
    cluster.save()
    auditor.record(
        event_type=CLUSTER_UPDATED,
        instance=cluster,
        is_upgrade=settings.CHART_IS_UPGRADE)
Beispiel #17
0
def main():
    """Entry point of the logs sidecar.

    Reads the pod and job ids from ``POLYAXON_POD_ID`` / ``POLYAXON_JOB_ID``,
    calls ``can_log`` for the pod, creates a ``Publisher`` on the routing key
    from ``POLYAXON_ROUTING_KEYS_LOGS_SIDECARS`` and then calls ``run``.
    A debug message is logged once ``run`` returns.
    """
    env = os.environ
    pod_id = env['POLYAXON_POD_ID']
    job_id = env['POLYAXON_JOB_ID']
    manager = K8SManager(namespace=settings.NAMESPACE, in_cluster=True)
    can_log(manager, pod_id)
    # TODO: add experiment id and job id to the routing key
    routing_key = env['POLYAXON_ROUTING_KEYS_LOGS_SIDECARS']
    publisher = Publisher(routing_key, content_type='text/plain')
    run(manager, publisher, pod_id, job_id)
    logger.debug('Finished logging')
Beispiel #18
0
def stream_logs(experiment):
    """Return the result of ``base.stream_logs`` for the experiment's master pod.

    The pod id is built from ``EXPERIMENT_JOB_NAME_FORMAT`` with
    ``TaskType.MASTER``, task index 0 and the hex form of
    ``experiment.uuid``; the container name is
    ``settings.CONTAINER_NAME_EXPERIMENT_JOB``.
    """
    # We default to master
    master_pod = EXPERIMENT_JOB_NAME_FORMAT.format(
        task_type=TaskType.MASTER,
        task_idx=0,
        experiment_uuid=experiment.uuid.hex)
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    container_name = settings.CONTAINER_NAME_EXPERIMENT_JOB
    return base.stream_logs(k8s_manager=manager,
                            pod_id=master_pod,
                            container_job_name=container_name)
Beispiel #19
0
def stream_logs(experiment: 'Experiment') -> Iterable[str]:
    """Return the result of ``base.stream_logs`` for the experiment's default job pod.

    The pod id is built from ``EXPERIMENT_JOB_NAME_FORMAT`` with
    ``experiment.default_job_role``, task index 0 and the hex form of
    ``experiment.uuid``. The container name comes from
    ``get_experiment_job_container_name`` for the experiment's backend and
    framework.
    """
    default_pod = EXPERIMENT_JOB_NAME_FORMAT.format(
        task_type=experiment.default_job_role,
        task_idx=0,
        experiment_uuid=experiment.uuid.hex)
    manager = K8SManager(namespace=conf.get('K8S_NAMESPACE'), in_cluster=True)
    container_name = get_experiment_job_container_name(
        backend=experiment.backend,
        framework=experiment.framework)
    return base.stream_logs(
        k8s_manager=manager,
        pod_id=default_pod,
        container_job_name=container_name)
Beispiel #20
0
def stream_logs(experiment: 'Experiment') -> Iterable[str]:
    """Return the result of ``base.stream_logs`` for the experiment's master pod.

    The pod id is built from ``EXPERIMENT_JOB_NAME_FORMAT`` with
    ``TaskType.MASTER``, task index 0 and the hex form of
    ``experiment.uuid``; the container name is read from
    ``CONTAINER_NAME_EXPERIMENT_JOB``.
    """
    # We default to master
    master_pod = EXPERIMENT_JOB_NAME_FORMAT.format(
        task_type=TaskType.MASTER,
        task_idx=0,
        experiment_uuid=experiment.uuid.hex)
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    container_name = conf.get('CONTAINER_NAME_EXPERIMENT_JOB')
    return base.stream_logs(k8s_manager=manager,
                            pod_id=master_pod,
                            container_job_name=container_name)
Beispiel #21
0
def process_logs(job: 'Job', temp: bool = True) -> None:
    """Fetch the log lines of a job's pod and hand them to ``safe_log_job``.

    The lines are read through ``base.process_logs`` with an in-cluster
    ``K8SManager`` and the container name configured under
    ``CONTAINER_NAME_JOBS``. They are passed on with ``append=False`` and
    the caller's ``temp`` flag.
    """
    namespace = conf.get(K8S_NAMESPACE)
    manager = K8SManager(namespace=namespace, in_cluster=True)
    pod_id = job.pod_id
    container_name = conf.get(CONTAINER_NAME_JOBS)
    lines = base.process_logs(k8s_manager=manager,
                              pod_id=pod_id,
                              container_job_name=container_name)

    safe_log_job(job_name=job.unique_name, log_lines=lines, temp=temp, append=False)
Beispiel #22
0
def process_logs(job, temp=True):
    """Fetch the log lines of a job's pod and hand them to ``safe_log_job``.

    The lines are read through ``base.process_logs`` with an in-cluster
    ``K8SManager`` and the container name configured under
    ``CONTAINER_NAME_JOB``. They are passed on with ``append=False`` and the
    caller's ``temp`` flag.
    """
    namespace = conf.get('K8S_NAMESPACE')
    manager = K8SManager(namespace=namespace, in_cluster=True)
    pod_id = job.pod_id
    container_name = conf.get('CONTAINER_NAME_JOB')
    lines = base.process_logs(k8s_manager=manager,
                              pod_id=pod_id,
                              container_job_name=container_name)

    safe_log_job(job_name=job.unique_name, log_lines=lines, temp=temp, append=False)
Beispiel #23
0
def main():
    """Run the namespace events monitor until the process is killed.

    Creates an in-cluster ``K8SManager`` for ``settings.NAMESPACE`` and a
    ``Publisher`` on the routing key from
    ``POLYAXON_ROUTING_KEYS_EVENTS_NAMESPACE``, then calls ``run`` in an
    endless loop. An ``ApiException`` is logged as an error and followed by
    a sleep of ``settings.LOG_SLEEP_INTERVAL``; any other exception is
    logged with its traceback and the loop retries immediately.
    """
    k8s_manager = K8SManager(namespace=settings.NAMESPACE, in_cluster=True)
    publisher = Publisher(os.environ['POLYAXON_ROUTING_KEYS_EVENTS_NAMESPACE'])
    while True:
        try:
            run(k8s_manager, publisher)
        except ApiException as e:
            # Pass the exception as a logger argument so formatting is
            # deferred to the logging framework.
            logger.error(
                "Exception when calling CoreV1Api->list_event_for_all_namespaces: %s\n",
                e)
            time.sleep(settings.LOG_SLEEP_INTERVAL)
        except Exception as e:
            logger.exception("Unhandled exception occurred: %s\n", e)
Beispiel #24
0
def update_system_nodes() -> None:
    """Synchronise stored cluster nodes with Kubernetes and audit every change.

    Nodes returned by ``K8SManager.list_nodes`` are matched by name against
    ``cluster.nodes``:

    * stored nodes that are current but no longer listed are marked
      ``is_current = False`` (``CLUSTER_NODE_DELETED``);
    * listed nodes that are not stored yet are created
      (``CLUSTER_NODE_CREATED``);
    * nodes present on both sides have differing fields copied over and are
      marked current again if needed (``CLUSTER_NODE_UPDATED``).

    When any of the above changed something, the cluster is fetched again
    through ``get_cluster_resources()`` and a ``CLUSTER_RESOURCES_UPDATED``
    event is recorded with its node, memory, CPU and GPU figures.
    """
    k8s_manager = K8SManager(in_cluster=True)
    nodes = k8s_manager.list_nodes()
    cluster = Cluster.load()
    nodes_to_update = {}
    nodes_to_create = {node.metadata.name: node for node in nodes}
    deprecated_nodes = []
    for node in cluster.nodes.all():
        if node.name in nodes_to_create:
            # Already stored: pair the row with the fresh item and remove it
            # from the creation candidates.
            nodes_to_update[node.name] = (node, nodes_to_create.pop(node.name))
        elif node.is_current:
            deprecated_nodes.append(node)

    cluster_updated = False
    for node in deprecated_nodes:
        node.is_current = False
        node.save()
        cluster_updated = True
        auditor.record(event_type=CLUSTER_NODE_DELETED, instance=node)

    for node in nodes_to_create.values():
        node_dict = ClusterNode.from_node_item(node)
        node_dict['cluster'] = cluster
        instance = ClusterNode.objects.create(**node_dict)
        cluster_updated = True
        auditor.record(event_type=CLUSTER_NODE_CREATED, instance=instance)

    for current_node, new_node in nodes_to_update.values():
        node_dict = ClusterNode.from_node_item(new_node)
        node_updated = False
        for k, v in node_dict.items():
            if v != getattr(current_node, k):
                setattr(current_node, k, v)
                node_updated = True
        # A node that Kubernetes lists again is current regardless of which
        # fields changed, so check this once per node and outside the field
        # loop (it also applies when there are no fields to copy).
        if not current_node.is_current:
            current_node.is_current = True
            node_updated = True
        if node_updated:
            current_node.save()
            cluster_updated = True
            auditor.record(event_type=CLUSTER_NODE_UPDATED,
                           instance=current_node)

    if cluster_updated:
        cluster = get_cluster_resources()
        auditor.record(event_type=CLUSTER_RESOURCES_UPDATED,
                       instance=cluster,
                       n_nodes=cluster.n_nodes,
                       # cluster.memory divided by 1000**3, two decimals
                       # (presumably bytes -> GB; confirm the stored unit).
                       memory=round(cluster.memory / (1000**3), 2),
                       n_cpus=cluster.n_cpus,
                       n_gpus=cluster.n_gpus)
Beispiel #25
0
def process_logs(experiment, temp=True):
    """Fetch the master pod's log lines and hand them to ``safe_log_experiment``.

    The pod id is built from ``EXPERIMENT_JOB_NAME_FORMAT`` with
    ``TaskType.MASTER``, task index 0 and the hex form of
    ``experiment.uuid``. The lines are read through ``base.process_logs``
    from the ``settings.CONTAINER_NAME_EXPERIMENT_JOB`` container and passed
    on with ``append=False`` and the caller's ``temp`` flag.
    """
    # We default to master
    master_pod = EXPERIMENT_JOB_NAME_FORMAT.format(
        task_type=TaskType.MASTER,
        task_idx=0,
        experiment_uuid=experiment.uuid.hex)
    manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
    container_name = settings.CONTAINER_NAME_EXPERIMENT_JOB
    lines = base.process_logs(k8s_manager=manager,
                              pod_id=master_pod,
                              container_job_name=container_name)

    safe_log_experiment(
        experiment_name=experiment.unique_name,
        log_lines=lines,
        temp=temp,
        append=False)
Beispiel #26
0
def process_logs(experiment: 'Experiment', temp: bool = True) -> None:
    """Fetch the default job pod's log lines and hand them to ``safe_log_experiment``.

    The pod id is built from ``EXPERIMENT_JOB_NAME_FORMAT`` with
    ``experiment.default_job_role``, task index 0 and the hex form of
    ``experiment.uuid``. The container name comes from
    ``get_experiment_job_container_name`` for the experiment's backend and
    framework. The lines are passed on with ``append=False`` and the
    caller's ``temp`` flag.
    """
    default_pod = EXPERIMENT_JOB_NAME_FORMAT.format(
        task_type=experiment.default_job_role,
        task_idx=0,
        experiment_uuid=experiment.uuid.hex)
    manager = K8SManager(namespace=conf.get('K8S_NAMESPACE'), in_cluster=True)
    container_name = get_experiment_job_container_name(
        backend=experiment.backend,
        framework=experiment.framework)
    lines = base.process_logs(
        k8s_manager=manager,
        pod_id=default_pod,
        container_job_name=container_name)

    safe_log_experiment(
        experiment_name=experiment.unique_name,
        log_lines=lines,
        temp=temp,
        append=False)
Beispiel #27
0
 def handle(self, *args, **options):
     """Run the statuses monitor loop until the process is killed.

     Reads ``log_sleep_interval`` from ``options``, prints a start-up line,
     creates an in-cluster ``K8SManager`` for ``settings.K8S_NAMESPACE`` and
     calls ``statuses.run`` forever. An ``ApiException`` is logged as an
     error and followed by a sleep of ``log_sleep_interval``; any other
     exception is logged with its traceback and the loop retries
     immediately.
     """
     interval = options['log_sleep_interval']
     startup_line = ("Started a new statuses monitor with, "
                     "log sleep interval: `{}`.").format(interval)
     self.stdout.write(startup_line, ending='\n')

     manager = K8SManager(namespace=settings.K8S_NAMESPACE, in_cluster=True)
     while True:
         try:
             statuses.run(manager)
         except ApiException as err:
             statuses.logger.error(
                 "Exception when calling CoreV1Api->list_namespaced_pod: %s\n",
                 err)
             time.sleep(interval)
         except Exception as err:
             statuses.logger.exception(
                 "Unhandled exception occurred %s\n", err)
Beispiel #28
0
def process_logs(experiment_job, temp=True, k8s_manager=None):
    """Fetch one experiment job's log lines and hand them to ``safe_log_experiment_job``.

    A falsy ``k8s_manager`` is replaced by a new in-cluster ``K8SManager``
    for ``settings.K8S_NAMESPACE``. The job's ``role`` and ``sequence`` are
    forwarded to ``base.process_logs`` as ``task_type`` and ``task_idx``.
    The lines are passed on with ``append=False`` and the caller's ``temp``
    flag.
    """
    role = experiment_job.role
    sequence = experiment_job.sequence
    manager = k8s_manager or K8SManager(namespace=settings.K8S_NAMESPACE,
                                        in_cluster=True)
    lines = base.process_logs(k8s_manager=manager,
                              pod_id=experiment_job.pod_id,
                              container_job_name=settings.CONTAINER_NAME_EXPERIMENT_JOB,
                              task_type=role,
                              task_idx=sequence)

    safe_log_experiment_job(
        experiment_job_name=experiment_job.unique_name,
        log_lines=lines,
        temp=temp,
        append=False)
Beispiel #29
0
def process_logs(experiment_job: 'ExperimentJob',
                 temp: bool = True,
                 k8s_manager: 'K8SManager' = None) -> None:
    """Fetch one experiment job's log lines and hand them to ``safe_log_experiment_job``.

    A falsy ``k8s_manager`` is replaced by a new in-cluster ``K8SManager``
    for the configured ``K8S_NAMESPACE``. The job's ``role`` and
    ``sequence`` are forwarded to ``base.process_logs`` as ``task_type`` and
    ``task_idx``; the container name is read from
    ``CONTAINER_NAME_EXPERIMENT_JOB``. The lines are passed on with
    ``append=False`` and the caller's ``temp`` flag.
    """
    role = experiment_job.role
    sequence = experiment_job.sequence
    manager = k8s_manager or K8SManager(namespace=conf.get('K8S_NAMESPACE'),
                                        in_cluster=True)
    lines = base.process_logs(k8s_manager=manager,
                              pod_id=experiment_job.pod_id,
                              container_job_name=conf.get('CONTAINER_NAME_EXPERIMENT_JOB'),
                              task_type=role,
                              task_idx=sequence)

    safe_log_experiment_job(
        experiment_job_name=experiment_job.unique_name,
        log_lines=lines,
        temp=temp,
        append=False)
Beispiel #30
0
def process_logs(experiment_job: 'ExperimentJob',
                 temp: bool = True,
                 k8s_manager: 'K8SManager' = None) -> None:
    """Fetch one experiment job's log lines and hand them to ``safe_log_experiment_job``.

    A falsy ``k8s_manager`` is replaced by a new in-cluster ``K8SManager``
    for the configured namespace. The container name comes from
    ``get_experiment_job_container_name`` for the backend and framework of
    the job's experiment. The job's ``role`` and ``sequence`` are forwarded
    to ``base.process_logs`` as ``task_type`` and ``task_idx``. The lines
    are passed on with ``append=False`` and the caller's ``temp`` flag.
    """
    role = experiment_job.role
    sequence = experiment_job.sequence
    manager = k8s_manager or K8SManager(namespace=conf.get(K8S_NAMESPACE),
                                        in_cluster=True)
    container_name = get_experiment_job_container_name(
        backend=experiment_job.experiment.backend,
        framework=experiment_job.experiment.framework)
    lines = base.process_logs(
        k8s_manager=manager,
        pod_id=experiment_job.pod_id,
        container_job_name=container_name,
        task_type=role,
        task_idx=sequence)

    safe_log_experiment_job(
        experiment_job_name=experiment_job.unique_name,
        log_lines=lines,
        temp=temp,
        append=False)