Ejemplo n.º 1
0
def run(k8s_manager: 'K8SManager') -> None:
    """Watch job pods and forward every pod state to its Celery handler.

    Iterates over ``ocular.monitor`` for the configured namespace and
    container names, classifies each reported pod state as an experiment job,
    job, plugin job (tensorboard/notebook) or dockerizer job, and sends the
    state as ``payload`` to the matching ``K8SEventsCeleryTasks`` task. States
    that match none of the categories are logged as lost.

    A category matches when its container name is present in the pod's
    container statuses, or when the pod has a status and its ``app`` label
    equals the configured app label for that category.

    Args:
        k8s_manager: Manager whose ``k8s_api`` client is handed to the monitor.
    """
    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=conf.get('K8S_NAMESPACE'),
            container_names=(conf.get('CONTAINER_NAME_EXPERIMENT_JOB'),
                             conf.get('CONTAINER_NAME_PLUGIN_JOB'),
                             conf.get('CONTAINER_NAME_JOB'),
                             conf.get('CONTAINER_NAME_DOCKERIZER_JOB')),
            label_selector=get_label_selector(),
            return_event=True,
            watch_ttl=conf.get('TTL_WATCH_STATUSES')):
        logger.debug('-------------------------------------------\n%s\n',
                     pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        # Details, labels and container statuses may each be missing or empty;
        # normalise them so a sparse event cannot crash the monitoring loop.
        details = pod_state['details'] or {}
        labels = details.get('labels') or None
        container_statuses = details.get('container_statuses') or ()
        app_label = labels.get('app') if labels else None
        # Label-based matching needs both a status and an actual `app` label.
        has_app = bool(status) and app_label is not None
        logger.info("Updating job container %s, %s", status, labels)

        experiment_job_condition = (
            conf.get('CONTAINER_NAME_EXPERIMENT_JOB') in container_statuses
            or (has_app and app_label == conf.get('APP_LABELS_EXPERIMENT')))

        job_condition = (
            conf.get('CONTAINER_NAME_JOB') in container_statuses
            or (has_app and app_label == conf.get('APP_LABELS_JOB')))

        plugin_job_condition = (
            conf.get('CONTAINER_NAME_PLUGIN_JOB') in container_statuses
            or (has_app and app_label in (conf.get('APP_LABELS_TENSORBOARD'),
                                          conf.get('APP_LABELS_NOTEBOOK'))))

        dockerizer_job_condition = (
            conf.get('CONTAINER_NAME_DOCKERIZER_JOB') in container_statuses
            or (has_app and app_label == conf.get('APP_LABELS_DOCKERIZER')))

        # The order of the branches defines precedence when several match.
        if experiment_job_condition:
            update_job_containers(event_object, status,
                                  conf.get('CONTAINER_NAME_EXPERIMENT_JOB'))
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle experiment job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                kwargs={'payload': pod_state})

        elif job_condition:
            update_job_containers(event_object, status,
                                  conf.get('CONTAINER_NAME_JOB'))
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                kwargs={'payload': pod_state})

        elif plugin_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle plugin job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                kwargs={'payload': pod_state})

        elif dockerizer_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle dockerizer job statuses
            celery_app.send_task(
                K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                kwargs={'payload': pod_state})
        else:
            logger.info("Lost state %s, %s", status, pod_state)
Ejemplo n.º 2
0
def run(k8s_manager: 'K8SManager') -> None:
    """Watch job pods and route every pod state to the matching handler.

    Iterates over ``ocular.monitor`` for the configured namespace and
    container names and classifies each reported pod state:

    * experiment pods (``app`` label equals the experiment app label) are
      further split into TF, PyTorch, MPI and plain experiment jobs; for the
      first three a ``job_uuid`` label is computed and written into the
      payload before ``handle_experiment_job_condition`` is called;
    * job, plugin (tensorboard/notebook) and build pods are sent to their
      ``K8SEventsCeleryTasks`` task through ``workers.send`` when
      ``should_handle_job_status`` approves the state;
    * anything else is logged as a lost state.

    Args:
        k8s_manager: Manager whose ``k8s_api`` client is handed to the monitor.
    """
    # pylint:disable=too-many-branches
    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=conf.get(K8S_NAMESPACE),
            container_names=(conf.get(CONTAINER_NAME_EXPERIMENT_JOBS),
                             conf.get(CONTAINER_NAME_TF_JOBS),
                             conf.get(CONTAINER_NAME_PYTORCH_JOBS),
                             conf.get(CONTAINER_NAME_PLUGIN_JOBS),
                             conf.get(CONTAINER_NAME_JOBS),
                             conf.get(CONTAINER_NAME_BUILD_JOBS)),
            label_selector=get_label_selector(),
            return_event=True,
            watch_ttl=conf.get(TTL_WATCH_STATUSES)):
        logger.debug('-------------------------------------------\n%s\n',
                     pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        # Details, labels and container statuses may each be missing or empty;
        # normalise them so a sparse event cannot crash the monitoring loop.
        details = pod_state['details'] or {}
        labels = details.get('labels') or None
        label_map = labels or {}
        container_statuses = details.get('container_statuses') or ()
        app_label = label_map.get('app')
        # Label-based matching needs both a status and an actual `app` label.
        has_app = bool(status) and app_label is not None
        logger.info("Updating job container %s, %s", status, labels)

        experiment_condition = (
            has_app and app_label == conf.get(APP_LABELS_EXPERIMENT))

        experiment_job_condition = (
            conf.get(CONTAINER_NAME_EXPERIMENT_JOBS) in container_statuses
            or 'job_uuid' in label_map)

        tf_job_condition = (
            conf.get(CONTAINER_NAME_TF_JOBS) in container_statuses
            or 'tf-replica-index' in label_map)

        mpi_job_condition = 'mpi_job_name' in label_map

        # The label key must be spelled exactly like the one read in the
        # PyTorch branch below ('pytorch-replica-index').
        pytorch_job_condition = (
            conf.get(CONTAINER_NAME_PYTORCH_JOBS) in container_statuses
            or 'pytorch-replica-index' in label_map)

        job_condition = (
            conf.get(CONTAINER_NAME_JOBS) in container_statuses
            or (has_app and app_label == conf.get(APP_LABELS_JOB)))

        plugin_job_condition = (
            conf.get(CONTAINER_NAME_PLUGIN_JOBS) in container_statuses
            or (has_app and app_label in (conf.get(APP_LABELS_TENSORBOARD),
                                          conf.get(APP_LABELS_NOTEBOOK))))

        dockerizer_job_condition = (
            conf.get(CONTAINER_NAME_BUILD_JOBS) in container_statuses
            or (has_app and app_label == conf.get(APP_LABELS_DOCKERIZER)))

        # `experiment_condition` implies a non-empty `labels` dict, so the
        # label lookups inside this branch operate on a real mapping.
        if experiment_condition:
            if tf_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['tf-replica-index'])
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_TF_JOBS))

            elif pytorch_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['pytorch-replica-index'])
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_PYTORCH_JOBS))

            elif mpi_job_condition:
                # The task index is taken from the last of exactly four
                # dash-separated parts of the pod name; other names are skipped.
                job_name = pod_state['details']['pod_name']
                parts = job_name.split('-')
                if len(parts) != 4:
                    continue

                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=parts[-1])

                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_EXPERIMENT_JOBS))

            elif experiment_job_condition:
                handle_experiment_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=conf.get(CONTAINER_NAME_EXPERIMENT_JOBS))

        elif job_condition:
            update_job_containers(event_object, status,
                                  conf.get(CONTAINER_NAME_JOBS))
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)

        elif plugin_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle plugin job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)

        elif dockerizer_job_condition:
            logger.debug("Sending state to handler %s, %s", status, labels)
            # Handle dockerizer job statuses
            if should_handle_job_status(pod_state=pod_state, status=status):
                workers.send(
                    K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                    kwargs={'payload': pod_state},
                    countdown=None)
        else:
            logger.info("Lost state %s, %s", status, pod_state)
Ejemplo n.º 3
0
def run(k8s_manager: 'K8SManager') -> None:
    """Watch job pods and route every pod state through ``handle_job_condition``.

    Configuration values (container names, app labels, watch TTL) and the
    label selector are read once before the watch starts. Each pod state
    reported by ``ocular.monitor`` is then classified:

    * experiment pods (``app`` label equals the experiment app label) are
      further split into TF, PyTorch, MPI and plain experiment jobs; for the
      first three a ``job_uuid`` label is computed and written into the
      payload before it is handed over;
    * job, plugin (tensorboard/notebook) and build pods are handed over with
      their own ``K8SEventsCeleryTasks`` task name;
    * anything else is logged as a lost state.

    Args:
        k8s_manager: Manager providing the ``k8s_api`` client and the
            ``namespace`` to watch.
    """
    # pylint:disable=too-many-branches

    # Local cache
    label_selector = get_label_selector()
    container_name_experiment_job = conf.get(CONTAINER_NAME_EXPERIMENT_JOBS)
    container_name_tf_job = conf.get(CONTAINER_NAME_TF_JOBS)
    container_name_pytorch_job = conf.get(CONTAINER_NAME_PYTORCH_JOBS)
    container_name_plugin_job = conf.get(CONTAINER_NAME_PLUGIN_JOBS)
    container_name_job = conf.get(CONTAINER_NAME_JOBS)
    container_name_build_job = conf.get(CONTAINER_NAME_BUILD_JOBS)
    watch_ttl = conf.get(TTL_WATCH_STATUSES)
    app_labels_experiment = conf.get(APP_LABELS_EXPERIMENT)
    app_labels_job = conf.get(APP_LABELS_JOB)
    app_labels_build_job = conf.get(APP_LABELS_DOCKERIZER)
    app_labels_tensorboard = conf.get(APP_LABELS_TENSORBOARD)
    app_labels_notebook = conf.get(APP_LABELS_NOTEBOOK)

    for (event_object, pod_state) in ocular.monitor(
            k8s_manager.k8s_api,
            namespace=k8s_manager.namespace,
            container_names=(container_name_experiment_job,
                             container_name_tf_job, container_name_pytorch_job,
                             container_name_plugin_job, container_name_job,
                             container_name_build_job),
            label_selector=label_selector,
            return_event=True,
            watch_ttl=watch_ttl):
        logger.debug('-------------------------------------------\n%s\n',
                     pod_state)
        if not pod_state:
            continue

        status = pod_state['status']
        # Details, labels and container statuses may each be missing or empty;
        # normalise them so a sparse event cannot crash the monitoring loop.
        details = pod_state['details'] or {}
        labels = details.get('labels') or None
        label_map = labels or {}
        container_statuses = details.get('container_statuses') or ()
        app_label = label_map.get('app')
        # Label-based matching needs both a status and an actual `app` label.
        has_app = bool(status) and app_label is not None
        logger.info("Updating job container %s, %s", status, labels)

        experiment_condition = has_app and app_label == app_labels_experiment

        experiment_job_condition = (
            container_name_experiment_job in container_statuses
            or 'job_uuid' in label_map)

        tf_job_condition = (container_name_tf_job in container_statuses
                            or 'tf-replica-index' in label_map)

        mpi_job_condition = 'mpi_job_name' in label_map

        # The label key must be spelled exactly like the one read in the
        # PyTorch branch below ('pytorch-replica-index').
        pytorch_job_condition = (
            container_name_pytorch_job in container_statuses
            or 'pytorch-replica-index' in label_map)

        job_condition = (container_name_job in container_statuses
                         or (has_app and app_label == app_labels_job))

        plugin_job_condition = (
            container_name_plugin_job in container_statuses
            or (has_app and app_label
                in (app_labels_tensorboard, app_labels_notebook)))

        dockerizer_job_condition = (
            container_name_build_job in container_statuses
            or (has_app and app_label == app_labels_build_job))

        # `experiment_condition` implies a non-empty `labels` dict, so the
        # label lookups inside this branch operate on a real mapping.
        if experiment_condition:
            if tf_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['tf-replica-index'])
                handle_job_condition(event_object=event_object,
                                     pod_state=pod_state,
                                     status=status,
                                     labels=labels,
                                     container_name=container_name_tf_job,
                                     task_name=K8SEventsCeleryTasks.
                                     K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                                     update_containers=False)

            elif pytorch_job_condition:
                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=labels['pytorch-replica-index'])
                handle_job_condition(event_object=event_object,
                                     pod_state=pod_state,
                                     status=status,
                                     labels=labels,
                                     container_name=container_name_pytorch_job,
                                     task_name=K8SEventsCeleryTasks.
                                     K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                                     update_containers=False)

            elif mpi_job_condition:
                # The task index is taken from the last of exactly four
                # dash-separated parts of the pod name; other names are skipped.
                job_name = pod_state['details']['pod_name']
                parts = job_name.split('-')
                if len(parts) != 4:
                    continue

                # We augment the payload with standard Polyaxon requirement
                pod_state['details']['labels'][
                    'job_uuid'] = get_experiment_job_uuid(
                        experiment_uuid=labels['experiment_uuid'],
                        task_type=labels['task_type'],
                        task_index=parts[-1])

                handle_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=container_name_experiment_job,
                    task_name=K8SEventsCeleryTasks.
                    K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    update_containers=False)

            elif experiment_job_condition:
                handle_job_condition(
                    event_object=event_object,
                    pod_state=pod_state,
                    status=status,
                    labels=labels,
                    container_name=container_name_experiment_job,
                    task_name=K8SEventsCeleryTasks.
                    K8S_EVENTS_HANDLE_EXPERIMENT_JOB_STATUSES,
                    update_containers=False)

        elif job_condition:
            handle_job_condition(
                event_object=event_object,
                pod_state=pod_state,
                status=status,
                labels=labels,
                container_name=container_name_job,
                task_name=K8SEventsCeleryTasks.K8S_EVENTS_HANDLE_JOB_STATUSES,
                update_containers=False)

        elif plugin_job_condition:
            handle_job_condition(event_object=event_object,
                                 pod_state=pod_state,
                                 status=status,
                                 labels=labels,
                                 container_name=container_name_plugin_job,
                                 task_name=K8SEventsCeleryTasks.
                                 K8S_EVENTS_HANDLE_PLUGIN_JOB_STATUSES,
                                 update_containers=False)

        elif dockerizer_job_condition:
            handle_job_condition(event_object=event_object,
                                 pod_state=pod_state,
                                 status=status,
                                 labels=labels,
                                 container_name=container_name_build_job,
                                 task_name=K8SEventsCeleryTasks.
                                 K8S_EVENTS_HANDLE_BUILD_JOB_STATUSES,
                                 update_containers=False)
        else:
            logger.info("Lost state %s, %s", status, pod_state)
Ejemplo n.º 4
0
import ocular
from kubernetes import client

# Example script: watch a single container name in the 'polyaxon' namespace
# and print every pod state reported by the monitor.

# NOTE(review): `configuration` is given a plain string here; the kubernetes
# client presumably expects a `Configuration` object -- confirm this value is
# intended before relying on the script.
CLIENT_CONFIGURATION = (
    'c29d119df3b14fb7a82207f29c8a2156c505a5948f3e4dcba6229c92b35c9006')
WATCHED_NAMESPACE = 'polyaxon'
WATCHED_CONTAINERS = (
    'plx-notebook-be90630d9d0740ada845276f0e3f70a4-749dc96cd-zr29h', )
POD_LABEL_SELECTOR = 'app in (workers,dashboard),type=runner'

api_client = client.api_client.ApiClient(configuration=CLIENT_CONFIGURATION)

# Without `return_event`, each iteration value is used directly as the pod
# state.
pod_states = ocular.monitor(
    api_client,
    namespace=WATCHED_NAMESPACE,
    container_names=WATCHED_CONTAINERS,
    label_selector=POD_LABEL_SELECTOR)
for pod_state in pod_states:
    print(pod_state)