def handle_pytorch_experiment(experiment, spawner, response):
    # Create a job record for each task this experiment started
    master = response[TaskType.MASTER]
    job_uuid = master['pod']['metadata']['labels']['job_uuid']
    job_uuid = uuid.UUID(job_uuid)

    create_job(job_uuid=job_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources)

    cluster, is_distributed = spawner.spec.cluster_def
    worker_resources = PytorchSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed
    )

    for i, worker in enumerate(response[TaskType.WORKER]):
        job_uuid = worker['pod']['metadata']['labels']['job_uuid']
        job_uuid = uuid.UUID(job_uuid)
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(worker),
                   role=TaskType.WORKER,
                   resources=worker_resources.get(i))
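# Illustrative sketch (an assumption, not part of the original module): the shape
# of the spawner response that handle_pytorch_experiment reads above. Each task
# entry carries a pod manifest whose labels hold the job_uuid that the handler
# parses with uuid.UUID. The plain 'master'/'worker' keys stand in for
# TaskType.MASTER and TaskType.WORKER, and every value is a placeholder.
def _example_spawner_response(master_uuid, worker_uuids):
    """Build a response-like mapping with only the keys the handlers above read."""
    def _task(job_uuid):
        return {'pod': {'metadata': {'labels': {'job_uuid': job_uuid.hex}}}}
    return {
        'master': _task(master_uuid),
        'worker': [_task(worker_uuid) for worker_uuid in worker_uuids],
    }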
def handle_base_experiment(experiment, spawner, response):
    # Default case: only the master was created by the experiment spawner
    master = response[TaskType.MASTER]
    job_uuid = master['pod']['metadata']['labels']['job_uuid']
    job_uuid = uuid.UUID(job_uuid)

    create_job(job_uuid=job_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources)
def start_notebook(notebook):
    # Update job status to show that it's started
    notebook.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=notebook.build_job)
    except ValueError as e:
        _logger.warning('Could not start the notebook, %s', e)
        notebook.set_status(JobLifeCycle.FAILED, message='External git repo was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start notebook with built image `%s`', job_docker_image)

    spawner = NotebookSpawner(
        project_name=notebook.project.unique_name,
        project_uuid=notebook.project.uuid.hex,
        job_name=notebook.unique_name,
        job_uuid=notebook.uuid.hex,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True)

    try:
        results = spawner.start_notebook(image=job_docker_image,
                                         resources=notebook.resources,
                                         node_selectors=notebook.node_selectors)
    except ApiException as e:
        _logger.warning('Could not start notebook, please check your polyaxon spec %s', e)
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start notebook, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        _logger.warning('Could not start notebook, please check your polyaxon spec %s', e)
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start notebook, encountered an {} exception.'.format(
                e.__class__.__name__
            ))
        return
    notebook.definition = get_job_definition(results)
    notebook.save()
def start_dockerizer(build_job):
    # Update job status to show that it's started
    build_job.set_status(JobLifeCycle.SCHEDULED)

    spawner = DockerizerSpawner(project_name=build_job.project.unique_name,
                                project_uuid=build_job.project.uuid.hex,
                                job_name=build_job.unique_name,
                                job_uuid=build_job.uuid.hex,
                                k8s_config=settings.K8S_CONFIG,
                                namespace=settings.K8S_NAMESPACE,
                                in_cluster=True)
    try:
        results = spawner.start_dockerizer(
            resources=build_job.resources,
            node_selectors=build_job.node_selectors)
        auditor.record(event_type=BUILD_JOB_STARTED,
                       instance=build_job,
                       target='project')
    except ApiException as e:
        _logger.warning(
            'Could not start build job, please check your polyaxon spec %s', e)
        build_job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start build job, encountered a Kubernetes ApiException.')
        return False
    except Exception as e:
        _logger.warning(
            'Could not start build job, please check your polyaxon spec %s', e)
        build_job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start build job, encountered an {} exception.'.format(
                e.__class__.__name__))
        return False
    build_job.definition = get_job_definition(results)
    build_job.save()
    return True
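# Hypothetical caller sketch (not from the original source): start_dockerizer
# above returns True when the build pod was scheduled and False when scheduling
# failed (the FAILED status is already set inside the function), so a caller only
# needs to branch on the boolean. `notify_build_failed` is a made-up placeholder.
def schedule_build(build_job, notify_build_failed=None):
    started = start_dockerizer(build_job)
    if not started and notify_build_failed is not None:
        notify_build_failed(build_job)
    return started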
def start_tensorboard(tensorboard):
    # Update job status to show that it's started
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(project_name=tensorboard.project.unique_name,
                                 project_uuid=tensorboard.project.uuid.hex,
                                 job_name=tensorboard.unique_name,
                                 job_uuid=tensorboard.uuid.hex,
                                 k8s_config=settings.K8S_CONFIG,
                                 namespace=settings.K8S_NAMESPACE,
                                 in_cluster=True)

    try:
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            resources=tensorboard.resources,
            node_selectors=tensorboard.node_selectors)
    except ApiException as e:
        _logger.warning(
            'Could not start tensorboard, please check your polyaxon spec %s',
            e)
        tensorboard.set_status(
            JobLifeCycle.FAILED,
            message='Could not start tensorboard, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        _logger.warning(
            'Could not start tensorboard, please check your polyaxon spec %s',
            e)
        tensorboard.set_status(
            JobLifeCycle.FAILED,
            message='Could not start tensorboard, encountered an {} exception.'.format(
                e.__class__.__name__))
        return
    tensorboard.definition = get_job_definition(results)
    tensorboard.save()
def start_notebook(notebook):
    # Update job status to show that it's started
    notebook.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=notebook.build_job)
    except (ValueError, AttributeError):
        _logger.error('Could not start the notebook.', exc_info=True)
        notebook.set_status(JobLifeCycle.FAILED,
                            message='Image info was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start notebook with built image `%s`', job_docker_image)

    spawner = NotebookSpawner(project_name=notebook.project.unique_name,
                              project_uuid=notebook.project.uuid.hex,
                              job_name=notebook.unique_name,
                              job_uuid=notebook.uuid.hex,
                              k8s_config=conf.get('K8S_CONFIG'),
                              namespace=conf.get('K8S_NAMESPACE'),
                              job_docker_image=job_docker_image,
                              in_cluster=True)

    error = {}
    try:
        mount_code_in_notebooks = conf.get('MOUNT_CODE_IN_NOTEBOOKS')
        results = spawner.start_notebook(
            persistence_outputs=notebook.persistence_outputs,
            persistence_data=notebook.persistence_data,
            outputs_refs_jobs=notebook.outputs_refs_jobs,
            outputs_refs_experiments=notebook.outputs_refs_experiments,
            resources=notebook.resources,
            secret_refs=notebook.secret_refs,
            configmap_refs=notebook.configmap_refs,
            node_selector=notebook.node_selector,
            affinity=notebook.affinity,
            tolerations=notebook.tolerations,
            backend=notebook.backend,
            mount_code_in_notebooks=mount_code_in_notebooks)
        notebook.definition = get_job_definition(results)
        notebook.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error(
            'Could not start notebook, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start the notebook, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, '
                       'encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error(
            'Could not start notebook, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start notebook, encountered an {} exception.'.format(
                e.__class__.__name__),
        }
    finally:
        if error.get('raised'):
            notebook.set_status(JobLifeCycle.FAILED,
                                message=error.get('message'),
                                traceback=error.get('traceback'))
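# Refactoring sketch (hypothetical, not in the original source): the except
# branches above all assemble the same three-key error dict by hand. A helper
# along these lines, using the same traceback module the handlers already rely
# on, could remove that repetition; `_spawn_error` is a made-up name.
def _spawn_error(message, with_traceback=True):
    """Build the error payload consumed by the `finally` blocks in this module."""
    return {
        'raised': True,
        'traceback': traceback.format_exc() if with_traceback else None,
        'message': message,
    }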
def start_dockerizer(build_job):
    # Update job status to show that it's started
    build_job.set_status(JobLifeCycle.SCHEDULED)

    spawner = DockerizerSpawner(project_name=build_job.project.unique_name,
                                project_uuid=build_job.project.uuid.hex,
                                job_name=build_job.unique_name,
                                job_uuid=build_job.uuid.hex,
                                k8s_config=conf.get('K8S_CONFIG'),
                                namespace=conf.get('K8S_NAMESPACE'),
                                in_cluster=True)

    error = {}
    try:
        results = spawner.start_dockerizer(
            resources=build_job.resources,
            node_selector=build_job.node_selector,
            affinity=build_job.affinity,
            tolerations=build_job.tolerations)
        auditor.record(event_type=BUILD_JOB_STARTED, instance=build_job)
        build_job.definition = get_job_definition(results)
        build_job.save(update_fields=['definition'])
        return True
    except ApiException:
        _logger.error(
            'Could not start build job, please check your polyaxon spec',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a Kubernetes ApiException.',
        }
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start build job, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, '
                       'encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error(
            'Could not start build job, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered an {} exception.'.format(
                e.__class__.__name__),
        }
    finally:
        if error.get('raised'):
            build_job.set_status(JobLifeCycle.FAILED,
                                 message=error.get('message'),
                                 traceback=error.get('traceback'))
def start_job(job):
    # Update job status to show that it's started
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED,
                       message='Image info was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    error = {}
    try:
        results = spawner.start_job(
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            resources=job.resources,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations)
        job.definition = get_job_definition(results)
        job.save()
        return
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start the job, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, '
                       'encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start job, encountered an {} exception.'.format(
                e.__class__.__name__),
        }
    finally:
        if error.get('raised'):
            job.set_status(JobLifeCycle.FAILED,
                           message=error.get('message'),
                           traceback=error.get('traceback'))
def start_job(job):
    # Update job status to show that it's started
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except ValueError as e:
        _logger.warning('Could not start the job, %s', e)
        job.set_status(JobLifeCycle.FAILED,
                       message='External git repo was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    try:
        results = spawner.start_job(
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            resources=job.resources,
            node_selectors=job.node_selectors)
    except ApiException as e:
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job, encountered a Kubernetes ApiException.')
        return
    except VolumeNotFoundError as e:
        _logger.warning(
            'Could not start the job, please check your volume definitions %s',
            e)
        job.set_status(JobLifeCycle.FAILED,
                       message='Could not start the job, '
                       'encountered a volume definition problem. %s' % e)
        return False
    except Exception as e:
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job, encountered an {} exception.'.format(
                e.__class__.__name__))
        return
    job.definition = get_job_definition(results)
    job.save()
def start_tensorboard(tensorboard):
    # Update job status to show that it's started
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(project_name=tensorboard.project.unique_name,
                                 project_uuid=tensorboard.project.uuid.hex,
                                 job_name=tensorboard.unique_name,
                                 job_uuid=tensorboard.uuid.hex,
                                 k8s_config=settings.K8S_CONFIG,
                                 namespace=settings.K8S_NAMESPACE,
                                 in_cluster=True)

    error = {}
    try:
        results = spawner.start_tensorboard(
            image=tensorboard.image,
            outputs_path=tensorboard.outputs_path,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations)
        tensorboard.definition = get_job_definition(results)
        tensorboard.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start the tensorboard, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, '
                       'encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start tensorboard, encountered an {} exception.'.format(
                e.__class__.__name__),
        }
    finally:
        if error.get('raised'):
            tensorboard.set_status(JobLifeCycle.FAILED,
                                   message=error.get('message'),
                                   traceback=error.get('traceback'))
def handle_base_experiment(response):
    master = response[TaskType.MASTER]
    job_uuid = master['pod']['metadata']['labels']['job_uuid']
    job_uuid = uuid.UUID(job_uuid)

    set_job_definition(job_uuid=job_uuid, definition=get_job_definition(master))
def start_tensorboard(tensorboard):
    # Update job status to show that it's started
    tensorboard.set_status(JobLifeCycle.SCHEDULED)

    spawner = TensorboardSpawner(project_name=tensorboard.project.unique_name,
                                 project_uuid=tensorboard.project.uuid.hex,
                                 job_name=tensorboard.unique_name,
                                 job_uuid=tensorboard.uuid.hex,
                                 k8s_config=conf.get(K8S_CONFIG),
                                 namespace=conf.get(K8S_NAMESPACE),
                                 job_docker_image=tensorboard.build_image,
                                 in_cluster=True)

    error = {}
    outputs_specs, tensorboard_paths = tensorboard.outputs_path
    try:
        results = spawner.start_tensorboard(
            outputs_path=tensorboard_paths,
            persistence_outputs=tensorboard.persistence_outputs,
            outputs_specs=outputs_specs,
            outputs_refs_jobs=tensorboard.outputs_refs_jobs,
            outputs_refs_experiments=tensorboard.outputs_refs_experiments,
            resources=tensorboard.resources,
            labels=tensorboard.labels,
            annotations=tensorboard.annotations,
            node_selector=tensorboard.node_selector,
            affinity=tensorboard.affinity,
            tolerations=tensorboard.tolerations,
            max_restarts=get_max_restart(tensorboard.max_restarts,
                                         conf.get(MAX_RESTARTS_TENSORBOARDS)),
            reconcile_url=get_tensorboard_reconcile_url(
                tensorboard.unique_name))
        tensorboard.definition = get_job_definition(results)
        tensorboard.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except StoreNotFoundError as e:
        _logger.error(
            'Could not start the tensorboard, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, '
                       'encountered a volume definition problem. %s' % e,
        }
    except TensorboardValidation as e:
        _logger.error(
            'Could not start the tensorboard, '
            'some experiments require authenticating to stores with different access.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': None,
            'message': 'Could not start the tensorboard, '
                       'some experiments require authenticating '
                       'to stores with different access. %s' % e,
        }
    except Exception as e:
        _logger.error(
            'Could not start tensorboard, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start tensorboard, encountered an {} exception.'.format(
                e.__class__.__name__),
        }
    finally:
        if error.get('raised'):
            tensorboard.set_status(JobLifeCycle.FAILED,
                                   message=error.get('message'),
                                   traceback=error.get('traceback'))
def start_notebook(notebook):
    # Update job status to show that it's started
    notebook.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=notebook.build_job)
    except (ValueError, AttributeError):
        _logger.error('Could not start the notebook.', exc_info=True)
        notebook.set_status(JobLifeCycle.FAILED,
                            message='Image info was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start notebook with built image `%s`', job_docker_image)

    spawner = NotebookSpawner(project_name=notebook.project.unique_name,
                              project_uuid=notebook.project.uuid.hex,
                              job_name=notebook.unique_name,
                              job_uuid=notebook.uuid.hex,
                              k8s_config=settings.K8S_CONFIG,
                              namespace=settings.K8S_NAMESPACE,
                              in_cluster=True)

    try:
        allow_commits = False
        if settings.REPOS_CLAIM_NAME or notebook.node_selector:
            allow_commits = True
        results = spawner.start_notebook(
            image=job_docker_image,
            persistence_outputs=notebook.persistence_outputs,
            persistence_data=notebook.persistence_data,
            outputs_refs_jobs=notebook.outputs_refs_jobs,
            outputs_refs_experiments=notebook.outputs_refs_experiments,
            resources=notebook.resources,
            node_selector=notebook.node_selector,
            affinity=notebook.affinity,
            tolerations=notebook.tolerations,
            allow_commits=allow_commits)
    except ApiException:
        _logger.error(
            'Could not start notebook, please check your polyaxon spec.',
            exc_info=True)
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start notebook, encountered a Kubernetes ApiException.')
        return
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start the notebook, please check your volume definitions',
            exc_info=True)
        notebook.set_status(JobLifeCycle.FAILED,
                            message='Could not start the notebook, '
                            'encountered a volume definition problem. %s' % e)
        return False
    except Exception as e:
        _logger.error(
            'Could not start notebook, please check your polyaxon spec.',
            exc_info=True)
        notebook.set_status(
            JobLifeCycle.FAILED,
            message='Could not start notebook, encountered an {} exception.'.format(
                e.__class__.__name__))
        return
    notebook.definition = get_job_definition(results)
    notebook.save()
def start_job(job):
    # Update job status to show that it's started
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        registry_spec = get_registry_context(build_backend=None)
    except ContainerRegistryError:
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the job, please check your registry configuration.')

    try:
        image_name, image_tag = get_image_info(
            build_job=job.build_job, registry_host=registry_spec.host)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED,
                       message='Image info was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(project_name=job.project.unique_name,
                         project_uuid=job.project.uuid.hex,
                         job_name=job.unique_name,
                         job_uuid=job.uuid.hex,
                         k8s_config=conf.get(K8S_CONFIG),
                         namespace=conf.get(K8S_NAMESPACE),
                         job_docker_image=job_docker_image,
                         in_cluster=True,
                         use_sidecar=True,
                         log_level=job.specification.log_level)

    error = {}
    try:
        results = spawner.start_job(
            container_cmd_callback=job.specification.run.get_container_cmd,
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            secret_refs=job.secret_refs,
            config_map_refs=job.config_map_refs,
            resources=job.resources,
            labels=job.labels,
            annotations=job.annotations,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations,
            max_restarts=get_max_restart(job.max_restarts,
                                         conf.get(MAX_RESTARTS_JOBS)),
            reconcile_url=get_job_reconcile_url(job.unique_name))
        job.definition = get_job_definition(results)
        job.save(update_fields=['definition'])
        return
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, encountered a Kubernetes ApiException.',
        }
    except StoreNotFoundError as e:
        _logger.error(
            'Could not start the job, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start the job, '
                       'encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start job, encountered an {} exception.'.format(
                e.__class__.__name__),
        }
    finally:
        if error.get('raised'):
            job.set_status(JobLifeCycle.FAILED,
                           message=error.get('message'),
                           traceback=error.get('traceback'))
def start_dockerizer(build_job):
    # Update job status to show that it's started
    build_job.set_status(JobLifeCycle.SCHEDULED)
    spawner_class = get_spawner_class(build_job.backend)

    local_build = build_job.backend in {BuildBackend.NATIVE, None}

    spawner = spawner_class(
        project_name=build_job.project.unique_name,
        project_uuid=build_job.project.uuid.hex,
        job_name=build_job.unique_name,
        job_uuid=build_job.uuid.hex,
        commit=build_job.commit,
        from_image=build_job.build_image,
        dockerfile_path=build_job.build_dockerfile,
        context_path=build_job.build_context,
        image_tag=build_job.uuid.hex,
        image_name=get_image_name(build_job, local=local_build),
        build_steps=build_job.build_steps,
        env_vars=build_job.build_env_vars,
        nocache=build_job.build_nocache,
        in_cluster_registry=conf.get('REGISTRY_IN_CLUSTER'),
        spec=build_job.specification,
        k8s_config=conf.get('K8S_CONFIG'),
        namespace=conf.get('K8S_NAMESPACE'),
        in_cluster=True,
        use_sidecar=True)

    error = {}
    try:
        results = spawner.start_dockerizer(
            resources=build_job.resources,
            node_selector=build_job.node_selector,
            affinity=build_job.affinity,
            tolerations=build_job.tolerations)
        auditor.record(event_type=BUILD_JOB_STARTED, instance=build_job)
        build_job.definition = get_job_definition(results)
        build_job.save(update_fields=['definition'])
        return True
    except ApiException:
        _logger.error(
            'Could not start build job, please check your polyaxon spec',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a Kubernetes ApiException.',
        }
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start build job, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, '
                       'encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error(
            'Could not start build job, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered an {} exception.'.format(
                e.__class__.__name__),
        }
    finally:
        if error.get('raised'):
            build_job.set_status(JobLifeCycle.FAILED,
                                 message=error.get('message'),
                                 traceback=error.get('traceback'))
def handle_tensorflow_experiment(experiment, spawner, response):
    # Create a job record for each task this experiment started
    master = response[TaskType.MASTER]
    job_uuid = master['pod']['metadata']['labels']['job_uuid']
    job_uuid = uuid.UUID(job_uuid)

    create_job(job_uuid=job_uuid,
               experiment=experiment,
               definition=get_job_definition(master),
               resources=spawner.spec.master_resources,
               node_selector=spawner.spec.master_node_selector,
               affinity=spawner.spec.master_affinity,
               tolerations=spawner.spec.master_tolerations)

    cluster, is_distributed = spawner.spec.cluster_def

    worker_resources = TensorflowSpecification.get_worker_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    worker_node_selectors = TensorflowSpecification.get_worker_node_selectors(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    worker_affinities = TensorflowSpecification.get_worker_affinities(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    worker_tolerations = TensorflowSpecification.get_worker_tolerations(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)

    for i, worker in enumerate(response[TaskType.WORKER]):
        job_uuid = worker['pod']['metadata']['labels']['job_uuid']
        job_uuid = uuid.UUID(job_uuid)
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(worker),
                   role=TaskType.WORKER,
                   sequence=i,
                   resources=worker_resources.get(i),
                   node_selector=worker_node_selectors.get(i),
                   affinity=worker_affinities.get(i),
                   tolerations=worker_tolerations.get(i))

    ps_resources = TensorflowSpecification.get_ps_resources(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    ps_node_selectors = TensorflowSpecification.get_ps_node_selectors(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    ps_affinities = TensorflowSpecification.get_ps_affinities(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)
    ps_tolerations = TensorflowSpecification.get_ps_tolerations(
        environment=spawner.spec.environment,
        cluster=cluster,
        is_distributed=is_distributed)

    for i, ps in enumerate(response[TaskType.PS]):
        job_uuid = ps['pod']['metadata']['labels']['job_uuid']
        job_uuid = uuid.UUID(job_uuid)
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(ps),
                   role=TaskType.PS,
                   sequence=i,
                   resources=ps_resources.get(i),
                   node_selector=ps_node_selectors.get(i),
                   affinity=ps_affinities.get(i),
                   tolerations=ps_tolerations.get(i))
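# Refactoring sketch (hypothetical, not from the original source): the worker and
# ps loops in handle_tensorflow_experiment differ only in the task list, the role
# and the per-index lookup tables, so they could share a helper like this one.
# `_create_task_jobs` is a made-up name; it reuses uuid, create_job and
# get_job_definition exactly as the handler above does.
def _create_task_jobs(experiment, tasks, role, resources, node_selectors,
                      affinities, tolerations):
    for i, task in enumerate(tasks):
        job_uuid = uuid.UUID(task['pod']['metadata']['labels']['job_uuid'])
        create_job(job_uuid=job_uuid,
                   experiment=experiment,
                   definition=get_job_definition(task),
                   role=role,
                   sequence=i,
                   resources=resources.get(i),
                   node_selector=node_selectors.get(i),
                   affinity=affinities.get(i),
                   tolerations=tolerations.get(i))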
def start_dockerizer(build_job):
    # Update job status to show that it's started
    build_job.set_status(JobLifeCycle.SCHEDULED)
    spawner_class = get_spawner_class(build_job.backend)

    try:
        registry_spec = get_registry_context(build_backend=build_job.backend)
    except ContainerRegistryError:
        build_job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start the dockerizer job, '
                    'please check your registry configuration.')

    spawner = spawner_class(project_name=build_job.project.unique_name,
                            project_uuid=build_job.project.uuid.hex,
                            job_name=build_job.unique_name,
                            job_uuid=build_job.uuid.hex,
                            commit=build_job.commit,
                            from_image=build_job.build_image,
                            dockerfile_path=build_job.build_dockerfile,
                            context_path=build_job.build_context,
                            image_tag=build_job.uuid.hex,
                            image_name=get_image_name(
                                build_job=build_job,
                                registry_host=registry_spec.host),
                            build_steps=build_job.build_steps,
                            env_vars=build_job.build_env_vars,
                            lang_env=build_job.build_lang_env,
                            nocache=build_job.build_nocache,
                            insecure=registry_spec.insecure,
                            creds_secret_ref=registry_spec.secret,
                            creds_secret_items=registry_spec.secret_items,
                            k8s_config=conf.get(K8S_CONFIG),
                            namespace=conf.get(K8S_NAMESPACE),
                            in_cluster=True,
                            use_sidecar=True,
                            log_level=build_job.specification.log_level)

    error = {}
    try:
        results = spawner.start_dockerizer(
            secret_refs=build_job.secret_refs,
            config_map_refs=build_job.config_map_refs,
            resources=build_job.resources,
            labels=build_job.labels,
            annotations=build_job.annotations,
            node_selector=build_job.node_selector,
            affinity=build_job.affinity,
            tolerations=build_job.tolerations,
            max_restarts=get_max_restart(build_job.max_restarts,
                                         conf.get(MAX_RESTARTS_BUILD_JOBS)),
            reconcile_url=get_build_reconcile_url(build_job.unique_name))
        auditor.record(event_type=BUILD_JOB_STARTED, instance=build_job)
        build_job.definition = get_job_definition(results)
        build_job.save(update_fields=['definition'])
        return True
    except ApiException:
        _logger.error(
            'Could not start build job, please check your polyaxon spec',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered a Kubernetes ApiException.',
        }
    except StoreNotFoundError as e:
        _logger.error(
            'Could not start build job, please check your volume definitions.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, '
                       'encountered a volume definition problem. %s' % e,
        }
    except Exception as e:
        _logger.error(
            'Could not start build job, please check your polyaxon spec.',
            exc_info=True)
        error = {
            'raised': True,
            'traceback': traceback.format_exc(),
            'message': 'Could not start build job, encountered an {} exception.'.format(
                e.__class__.__name__),
        }
    finally:
        if error.get('raised'):
            build_job.set_status(JobLifeCycle.FAILED,
                                 message=error.get('message'),
                                 traceback=error.get('traceback'))