Example #1
0
def stop_job(project_name, project_uuid, job_name, job_uuid):
    """Stop the job identified by the given project/job names and uuids.

    Builds an in-cluster ``JobSpawner`` configured with the ``K8S_CONFIG``
    and ``K8S_NAMESPACE`` values read from ``conf`` and delegates to its
    ``stop_job`` method.

    Returns:
        Whatever ``JobSpawner.stop_job()`` returns.
    """
    k8s_config = conf.get(K8S_CONFIG)
    namespace = conf.get(K8S_NAMESPACE)

    job_spawner = JobSpawner(
        project_name=project_name,
        project_uuid=project_uuid,
        job_name=job_name,
        job_uuid=job_uuid,
        k8s_config=k8s_config,
        namespace=namespace,
        in_cluster=True,
    )
    return job_spawner.stop_job()
Example #2
0
def stop_job(project_name, project_uuid, job_name, job_uuid, specification):
    """Stop a job through an in-cluster ``JobSpawner``.

    The spawner is created from the project/job identifiers, the job
    ``specification`` (passed as ``spec``) and the ``K8S_CONFIG`` /
    ``K8S_NAMESPACE`` values taken from ``settings``.

    Returns:
        The result of ``JobSpawner.stop_job()``.
    """
    return JobSpawner(
        project_name=project_name,
        project_uuid=project_uuid,
        job_name=job_name,
        job_uuid=job_uuid,
        spec=specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
    ).stop_job()
Example #3
0
def stop_job(job, update_status=False):
    """Stop ``job`` via a ``JobSpawner`` and optionally mark it as stopped.

    Args:
        job: object exposing ``project`` (with ``unique_name`` and ``uuid``),
            ``unique_name``, ``uuid``, ``specification`` and ``set_status``.
        update_status: when truthy, the job status is set to
            ``JobLifeCycle.STOPPED`` after the spawner has been asked to stop.

    Returns:
        None. The spawner's own result is discarded.
    """
    project_name = job.project.unique_name
    project_uuid = job.project.uuid.hex
    job_name = job.unique_name
    job_uuid = job.uuid.hex

    job_spawner = JobSpawner(
        project_name=project_name,
        project_uuid=project_uuid,
        job_name=job_name,
        job_uuid=job_uuid,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        in_cluster=True,
    )
    job_spawner.stop_job()

    if not update_status:
        return

    # Record on the job itself that it has been stopped.
    job.set_status(status=JobLifeCycle.STOPPED, message='Job was stopped')
Example #4
0
def start_job(job):
    """Mark ``job`` as scheduled and start it through a ``JobSpawner``.

    Steps:
      1. Set the job status to ``JobLifeCycle.SCHEDULED``.
      2. Resolve the docker image from ``job.build_job``; on ``ValueError``
         the job is marked ``FAILED`` and the function returns.
      3. Build an in-cluster ``JobSpawner`` (with sidecar enabled) and call
         its ``start_job`` with the job's resources and node selectors.
      4. On success, store the definition derived from the spawner results
         on ``job.definition`` and save the job.

    Any exception raised by ``spawner.start_job`` is logged and turned into
    a ``FAILED`` status; it is never re-raised.

    Returns:
        None.
    """
    # Update job status to show that it has been scheduled.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except ValueError as e:
        # This function starts a job, so the log must say "job", not "notebook".
        _logger.warning('Could not start the job, %s', e)
        job.set_status(JobLifeCycle.FAILED,
                       message='External git repo was not found.')
        return
    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True))

    try:
        results = spawner.start_job(resources=job.resources,
                                    node_selectors=job.node_selectors)
    except ApiException as e:
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message=
            'Could not start job, encountered a Kubernetes ApiException.')
        return
    except Exception as e:
        # Deliberate catch-all: any spawner failure must end up as a FAILED
        # status on the job rather than propagate to the caller.
        _logger.warning(
            'Could not start job, please check your polyaxon spec %s', e)
        job.set_status(
            JobLifeCycle.FAILED,
            message='Could not start job encountered an {} exception.'.format(
                e.__class__.__name__))
        return
    job.definition = get_job_definition(results)
    job.save()
Example #5
0
def start_job(job):
    """Mark ``job`` as scheduled and start it through a ``JobSpawner``.

    The job is first set to ``JobLifeCycle.SCHEDULED``. If the image info
    cannot be resolved (``ValueError`` / ``AttributeError``) the job is set
    to ``FAILED`` and the function returns. Otherwise an in-cluster spawner
    is created and asked to start the job; on success the resulting
    definition is stored on the job and saved. Any exception raised while
    starting or saving is logged, and the job is set to ``FAILED`` with a
    message and the formatted traceback.

    Returns:
        None.
    """
    # Flag the job as scheduled before any work is attempted.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED,
                       message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    job_spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        spec=job.specification,
        k8s_config=settings.K8S_CONFIG,
        namespace=settings.K8S_NAMESPACE,
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        sidecar_config=config.get_requested_params(to_str=True),
    )

    try:
        start_results = job_spawner.start_job(
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            resources=job.resources,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations,
        )
        job.definition = get_job_definition(start_results)
        job.save()
        # Success: nothing below this try statement must run.
        return
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a Kubernetes ApiException.')
    except VolumeNotFoundError as e:
        _logger.error(
            'Could not start the job, please check your volume definitions.',
            exc_info=True)
        failure_traceback = traceback.format_exc()
        failure_message = (
            'Could not start the job, encountered a volume definition problem. %s'
            % e)
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure_traceback = traceback.format_exc()
        failure_message = (
            'Could not start job encountered an {} exception.'.format(
                e.__class__.__name__))

    # Only reachable after one of the handlers above has completed.
    job.set_status(JobLifeCycle.FAILED,
                   message=failure_message,
                   traceback=failure_traceback)
Example #6
0
def start_job(job):
    """Mark ``job`` as scheduled and start it through a ``JobSpawner``.

    Flow:
      * set the status to ``JobLifeCycle.SCHEDULED``;
      * obtain the registry context (``ContainerRegistryError`` -> ``FAILED``
        and return);
      * resolve the image name/tag against the registry host
        (``ValueError`` / ``AttributeError`` -> ``FAILED`` and return);
      * create an in-cluster spawner and start the job, then store the
        resulting definition on the job, saving only that field.

    Any exception raised while starting or saving is logged, and the job is
    set to ``FAILED`` with a message and the formatted traceback.

    Returns:
        None.
    """
    # Flag the job as scheduled before any work is attempted.
    job.set_status(JobLifeCycle.SCHEDULED)

    try:
        registry = get_registry_context(build_backend=None)
    except ContainerRegistryError:
        job.set_status(
            JobLifeCycle.FAILED,
            message=('Could not start the job, '
                     'please check your registry configuration.'))
        return

    try:
        image_name, image_tag = get_image_info(build_job=job.build_job,
                                               registry_host=registry.host)
    except (ValueError, AttributeError):
        _logger.error('Could not start the job.', exc_info=True)
        job.set_status(JobLifeCycle.FAILED,
                       message='Image info was not found.')
        return

    job_docker_image = '{}:{}'.format(image_name, image_tag)
    _logger.info('Start job with built image `%s`', job_docker_image)

    job_spawner = JobSpawner(
        project_name=job.project.unique_name,
        project_uuid=job.project.uuid.hex,
        job_name=job.unique_name,
        job_uuid=job.uuid.hex,
        k8s_config=conf.get(K8S_CONFIG),
        namespace=conf.get(K8S_NAMESPACE),
        job_docker_image=job_docker_image,
        in_cluster=True,
        use_sidecar=True,
        log_level=job.specification.log_level,
    )

    try:
        start_results = job_spawner.start_job(
            container_cmd_callback=job.specification.run.get_container_cmd,
            persistence_data=job.persistence_data,
            persistence_outputs=job.persistence_outputs,
            outputs_refs_jobs=job.outputs_refs_jobs,
            outputs_refs_experiments=job.outputs_refs_experiments,
            secret_refs=job.secret_refs,
            config_map_refs=job.config_map_refs,
            resources=job.resources,
            labels=job.labels,
            annotations=job.annotations,
            node_selector=job.node_selector,
            affinity=job.affinity,
            tolerations=job.tolerations,
            max_restarts=get_max_restart(job.max_restarts,
                                         conf.get(MAX_RESTARTS_JOBS)),
            reconcile_url=get_job_reconcile_url(job.unique_name),
        )
        job.definition = get_job_definition(start_results)
        job.save(update_fields=['definition'])
        # Success: nothing below this try statement must run.
        return
    except ApiException:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the job, encountered a Kubernetes ApiException.',
        )
    except StoreNotFoundError as e:
        _logger.error(
            'Could not start the job, please check your volume definitions.',
            exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start the job, encountered a volume definition problem. %s'
            % e,
        )
    except Exception as e:
        _logger.error('Could not start job, please check your polyaxon spec.',
                      exc_info=True)
        failure = (
            traceback.format_exc(),
            'Could not start job encountered an {} exception.'.format(
                e.__class__.__name__),
        )

    # Only reachable after one of the handlers above has completed.
    failure_traceback, failure_message = failure
    job.set_status(JobLifeCycle.FAILED,
                   message=failure_message,
                   traceback=failure_traceback)