Example #1
def delete_complete_jobs(api: CoreV1Api, batch_api: BatchV1Api, namespace: str):
    """Delete complete jobs."""
    for job in batch_api.list_namespaced_job(namespace).items:
        if (
            job.status.conditions
            and job.status.conditions[0].type == "Complete"
            and not job.metadata.deletion_timestamp
            and _is_flush_job(job)
        ):
            logger.info(f"deleting complete job: {job.metadata.name}")
            # configure persistent volume claims to be deleted with the job
            pv_name = _pv_name_from_job(job)
            logger.info(f"including pv in pvc delete: {pv_name}")
            api.patch_persistent_volume(
                name=pv_name,
                body=V1PersistentVolume(
                    spec=V1PersistentVolumeSpec(
                        persistent_volume_reclaim_policy="Delete",
                    )
                ),
            )
            logger.info(f"including pvc in job delete: {job.metadata.name}")
            api.patch_namespaced_persistent_volume_claim(
                name=job.metadata.name,
                namespace=namespace,
                body=V1PersistentVolumeClaim(
                    metadata=V1ObjectMeta(
                        owner_references=[
                            V1OwnerReference(
                                api_version="batch/v1",
                                kind="Job",
                                name=job.metadata.name,
                                uid=job.metadata.uid,
                                block_owner_deletion=True,
                            )
                        ]
                    )
                ),
            )
            try:
                batch_api.delete_namespaced_job(
                    name=job.metadata.name,
                    namespace=namespace,
                    body=V1DeleteOptions(
                        grace_period_seconds=0,
                        propagation_policy="Foreground",
                        preconditions=V1Preconditions(
                            resource_version=job.metadata.resource_version,
                            uid=job.metadata.uid,
                        ),
                    ),
                )
            except ApiException as e:
                if e.reason not in (CONFLICT, NOT_FOUND):
                    raise
                logger.info(f"job already deleted or updated: {job.metadata.name}")
Example #2
def _create_flush_job(
    batch_api: BatchV1Api,
    command: List[str],
    env: List[V1EnvVar],
    image: str,
    name: str,
    namespace: str,
    service_account_name: str,
) -> V1Job:
    logger.info(f"creating job: {name}")
    try:
        return batch_api.create_namespaced_job(
            namespace=namespace,
            body=V1Job(
                api_version="batch/v1",
                kind="Job",
                metadata=V1ObjectMeta(name=name, namespace=namespace),
                spec=V1JobSpec(
                    template=V1PodTemplateSpec(
                        spec=V1PodSpec(
                            containers=[
                                V1Container(
                                    image=image,
                                    command=command,
                                    name="flush",
                                    volume_mounts=[
                                        V1VolumeMount(mount_path="/data", name="queue")
                                    ],
                                    env=env,
                                )
                            ],
                            restart_policy="OnFailure",
                            volumes=[
                                V1Volume(
                                    name="queue",
                                    persistent_volume_claim=(
                                        V1PersistentVolumeClaimVolumeSource(
                                            claim_name=name
                                        )
                                    ),
                                )
                            ],
                            service_account_name=service_account_name,
                        )
                    )
                ),
            ),
        )
    except ApiException as e:
        if e.reason == CONFLICT and json.loads(e.body)["reason"] == ALREADY_EXISTS:
            logger.info(f"using existing job: {name}")
            return batch_api.read_namespaced_job(name, namespace)
        raise
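For reference, a hedged example of calling this helper; the image, command, and environment values below are illustrative only:

# Illustrative call (names and values are not from the original code).
from kubernetes.client import BatchV1Api, V1EnvVar

batch_api = BatchV1Api()
job = _create_flush_job(
    batch_api=batch_api,
    command=["python", "-m", "flush"],
    env=[V1EnvVar(name="PUBSUB_EMULATOR_HOST", value="emulator:8085")],
    image="example/flush:latest",
    name="flush-queue-web-0",
    namespace="default",
    service_account_name="flush-manager",
)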
Example #3
def delete_completed_jobs(job_labels: Mapping[str, str],
                          client: BatchV1Api = None):
    client = initialize_batch_client() if client is None else client

    logger.info("Deleting succesful jobs.")
    jobs = list_jobs(client, job_labels)
    for job in jobs:
        if job_complete(job):
            name = job.metadata.name
            logger.info(f"Deleting completed job: {name}")
            client.delete_namespaced_job(name,
                                         namespace=job.metadata.namespace)
Example #4
def apply(manifest_file: str, server: str, namespace: str, user: str,
          password: str, verbose: int, quiet: int):
    """
    Apply manifest file
    """
    logging.basicConfig(
        level=logging.WARN + 10 * quiet - 10 * verbose,
        format=
        '[%(asctime)s] %(levelname)-7s [%(name)s:%(module)s - %(funcName)s:%(lineno)s] %(message)s',
        datefmt='%d.%m.%Y %H:%M:%S')

    logger.info("Running `apply`.")

    with ClusterCredentialsContextManager(server=server,
                                          user=user,
                                          password=password):
        v1_batch = BatchV1Api()

        with io.open(manifest_file, 'r', encoding='utf-8') as the_file:
            # use the safe loader (yaml.load_all now requires an explicit Loader)
            yaml_gen = yaml.safe_load_all(the_file.read())
            documents = list(yaml_gen)

        api_response = v1_batch.create_namespaced_job(namespace,
                                                      documents[0],
                                                      pretty=True)
        pprint.pprint(api_response)

    logger.info("Done.")
Example #5
def flush_released_pvs(
    api: CoreV1Api,
    batch_api: BatchV1Api,
    command: List[str],
    env: List[V1EnvVar],
    image: str,
    namespace: str,
    service_account_name: str,
):
    """
    Flush persistent volumes.

    Gracefully handle resuming after an interruption, because this is not atomic.
    """
    existing_jobs = {
        job.metadata.name for job in batch_api.list_namespaced_job(namespace).items
    }
    for pv in api.list_persistent_volume().items:
        name = _job_and_pvc_name_from_pv(pv)
        if (
            name not in existing_jobs
            and pv.spec.claim_ref
            and pv.spec.claim_ref.namespace == namespace
            and pv.spec.persistent_volume_reclaim_policy != "Delete"
            and pv.status
            and (pv.status.phase == "Released" or pv.spec.claim_ref.name == name)
        ):
            logger.info(f"flushing unbound pv: {pv.metadata.name}")
            if pv.status.phase != "Bound":
                pvc = _create_pvc(api, name, namespace, pv)
                _bind_pvc(api, pv, pvc)
            _create_flush_job(
                batch_api, command, env, image, name, namespace, service_account_name
            )
Example #6
def main():
    """Continuously flush and delete detached persistent volumes."""
    args = parser.parse_args()
    load_incluster_config()
    api = CoreV1Api()
    batch_api = BatchV1Api()
    tasks = [
        partial(
            flush_released_pvs_and_delete_complete_jobs,
            api,
            batch_api,
            args.command,
            args.env,
            args.image,
            args.namespace,
            args.service_account_name,
        ),
        partial(
            delete_detached_pvcs,
            api,
            args.namespace,
            args.claim_prefix,
            timedelta(seconds=args.pvc_cleanup_delay_seconds),
            {},  # detached_pvc_cache
        ),
        partial(delete_unschedulable_pods, api, args.namespace),
    ]
    with ThreadPool(len(tasks)) as pool:
        pool.map(run_task, tasks, chunksize=1)
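run_task, flush_released_pvs_and_delete_complete_jobs, delete_detached_pvcs, and delete_unschedulable_pods are defined elsewhere in the source project. Purely as a sketch of the expected shape, a hypothetical run_task that keeps each bound task alive might look like this (not the original implementation):

# Hypothetical sketch only: repeatedly invoke one bound task, logging errors
# so a single failure does not terminate the worker thread.
import time


def run_task(task, interval_seconds: int = 1):
    while True:
        try:
            task()
        except Exception:
            logger.exception("task failed")
        time.sleep(interval_seconds)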
Example #7
    def create_job(self, k8s_job_client: client.BatchV1Api):
        job_creation_response = k8s_job_client.create_namespaced_job(
            body=self.mlcube_job_manifest,
            namespace=self.namespace,
        )

        logging.info("MLCommons Box k8s job created. Status='%s'" %
                     str(job_creation_response.status))
Example #8
    def get_batch_v1_api(self):
        """
        Build a BatchV1Api client.

        :return: BatchV1Api instance
        """
        client = self.get_api_client()
        batch_v1_api = BatchV1Api(client)
        return batch_v1_api
Example #9
def _inspect_jobs(
    config: Dict[str, Any],
    lock: synchronize.Lock,
    replications_queue: Queue,  # type: ignore
    replication_statuses: Dict[str, str],
) -> None:
    jobs = BatchV1Api().list_job_for_all_namespaces()
    for job in jobs.items:
        spec = job.spec.template.spec
        containers, statuses = _inspect_containers(config, lock,
                                                   replications_queue,
                                                   replication_statuses,
                                                   spec.containers)
        init_containers, init_statuses = (_inspect_containers(
            config, lock, replications_queue, replication_statuses,
            spec.init_containers) if spec.init_containers else ([], []))

        all_statuses = statuses + init_statuses
        all_containers = containers + init_containers
        if len(all_statuses) > 0 and all(
            [status == "Complete" for status in all_statuses]):
            with lock:
                for container in all_containers:
                    del replication_statuses[container["image"]]

            body = {
                "spec": {
                    "template": {
                        "spec": {
                            "containers": containers,
                            "initContainers": init_containers,
                        }
                    }
                }
            }

            BatchV1Api().patch_namespaced_job(
                name=job.metadata.name,
                namespace=job.metadata.namespace,
                body=body,
            )
            logger.info("Patched Deployment: %s, Namespace: %s",
                        job.metadata.name, job.metadata.namespace)
Example #10
def cleanup_jobs(namespace: str, name: str, dry: bool):
    core = CoreV1Api()
    batch = BatchV1Api()

    job_name_regex = f'^{name.replace("*", ".*")}$'

    for job_name in find_jobs(batch, namespace, job_name_regex):
        logger.info("delete job: %s", job_name)
        if not dry:
            delete_job(batch, namespace, job_name)

        pod_name_regex = f'^{job_name}-.*$'

        for pod_name in find_pods(core, namespace, pod_name_regex):
            logger.info("delete pod: %s", pod_name)
            if not dry:
                delete_pod(core, namespace, pod_name)
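A hedged usage example; the namespace and wildcard pattern are illustrative:

# Illustrative invocation: preview which jobs and pods would be removed.
cleanup_jobs(namespace="default", name="nightly-*", dry=True)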
Example #11
    def get_job(self):
        api_instance = BatchV1Api(self.api_client)
        jobs = api_instance.list_job_for_all_namespaces()
        jobs_status = []
        for i in jobs.items:
            name = i.metadata.name
            ns = i.metadata.namespace
            start = i.status.start_time
            if i.status.succeeded == 1:
                status = "success"
            elif i.status.failed == 1:
                status = "failed"
            else:
                status = "active"
            jobs_status.append({
                "ns": ns,
                "name": name,
                "start": start,
                "status": status
            })
        return {"desc": "jobs", "result": jobs_status}
Example #12
    def __init__(self, path):
        """Initialize the Kubernetes client from the given kubeconfig path."""
        config.load_kube_config(path)
        self.batch = BatchV1Api()
Example #13
def create_job(
    namespace: str,
    name: str,
    labels: kopf.Labels,
    annotations: kopf.Annotations,
    spec: kopf.Spec,
    status: kopf.Status,
    patch: kopf.Patch,
    logger: kopf.Logger,
    namespaces_idx: kopf.Index[str, Dict[str, Any]],
    podsettings_idx: kopf.Index[Tuple[str, str], Dict[str, Any]],
    **_: Any,
) -> str:
    ns: Optional[Dict[str, Any]] = None
    for ns in namespaces_idx.get(namespace, []):
        logger.debug("ns: %s", ns)

    if ns is None:
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobCreationFailed",
                "error": "No Namespace resource found"
            }
        }
        return "JobCreationFailed"

    env = ns["env"]
    team = ns["team"]

    global ENV_CONTEXT  # Caching
    if ENV_CONTEXT is None:
        context = _load_env_context_from_ssm(env)
        if context is None:
            patch["status"] = {
                "orbitJobOperator": {
                    "jobStatus": "JobCreationFailed",
                    "error": "Unable to load Env Context from SSM"
                }
            }
            return "JobCreationFailed"
        else:
            ENV_CONTEXT = context

    node_type = spec.get("compute", {}).get("nodeType", "fargate")
    labels = {
        "app": "orbit-runner",
        "orbit/node-type": node_type,
        "notebook-name": spec.get("notebookName", ""),
        "orbit/attach-security-group": "yes" if node_type == "ec2" else "no",
    }

    podsetting_metadata: Dict[str, Any] = {}
    for podsetting_metadata in podsettings_idx.get(
        (team, spec.get("compute", {}).get("podSetting", None)), []):
        logger.debug("PodSetting: %s", podsetting_metadata)

    job_spec = job_utils.construct_job_spec(
        env=env,
        team=team,
        env_context=ENV_CONTEXT,
        podsetting_metadata=podsetting_metadata,
        orbit_job_spec=spec,
        labels=labels,
    )

    logger.debug("spec: %s", spec)
    if spec.get("schedule"):
        cronjob_id = f"orbit-{namespace}-{spec.get('triggerName')}"
        cron_job_template: V1beta1JobTemplateSpec = V1beta1JobTemplateSpec(
            spec=job_spec)
        cron_job_spec: V1beta1CronJobSpec = V1beta1CronJobSpec(
            job_template=cron_job_template, schedule=spec.get("schedule"))
        job = V1beta1CronJob(
            api_version="batch/v1beta1",
            kind="CronJob",
            metadata=V1ObjectMeta(name=cronjob_id,
                                  labels={
                                      **labels,
                                      **spec.get("compute", {}).get(
                                          "labels", {})
                                  },
                                  namespace=namespace),
            status=V1beta1CronJobStatus(),
            spec=cron_job_spec,
        )
        kopf.adopt(job, nested="spec.template")
        cron_job_instance: V1beta1CronJob = BatchV1beta1Api(
        ).create_namespaced_cron_job(namespace=namespace, body=job)
        cronjob_instance_metadata: V1ObjectMeta = cron_job_instance.metadata
        logger.debug("Started Cron Job: %s", cronjob_instance_metadata.name)
        patch["metadata"] = {"labels": {"k8sJobType": "CronJob"}}
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobCreated",
                "jobName": cronjob_instance_metadata.name,
                "nodeType": node_type,
            }
        }
        return "CronJobCreated"
    else:
        job = V1Job(
            api_version="batch/v1",
            kind="Job",
            metadata=V1ObjectMeta(labels={
                **labels,
                **spec.get("compute", {}).get("labels", {})
            }),
            spec=job_spec,
        )

        kopf.adopt(job, nested="spec.template")
        job_instance: V1Job = BatchV1Api().create_namespaced_job(
            namespace=namespace, body=job)

        job_instance_metadata: V1ObjectMeta = job_instance.metadata
        logger.debug("Started Job: %s", job_instance_metadata.name)
        patch["metadata"] = {"labels": {"k8sJobType": "Job"}}
        patch["status"] = {
            "orbitJobOperator": {
                "jobStatus": "JobCreated",
                "jobName": job_instance_metadata.name,
                "nodeType": node_type,
            }
        }
        return "JobCreated"
Example #14
def delete_job(batch: BatchV1Api, namespace: str, job_name: str):
    batch.delete_namespaced_job(job_name, namespace, V1DeleteOptions())
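Passing an empty V1DeleteOptions() relies on the server-side default propagation, which for Jobs can leave the job's pods behind; that is presumably why Example #10 deletes matching pods separately. A cascading variant, as a sketch rather than the original code:

# Sketch: delete the job and let the API server remove its pods first.
def delete_job_cascading(batch: BatchV1Api, namespace: str, job_name: str):
    batch.delete_namespaced_job(
        job_name,
        namespace,
        body=V1DeleteOptions(propagation_policy="Foreground"),
    )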
Example #15
def find_jobs(batch: BatchV1Api, namespace: str, name_regex: str):
    for job in batch.list_namespaced_job(namespace).items:
        if (re.match(name_regex, job.metadata.name)
                and job.status.succeeded == 1):
            yield job.metadata.name
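Hedged usage of the generator; the namespace and regex are illustrative:

# Illustrative: print the names of succeeded jobs matching a prefix.
for job_name in find_jobs(BatchV1Api(), "default", r"^cleanup-.*$"):
    print(job_name)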
Example #16
def test_flush_manager(options: Dict[str, Any], emulator: str, web: str):
    print("starting test")
    api = CoreV1Api()
    # max number of loops to run when waiting for kube actions to complete
    max_wait_loops = 20 if options["cluster"] is None else 60

    # server has invalid PUBSUB_EMULATOR, so that only flush can deliver messages
    static_pvs = options["cluster"] is None
    if static_pvs:
        create_static_pvs(api)

    print("waiting for pods to be healthy")
    for _ in range(max_wait_loops):
        if all(pod.status.phase == "Running"
               for pod in api.list_namespaced_pod("default").items):
            break
        time.sleep(1)
    else:
        assert False, "pods did not become healthy"

    # create a subscription to the defined topic
    print("creating pubsub subscription")
    os.environ["PUBSUB_EMULATOR_HOST"] = emulator
    sub_client = SubscriberClient()
    topic_path = "projects/{project}/topics/{topic}".format(**options)
    subscription_path = "projects/{project}/subscriptions/{topic}".format(
        **options)
    try:
        sub_client.create_subscription(subscription_path,
                                       topic_path,
                                       retry=None)
    except AlreadyExists:
        pass

    print("posting message 0")
    requests.post(web, headers={
        "host": "web"
    }, json={
        "id": 0
    }).raise_for_status()
    print("setting up race condition: attached pvc is also deleted")
    delete_pvcs(api)
    print("setting up race condition: pod unschedulable due to missing pvc")
    with pytest.raises(ApiException) as excinfo:
        restart_web_pods(api)
    assert excinfo.value.reason == "Conflict"
    print("posting message 1")
    with pytest.raises(requests.exceptions.ConnectionError):
        requests.post(web, headers={
            "host": "web"
        }, json={
            "id": 1
        }).raise_for_status()

    print("starting flush-manager")
    # TODO optionally run flush-manager via subprocess.Popen, to ensure testing
    # current code and enable code coverage
    _sa = kube_resource("kube/flush-manager.sa.yml", **options)
    _cluster_role = kube_resource("kube/flush-manager.clusterrole.yml",
                                  **options)
    _cluster_role_binding = kube_resource(
        "kube/flush-manager.clusterrolebinding.yml", **options)
    _role = kube_resource("kube/flush-manager.role.yml", **options)
    _role_binding = kube_resource("kube/flush-manager.rolebinding.yml",
                                  **options)
    _deploy = kube_resource("kube/flush-manager.deploy.yml", **options)
    with _sa, _cluster_role, _cluster_role_binding, _role, _role_binding, _deploy:
        print("posting message 2 until successful")
        for i in range(max_wait_loops):
            try:
                requests.post(web, headers={
                    "host": "web"
                }, json={
                    "id": 2
                }).raise_for_status()
            except requests.exceptions.ConnectionError:
                if i > 0 and static_pvs:
                    create_static_pvs(api)
            else:
                break
            time.sleep(1)
        else:
            assert False, "pod did not recover"
        # scale to 0 pods
        print("scaling web to 0 pods")
        AppsV1Api().patch_namespaced_stateful_set_scale(
            name="web",
            namespace="default",
            body=V1StatefulSet(api_version="apps/v1",
                               kind="StatefulSet",
                               spec=dict(replicas=0)),
        )
        # wait for no pvcs
        print("waiting for cleanup to complete")
        for _ in range(max_wait_loops):
            if not api.list_persistent_volume().items:
                break
            time.sleep(1)
        else:
            print("pvs were not cleaned up")
            assert [] == api.list_persistent_volume().items
    # assert jobs and pvcs also deleted
    assert [] == list_pvcs(api)
    assert [] == BatchV1Api().list_namespaced_job("default").items
    # assert received message id 0 and 2
    assert [b'{"id": 0}', b'{"id": 2}'] == [
        element.message.data
        for element in sub_client.pull(subscription_path, 2).received_messages
    ]
Example #17
    def _job_dispatch(self):
        api = BatchV1Api(get_kubernetes_api_client())

        def _actual_work():
            idle = True
            try:
                for item in Task.objects.filter(
                        status=TASK.SCHEDULED).order_by("create_time"):
                    idle = False
                    conf = json.loads(item.settings.container_config)
                    common_name = "task-exec-{}".format(item.uuid)
                    shared_storage_name = "shared-{}".format(item.uuid)
                    user_storage_name = "user-{}".format(item.uuid)
                    user_dir = "/cloud_scheduler_userspace/"
                    create_namespace()
                    create_userspace_pvc()
                    if not get_userspace_pvc():
                        item.status = TASK.FAILED
                        item.logs_get = True
                        item.logs = "Failed to get user space storage"
                        item.save(force_update=True)
                    else:
                        try:
                            if not config_checker(conf):
                                raise ValueError(
                                    "Invalid config for TaskSettings: {}".
                                    format(item.settings.uuid))
                            # kubernetes part
                            shell = conf['shell']
                            commands = []
                            mem_limit = conf['memory_limit']
                            time_limit = item.settings.time_limit
                            working_dir = conf['working_path']
                            image = conf['image']
                            shared_pvc = conf['persistent_volume']['name']
                            shared_mount_path = conf['persistent_volume'][
                                'mount_path']
                            script_path = conf['task_script_path']

                            commands.append('mkdir -p {}'.format(working_dir))
                            commands.append('cp -r {}/* {}'.format(
                                user_dir, working_dir))
                            # snapshot
                            commands.append('cp -r {}/* {}'.format(
                                shared_mount_path + '/' + script_path,
                                working_dir))
                            # overwrite
                            commands.append(
                                'chmod -R +x {}'.format(working_dir))
                            commands.append('cd {}'.format(working_dir))
                            commands.append(
                                'timeout --signal TERM {timeout} {shell} -c \'{commands}\''
                                .format(timeout=time_limit,
                                        shell=shell,
                                        commands=';'.join(conf['commands'])))

                            shared_mount = client.V1VolumeMount(
                                mount_path=shared_mount_path,
                                name=shared_storage_name,
                                read_only=True)
                            user_mount = client.V1VolumeMount(
                                mount_path='/cloud_scheduler_userspace/',
                                name=user_storage_name,
                                sub_path="user_{}_task_{}".format(
                                    item.user_id, item.settings_id),
                                read_only=True)
                            env_username = client.V1EnvVar(
                                name="CLOUD_SCHEDULER_USER",
                                value=item.user.username)
                            env_user_uuid = client.V1EnvVar(
                                name="CLOUD_SCHEDULER_USER_UUID",
                                value=item.user.uuid)
                            container_settings = {
                                'name': 'task-container',
                                'image': image,
                                'volume_mounts': [shared_mount, user_mount],
                                'command': [shell],
                                'args': ['-c', ';'.join(commands)],
                                'env': [env_username, env_user_uuid]
                            }
                            if mem_limit:
                                container_settings[
                                    'resources'] = client.V1ResourceRequirements(
                                        limits={'memory': mem_limit})
                            container = client.V1Container(
                                **container_settings)
                            persistent_volume_claim = client.V1PersistentVolumeClaimVolumeSource(
                                claim_name=shared_pvc)
                            user_volume_claim = client.V1PersistentVolumeClaimVolumeSource(
                                claim_name=USERSPACE_NAME)
                            volume = client.V1Volume(
                                name=shared_storage_name,
                                persistent_volume_claim=persistent_volume_claim
                            )
                            user_volume = client.V1Volume(
                                name=user_storage_name,
                                persistent_volume_claim=user_volume_claim)
                            template = client.V1PodTemplateSpec(
                                metadata=client.V1ObjectMeta(
                                    labels={"task-exec": item.uuid}),
                                spec=client.V1PodSpec(
                                    restart_policy="Never",
                                    containers=[container],
                                    volumes=[volume, user_volume]))
                            spec = client.V1JobSpec(
                                template=template,
                                backoff_limit=0,
                                active_deadline_seconds=GLOBAL_TASK_TIME_LIMIT)
                            job = client.V1Job(
                                api_version="batch/v1",
                                kind="Job",
                                metadata=client.V1ObjectMeta(name=common_name),
                                spec=spec)
                            _ = api.create_namespaced_job(
                                namespace=KUBERNETES_NAMESPACE, body=job)
                            item.status = TASK.WAITING
                            item.save(force_update=True)
                        except ApiException as ex:
                            LOGGER.warning("Kubernetes ApiException %d: %s",
                                           ex.status, ex.reason)
                        except ValueError as ex:
                            LOGGER.warning(ex)
                            item.status = TASK.FAILED
                            item.save(force_update=True)
                        except Exception as ex:
                            LOGGER.error(ex)
                            item.status = TASK.FAILED
                            item.save(force_update=True)
            except Exception as ex:
                LOGGER.error(ex)
            if idle:
                time.sleep(1)

        while True:
            _actual_work()
            if self.test:
                break
Example #18
    def _job_monitor(self):
        api = CoreV1Api(get_kubernetes_api_client())
        job_api = BatchV1Api(get_kubernetes_api_client())

        def _actual_work():
            idle = True
            try:
                for item in Task.objects.filter(
                        Q(status=TASK.WAITING) | Q(status=TASK.RUNNING)
                        | Q(status=TASK.PENDING)).order_by("create_time"):
                    common_name = "task-exec-{}".format(item.uuid)
                    try:
                        response = api.list_namespaced_pod(
                            namespace=KUBERNETES_NAMESPACE,
                            label_selector="task-exec={}".format(item.uuid))
                        if response.items:
                            status = response.items[0].status.phase
                            new_status = item.status
                            deleting = response.items[
                                0].metadata.deletion_timestamp
                            if status == 'Running':
                                new_status = TASK.RUNNING
                            elif status == 'Succeeded':
                                new_status = TASK.SUCCEEDED
                            elif status == 'Pending' and not deleting:
                                new_status = TASK.PENDING
                            elif status == 'Failed':
                                new_status = TASK.FAILED
                            if new_status != item.status:
                                if status in ('Succeeded', 'Failed'):
                                    exit_code = None
                                    detailed_status = response.items[
                                        0].status.container_statuses
                                    if detailed_status and detailed_status[
                                            0].state.terminated:
                                        exit_code = detailed_status[
                                            0].state.terminated.exit_code
                                        LOGGER.debug(exit_code)
                                    response = api.read_namespaced_pod_log(
                                        name=response.items[0].metadata.name,
                                        namespace=KUBERNETES_NAMESPACE)
                                    if response:
                                        item.logs = response
                                    item.logs_get = True
                                    if exit_code:
                                        item.exit_code = exit_code
                                    if exit_code == 124:  # SIGTERM by TLE
                                        item.logs += "\nTime limit exceeded when executing job."
                                        new_status = TASK.TLE
                                    elif exit_code == 137:  # SIGKILL by MLE
                                        item.logs += "\nMemory limit exceeded when executing job."
                                        new_status = TASK.MLE
                                    job_api.delete_namespaced_job(
                                        name=common_name,
                                        namespace=KUBERNETES_NAMESPACE,
                                        body=client.V1DeleteOptions(
                                            propagation_policy='Foreground',
                                            grace_period_seconds=3))
                                item.status = new_status
                                idle = False
                                item.save(force_update=True)
                        # else wait for a period because it takes time for corresponding pod to be initialized
                    except ApiException as ex:
                        LOGGER.warning(ex)
                for item in Task.objects.filter(status=TASK.DELETING):
                    common_name = "task-exec-{}".format(item.uuid)
                    try:
                        _ = job_api.delete_namespaced_job(
                            name=common_name,
                            namespace=KUBERNETES_NAMESPACE,
                            body=client.V1DeleteOptions(
                                propagation_policy='Foreground',
                                grace_period_seconds=5))
                        LOGGER.info(
                            "The kubernetes job of Task: %s deleted successfully",
                            item.uuid)
                        item.delete()
                    except ApiException as ex:
                        if ex.status == 404:
                            item.delete()
                        else:
                            LOGGER.warning("Kubernetes ApiException %d: %s",
                                           ex.status, ex.reason)
                    except Exception as ex:
                        LOGGER.error(ex)

            except Exception as ex:
                LOGGER.error(ex)
            if idle:
                time.sleep(1)

        while True:
            _actual_work()
            if self.test:
                break
Example #19
def get_kubernetes_api_clients() -> Tuple[CoreV1Api, BatchV1Api]:
    logger.info("Creating Kubernetes api clients")
    load_kubernetes_config(os.environ.get("KUBECONFIG", None))
    api_client = ApiClient(kubernetes.client.Configuration())
    return CoreV1Api(api_client=api_client), BatchV1Api(api_client=api_client)
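Callers unpack the pair, for example (illustrative):

# Illustrative: build both clients once and reuse them.
core_api, batch_api = get_kubernetes_api_clients()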
Example #20
def create_register_job(dataset, email, password) -> Optional[str]:
    config.load_incluster_config()
    batch_v1 = BatchV1Api()
    # job names cannot include underscores
    job_name = f'register-{dataset}'.replace('_', '-')
    # TODO: Handle datasets that require extra dependencies (matplotlib, etc.)
    #  Also remove those extra dependencies from requirements.txt
    try:
        batch_v1.create_namespaced_job('default', {
            "apiVersion": "batch/v1",
            "kind": "Job",
            "metadata": {"name": job_name},
            "spec": {
                "template": {
                    "spec": {
                        "containers": [{
                            "name": job_name,
                            "image": "cyclotomic/blueno-registration-pipelines",
                            "imagePullPolicy": "Always",
                            "command": ["python"],
                            "args": [
                                "main.py",
                                dataset,
                                email,
                                password
                            ],
                            "env": [
                                {
                                    "name": "FILESYSTEM_STORE_ROOT",
                                    "value": "/root"
                                },
                                {
                                    "name": "BLUENO_SERVER",
                                    "value": "http://blueno-server"
                                }
                            ],
                            "volumeMounts": [
                                {
                                    # We mount to /root because tensorflow
                                    # datasets still downloads data to
                                    # the default (~/tensorflow_datasets)
                                    "mountPath": "/root",
                                    "name": "nfs"
                                }
                            ],
                            "resources": {
                                "requests": {
                                    "memory": "2Gi",
                                }
                            },
                        }],
                        "volumes": [
                            {
                                "name": "nfs",
                                "persistentVolumeClaim": {
                                    "claimName": "blueno-nfs"
                                }
                            }
                        ],
                        # We avoid restarting on failure because a new
                        # NFS mount is needed if the NFS server pod fails
                        "restartPolicy": "Never"
                    }
                }
            }
        })
        return None
    except ApiException as e:
        return json.loads(e.body)['message']
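A hedged usage example; the dataset name and credentials are placeholders:

# Illustrative call: a return value of None means the job was created;
# otherwise the Kubernetes error message is returned.
error = create_register_job("example_dataset", "user@example.com", "not-a-real-password")
if error is not None:
    print(f"failed to create registration job: {error}")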