Beispiel #1
0
def get_streaming_app_cronjob(
    name: str = "test-cronjob",
    input_topics: Optional[str] = None,
    output_topic: Optional[str] = "output-topic",
    error_topic: Optional[str] = "error-topic",
    env_prefix: str = "APP_",
    pipeline: Optional[str] = None,
) -> V1beta1CronJob:
    """Assemble a single-container cron job object for a streaming app.

    The container is named ``test-container`` and receives the environment
    produced by ``get_env`` for the given topics and prefix. The job runs on
    the ``* * * * *`` schedule and carries the metadata produced by
    ``get_metadata`` for ``name`` and ``pipeline``.
    """
    container_env = get_env(
        input_topics,
        output_topic,
        error_topic,
        env_prefix=env_prefix,
    )

    # Build from the innermost object (container) outwards to the cron spec.
    pod_template = V1PodTemplateSpec(
        spec=V1PodSpec(
            containers=[V1Container(name="test-container", env=container_env)],
        ),
    )
    cron_spec = V1beta1CronJobSpec(
        job_template=V1beta1JobTemplateSpec(
            spec=V1JobSpec(template=pod_template, selector=None),
        ),
        schedule="* * * * *",
    )

    return V1beta1CronJob(
        metadata=get_metadata(name, pipeline=pipeline),
        spec=cron_spec,
    )
Beispiel #2
0
def _create_flush_job(
    batch_api: BatchV1Api,
    command: List[str],
    env: List[V1EnvVar],
    image: str,
    name: str,
    namespace: str,
    service_account_name: str,
) -> V1Job:
    """Create the "flush" job, or return the existing one with the same name.

    The job runs a single container called ``flush`` with ``command`` and
    ``env``, mounts the persistent volume claim named ``name`` at ``/data``,
    and restarts on failure.

    Returns:
        The job returned by the create call, or the job read back from the
        API when creation is rejected because it already exists.

    Raises:
        ApiException: for any API error other than the "already exists"
            conflict.
    """
    logger.info(f"creating job: {name}")

    flush_container = V1Container(
        image=image,
        command=command,
        name="flush",
        volume_mounts=[V1VolumeMount(mount_path="/data", name="queue")],
        env=env,
    )
    # The claim backing the "queue" volume shares its name with the job.
    queue_volume = V1Volume(
        name="queue",
        persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
            claim_name=name
        ),
    )
    pod_spec = V1PodSpec(
        containers=[flush_container],
        restart_policy="OnFailure",
        volumes=[queue_volume],
        service_account_name=service_account_name,
    )
    job_body = V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=V1ObjectMeta(name=name, namespace=namespace),
        spec=V1JobSpec(template=V1PodTemplateSpec(spec=pod_spec)),
    )

    try:
        return batch_api.create_namespaced_job(namespace=namespace, body=job_body)
    except ApiException as e:
        # Only a conflict whose body reports "already exists" is recoverable.
        already_exists = (
            e.reason == CONFLICT and json.loads(e.body)["reason"] == ALREADY_EXISTS
        )
        if not already_exists:
            raise
        logger.info(f"using existing job: {name}")
        return batch_api.read_namespaced_job(name, namespace)
 def cron_jobs(self):
     """Return a one-element list holding a ``test-cronjob`` cron job.

     The job's single container (``test-container``) exposes ``ENV_PREFIX``
     and a prefixed ``OUTPUT_TOPIC`` variable; the schedule is ``* * * * *``.
     """
     prefix = "APP_"
     container = V1Container(
         name="test-container",
         env=[
             V1EnvVar(name="ENV_PREFIX", value=prefix),
             V1EnvVar(name=f"{prefix}OUTPUT_TOPIC", value="output-topic"),
         ],
     )
     pod_template = V1PodTemplateSpec(spec=V1PodSpec(containers=[container]))
     cron_spec = V1beta1CronJobSpec(
         job_template=V1beta1JobTemplateSpec(
             spec=V1JobSpec(template=pod_template, selector=""),
         ),
         schedule="* * * * *",
     )
     cron_job = V1beta1CronJob(
         metadata=V1ObjectMeta(name="test-cronjob"),
         spec=cron_spec,
     )
     return [cron_job]
Beispiel #4
0
 def _create_kube_job(self,
                      op_inst,
                      podspec,
                      namespace=KubernetesConfig.K8S_NAMESPACE):
     """Wrap ``podspec`` in a job object named after the operator instance.

     The job is named ``<guid>-job`` and its pod template ``<guid>-pod``.
     Both carry the operator-GUID label so a service can bind to them.
     The job is only constructed here; nothing is submitted to the cluster.
     """
     guid = op_inst.guid

     def _labelled_meta(suffix):
         # A fresh labels dict per object, so the job and the pod template
         # never share mutable state.
         return client.V1ObjectMeta(
             name=guid + suffix,
             namespace=namespace,
             labels={KubernetesConfig.K8S_LABELS_OPGUID: guid},
         )

     job_metadata = _labelled_meta("-job")
     pod_template = V1PodTemplateSpec(metadata=_labelled_meta("-pod"),
                                      spec=podspec)
     return V1Job(metadata=job_metadata,
                  spec=V1JobSpec(template=pod_template))
    def __init__(self, namespace: str = "default",
                 job_spec_template: V1JobSpec = None,
                 print_output: bool = False):
        """Set up the namespace, job spec template and output flag.

        :param namespace: Namespace the job applies to. ``None`` is treated the same as the
            default value, ``"default"``.
        :param job_spec_template: Optional Kubernetes job spec used as a template for the optional
            job spec properties. It is stored as given (not copied); when omitted, a job spec
            holding an empty pod template is created. Derived classes are expected to work on a
            copy and replace its template field with the pod template for their operation.
        :param print_output: When True, information may be printed to the console. Defaults to False.
        """
        self.namespace = "default" if namespace is None else namespace
        self.__job_spec = (
            V1JobSpec(template=V1PodTemplateSpec())
            if job_spec_template is None
            else job_spec_template
        )
        self.print_output = print_output
Beispiel #6
0
    def launch(self, name, docker_config: DockerConfig, mounts, env, blocking: bool = True):
        """Run an update container as a Kubernetes job, replacing any job with the same name.

        Steps, in order:
          1. Derive the job name from ``self.prefix``, ``'update-'`` and the lower-cased
             ``name``, with underscores replaced by hyphens.
          2. Reconcile the image pull secret: create or replace it when registry
             credentials are configured, delete a leftover one when they are not.
          3. Delete an existing job with the same name and wait until reading it fails.
          4. Build volumes, environment, container, pod and job objects and create the job.
          5. When ``blocking`` is true, poll until the job reports failure or success,
             then delete it.

        Args:
            name: Base name of the update; used to derive the job and secret names.
            docker_config: Container settings. This method reads ``image``, ``command``,
                ``environment`` (items with ``name``/``value``), ``cpu_cores``, ``ram_mb``,
                ``ram_mb_min``, ``registry_username`` and ``registry_password``.
            mounts: Iterable of mappings with ``'volume'`` (claim name), ``'dest_path'``
                (mount path) and ``'source_path'`` (sub path) keys.
            env: Mapping of extra environment variable names to values.
            blocking: Wait for the job to finish and clean it up before returning.

        Raises:
            ApiException: From secret handling, job creation, or the blocking wait
                (errors with status 404 are tolerated where the code checks for them).
        """
        name = (self.prefix + 'update-' + name.lower()).replace('_', '-')

        # If we have been given a username or password for the registry, we have to
        # update it, if we haven't been, make sure its been cleaned up in the system
        # so we don't leave passwords lying around
        pull_secret_name = f'{name}-job-pull-secret'
        use_pull_secret = False
        try:
            # Check if there is already a username/password defined for this job
            current_pull_secret = self.api.read_namespaced_secret(pull_secret_name, self.namespace,
                                                                  _request_timeout=API_TIMEOUT)
        except ApiException as error:
            # A 404 just means no secret exists yet; anything else is a real failure.
            if error.status != 404:
                raise
            current_pull_secret = None

        if docker_config.registry_username or docker_config.registry_password:
            use_pull_secret = True
            # Build the secret we want to make
            new_pull_secret = V1Secret(
                metadata=V1ObjectMeta(name=pull_secret_name, namespace=self.namespace),
                type='kubernetes.io/dockerconfigjson',
                string_data={
                    '.dockerconfigjson': create_docker_auth_config(
                        image=docker_config.image,
                        username=docker_config.registry_username,
                        password=docker_config.registry_password,
                    )
                }
            )

            # Send it to the server
            if current_pull_secret:
                self.api.replace_namespaced_secret(pull_secret_name, namespace=self.namespace,
                                                   body=new_pull_secret, _request_timeout=API_TIMEOUT)
            else:
                self.api.create_namespaced_secret(namespace=self.namespace, body=new_pull_secret,
                                                  _request_timeout=API_TIMEOUT)
        elif current_pull_secret:
            # If there is a password set in kubernetes, but not in our configuration clear it out
            self.api.delete_namespaced_secret(pull_secret_name, self.namespace, _request_timeout=API_TIMEOUT)

        # Remove any previous job with this name, then poll once a second until
        # reading it raises. The ApiException is the loop's only exit.
        # NOTE(review): every ApiException is swallowed here (not only 404), including
        # one raised by the delete call itself, and the wait has no upper bound --
        # confirm this best-effort behaviour is intended.
        try:
            self.batch_api.delete_namespaced_job(name=name, namespace=self.namespace,
                                                 propagation_policy='Background', _request_timeout=API_TIMEOUT)
            while True:
                self.batch_api.read_namespaced_job(namespace=self.namespace, name=name,
                                                   _request_timeout=API_TIMEOUT)
                time.sleep(1)
        except ApiException:
            pass

        volumes = []
        volume_mounts = []

        # One claim-backed volume plus a matching writable mount per requested entry;
        # the index keeps the generated volume names unique.
        for index, mnt in enumerate(mounts):
            volumes.append(V1Volume(
                name=f'mount-{index}',
                persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                    claim_name=mnt['volume'],
                    read_only=False
                ),
            ))

            volume_mounts.append(V1VolumeMount(
                name=f'mount-{index}',
                mount_path=mnt['dest_path'],
                sub_path=mnt['source_path'],
                read_only=False,
            ))

        # When a configuration config map is configured, mount its "config" entry
        # read-only at the fixed config file path.
        if CONFIGURATION_CONFIGMAP:
            volumes.append(V1Volume(
                name='mount-configuration',
                config_map=V1ConfigMapVolumeSource(
                    name=CONFIGURATION_CONFIGMAP
                ),
            ))

            volume_mounts.append(V1VolumeMount(
                name='mount-configuration',
                mount_path='/etc/assemblyline/config.yml',
                sub_path="config",
                read_only=True,
            ))

        section = 'service'
        labels = {
            'app': 'assemblyline',
            'section': section,
            'privilege': 'core',
            'component': 'update-script',
        }
        # Instance-level extra labels are applied last, so they override the fixed ones above.
        labels.update(self.extra_labels)

        # Note: this metadata carries no namespace; the namespace is passed to the
        # create call instead.
        metadata = V1ObjectMeta(
            name=name,
            labels=labels
        )

        # Environment, in order: variables from the docker config, caller-supplied
        # variables, variables inherited from this process, then LOG_LEVEL.
        environment_variables = [V1EnvVar(name=_e.name, value=_e.value) for _e in docker_config.environment]
        environment_variables.extend([V1EnvVar(name=k, value=v) for k, v in env.items()])
        environment_variables.extend([V1EnvVar(name=k, value=os.environ[k])
                                      for k in INHERITED_VARIABLES if k in os.environ])
        environment_variables.append(V1EnvVar(name="LOG_LEVEL", value=self.log_level))

        cores = docker_config.cpu_cores
        memory = docker_config.ram_mb
        # The memory request can never exceed the memory limit.
        memory_min = min(docker_config.ram_mb_min, memory)

        container = V1Container(
            name=name,
            image=docker_config.image,
            command=docker_config.command,
            env=environment_variables,
            image_pull_policy='Always',
            volume_mounts=volume_mounts,
            resources=V1ResourceRequirements(
                limits={'cpu': cores, 'memory': f'{memory}Mi'},
                # Request a quarter of the CPU limit.
                requests={'cpu': cores / 4, 'memory': f'{memory_min}Mi'},
            )
        )

        pod = V1PodSpec(
            volumes=volumes,
            restart_policy='Never',
            containers=[container],
            priority_class_name=self.priority_class,
        )

        # Reference the pull secret only when it was created or replaced above.
        if use_pull_secret:
            pod.image_pull_secrets = [V1LocalObjectReference(name=pull_secret_name)]

        # The same metadata object is used for both the job and its pod template.
        job = V1Job(
            metadata=metadata,
            spec=V1JobSpec(
                backoff_limit=1,
                completions=1,
                template=V1PodTemplateSpec(
                    metadata=metadata,
                    spec=pod
                )
            )
        )

        status = self.batch_api.create_namespaced_job(namespace=self.namespace, body=job,
                                                      _request_timeout=API_TIMEOUT).status

        if blocking:
            try:
                # Poll every three seconds until the job reports a failed or succeeded
                # count, then delete the finished job.
                # NOTE(review): there is no timeout on this wait -- confirm callers rely
                # on the job always reaching a terminal state.
                while not (status.failed or status.succeeded):
                    time.sleep(3)
                    status = self.batch_api.read_namespaced_job(namespace=self.namespace, name=name,
                                                                _request_timeout=API_TIMEOUT).status

                self.batch_api.delete_namespaced_job(name=name, namespace=self.namespace,
                                                     propagation_policy='Background', _request_timeout=API_TIMEOUT)
            except ApiException as error:
                # A 404 means the job is already gone, which is the desired end state.
                if error.status != 404:
                    raise
Beispiel #7
0
    def create_simple_job_spec(
            self,
            experiment: Experiment,
            name: str,
            image: str,
            min_cpu: int,
            min_mem: int,
            command: Optional[List[str]] = None,
            env: Optional[Dict[str, str]] = None,
            accelerator: Optional[Accelerator] = None,
            accelerator_count: int = 1,
            namespace: str = k.DEFAULT_NAMESPACE,
            machine_type: Optional[MachineType] = None,
            preemptible: bool = True,
            preemptible_tpu: bool = True,
            tpu_driver: str = k.DEFAULT_TPU_DRIVER) -> Optional[JobSpec]:
        """Create a simple kubernetes job (1 container, 1 pod) JobSpec for this cluster.

        The container arguments are derived from ``experiment`` via
        ``conf.experiment_to_args``; they are not a parameter of this method.

        Args:
            experiment: experiment whose kwargs/args become the container args
                and which the resulting JobSpec is associated with
            name: job name, also used as the container name
            image: container image url (gcr.io/...)
            min_cpu: minimum cpu needed, in milli-cpu
            min_mem: minimum memory needed, in MB
            command: command to execute, None = container entrypoint
            env: environment vars for container, None = no extra variables
            accelerator: accelerator type, None=cpu only
            accelerator_count: accelerator count
            namespace: kubernetes namespace (accepted for interface
                compatibility; not used when building the spec here)
            machine_type: machine type, None=default for mode (cpu/gpu)
            preemptible: use preemptible instance
            preemptible_tpu: use preemptible tpus
            tpu_driver: tpu driver to use

        Returns:
            The result of ``JobSpec.get_or_create`` for the experiment, the
            serialized job spec and the GKE platform.
        """

        args = conf.experiment_to_args(experiment.kwargs, experiment.args)

        # ------------------------------------------------------------------------
        # container

        # tpu/gpu resources
        container_resources = V1ResourceRequirements(
            requests=Cluster.container_requests(min_cpu, min_mem),
            limits=Cluster.container_limits(
                accelerator,
                accelerator_count,
                preemptible_tpu,
            ),
        )

        # `env` defaults to None rather than a shared mutable dict; treat a
        # missing mapping as empty. Loop names avoid shadowing the module
        # alias `k` used in the signature defaults.
        container_env = [
            V1EnvVar(name=var_name, value=var_value)
            for var_name, var_value in (env or {}).items()
        ]

        # this is a simple 1-container, 1-pod job, so we just name the
        # container the same thing (minus the generated suffix) as the job itself
        container = V1Container(
            name=name,
            image=image,
            command=command,
            args=args,
            resources=container_resources,
            env=container_env,
            image_pull_policy='Always',
        )

        # ------------------------------------------------------------------------
        # template

        # todo: should we support anything other than a 'never' restart policy?
        # see this for discussion
        # https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy

        tolerations = Cluster.tolerations(preemptible=preemptible)

        # backoff count plus 'OnFailure' may be correct here
        template_spec = V1PodSpec(
            restart_policy='Never',
            containers=[container],
            tolerations=tolerations,
            node_selector=Cluster.node_selector(
                preemptible=preemptible,
                machine_type=machine_type,
                accelerator=accelerator,
            ),
            host_ipc=True,
        )

        template = V1PodTemplateSpec(
            metadata=Cluster.template_metadata(
                accelerator=accelerator,
                tpu_driver=tpu_driver,
            ),
            spec=template_spec,
        )

        # ------------------------------------------------------------------------
        # job
        job_spec = V1JobSpec(template=template, backoff_limit=4)

        # The spec is stored in its serialized (plain dict) form.
        return JobSpec.get_or_create(
            experiment=experiment,
            spec=ApiClient().sanitize_for_serialization(job_spec),
            platform=Platform.GKE,
        )
Beispiel #8
0
def construct_job_spec(
    env: str,
    team: str,
    env_context: Dict[str, Any],
    podsetting_metadata: Dict[str, Any],
    orbit_job_spec: kopf.Spec,
    labels: kopf.Labels,
) -> V1JobSpec:
    """Build the Kubernetes job spec for an orbit job.

    The job's compute settings are re-keyed to their SDK names and passed to
    the runner pod as JSON environment variables, alongside the task type and
    tasks. The pod is produced by ``_make_pod``, forced to never restart, and
    wrapped in a job spec with no retries and a TTL taken from the
    ``TTL_SECONDS_AFTER_FINISHED`` environment variable (default 120).
    """
    compute = orbit_job_spec.get(
        "compute",
        {"computeType": "eks", "nodeType": "fargate"},
    )

    # Convert all the compute parameters to their SDK equivalents until we fix
    # the SDK and python-utils. Pairs are (job spec key, SDK key); the order
    # here fixes the key order of the serialized "compute" variable.
    key_renames = (
        ("computeType", "compute_type"),
        ("nodeType", "node_type"),
        ("env", "env_vars"),
        ("snsTopicName", "sns.topic.name"),
        ("priorityClassName", "priorityClassName"),
        ("podSetting", "podsetting"),
        ("labels", "labels"),
    )
    converted_compute = {
        sdk_key: compute[spec_key]
        for spec_key, sdk_key in key_renames
        if spec_key in compute
    }
    # Container settings are forwarded only when a process count is given.
    if "container" in compute and "concurrentProcesses" in compute["container"]:
        converted_compute["container"] = {
            "p_concurrent": compute["container"]["concurrentProcesses"]
        }

    # Object labels first, then compute labels, then the fixed runner labels.
    pod_labels = {
        **labels,
        **orbit_job_spec.get("compute", {}).get("labels", {}),
        "app": "orbit-runner",
        f"orbit/{podsetting_metadata.get('name', None)}": "",
    }

    pod_env = {
        "task_type": orbit_job_spec["taskType"],
        "tasks": json.dumps({"tasks": orbit_job_spec["tasks"]}),
        "compute": json.dumps({"compute": converted_compute}),
        "AWS_ORBIT_ENV": env,
        "AWS_ORBIT_TEAM_SPACE": team,
    }

    # Fall back to the environment's JupyterUser image when the pod setting
    # does not pin one.
    pod_image = podsetting_metadata["image"]
    if pod_image is None:
        jupyter_image = env_context["Images"]["JupyterUser"]
        pod_image = f"{jupyter_image['Repository']}:{jupyter_image['Version']}"

    # NOTE(review): priority_class_name is read from the top level of the job
    # spec, not from its "compute" section -- confirm that is intended.
    pod = _make_pod(
        cmd=["bash", "-c", "python /opt/python-utils/notebook_cli.py"],
        port=22,
        image=pod_image,
        service_account="default-editor",
        run_privileged=False,
        allow_privilege_escalation=True,
        env=pod_env,
        priority_class_name=orbit_job_spec.get("priorityClassName"),
        labels=pod_labels,
        run_as_uid=1000,
        run_as_gid=100,
    )
    pod.spec.restart_policy = "Never"

    ttl_seconds = int(os.environ.get("TTL_SECONDS_AFTER_FINISHED", 120))
    return V1JobSpec(
        backoff_limit=0,
        template=pod,
        ttl_seconds_after_finished=ttl_seconds,
    )
def mock_job() -> V1Job:
    """Return a minimal job named ``some-job`` in the ``default`` namespace.

    The job has an empty pod template and a status with no conditions.
    """
    return V1Job(
        metadata=V1ObjectMeta(namespace="default", name="some-job"),
        spec=V1JobSpec(template=V1PodTemplate()),
        status=V1JobStatus(conditions=[]),
    )