def get_streaming_app_cronjob(
    name: str = "test-cronjob",
    input_topics: Optional[str] = None,
    output_topic: Optional[str] = "output-topic",
    error_topic: Optional[str] = "error-topic",
    env_prefix: str = "APP_",
    pipeline: Optional[str] = None,
) -> V1beta1CronJob:
    env = get_env(
        input_topics,
        output_topic,
        error_topic,
        env_prefix=env_prefix,
    )
    container = V1Container(name="test-container", env=env)
    pod_spec = V1PodSpec(containers=[container])
    pod_template_spec = V1PodTemplateSpec(spec=pod_spec)
    job_spec = V1JobSpec(
        template=pod_template_spec,
        selector=None,
    )
    job_template = V1beta1JobTemplateSpec(spec=job_spec)
    spec = V1beta1CronJobSpec(job_template=job_template, schedule="* * * * *")
    metadata = get_metadata(name, pipeline=pipeline)
    return V1beta1CronJob(metadata=metadata, spec=spec)
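# A minimal usage sketch (not from the original source): build the CronJob above and
# submit it through the batch/v1beta1 API. The helper name, topic values, and the
# assumption of a configured kubernetes client (BatchV1beta1Api) are illustrative.
def submit_streaming_app_cronjob(namespace: str = "default") -> V1beta1CronJob:
    cron_job = get_streaming_app_cronjob(
        name="words-cronjob",
        input_topics="input-topic",
        pipeline="word-count",
    )
    # create_namespaced_cron_job posts the generated object to the cluster
    return BatchV1beta1Api().create_namespaced_cron_job(namespace=namespace, body=cron_job)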
def _create_flush_job(
    batch_api: BatchV1Api,
    command: List[str],
    env: List[V1EnvVar],
    image: str,
    name: str,
    namespace: str,
    service_account_name: str,
) -> V1Job:
    logger.info(f"creating job: {name}")
    try:
        return batch_api.create_namespaced_job(
            namespace=namespace,
            body=V1Job(
                api_version="batch/v1",
                kind="Job",
                metadata=V1ObjectMeta(name=name, namespace=namespace),
                spec=V1JobSpec(
                    template=V1PodTemplateSpec(
                        spec=V1PodSpec(
                            containers=[
                                V1Container(
                                    image=image,
                                    command=command,
                                    name="flush",
                                    volume_mounts=[
                                        V1VolumeMount(mount_path="/data", name="queue")
                                    ],
                                    env=env,
                                )
                            ],
                            restart_policy="OnFailure",
                            volumes=[
                                V1Volume(
                                    name="queue",
                                    persistent_volume_claim=(
                                        V1PersistentVolumeClaimVolumeSource(
                                            claim_name=name
                                        )
                                    ),
                                )
                            ],
                            service_account_name=service_account_name,
                        )
                    )
                ),
            ),
        )
    except ApiException as e:
        # A 409 Conflict with reason AlreadyExists means the job was created by a
        # previous or concurrent run; reuse it instead of failing.
        if e.reason == CONFLICT and json.loads(e.body)["reason"] == ALREADY_EXISTS:
            logger.info(f"using existing job: {name}")
            return batch_api.read_namespaced_job(name, namespace)
        raise
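# Hypothetical caller for _create_flush_job (a sketch, not part of the source). It shows
# the convention the function relies on: the queue name is used both as the Job name and
# as the PVC claim name. The command, env var, and service account here are placeholders.
def flush_queue(queue_name: str, namespace: str, image: str) -> V1Job:
    batch_api = BatchV1Api()
    env = [V1EnvVar(name="QUEUE_PATH", value="/data")]
    return _create_flush_job(
        batch_api=batch_api,
        command=["flush", "--once"],
        env=env,
        image=image,
        name=queue_name,
        namespace=namespace,
        service_account_name="flush",
    )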
def cron_jobs(self):
    env_prefix = "APP_"
    envs = [
        V1EnvVar(name="ENV_PREFIX", value=env_prefix),
        V1EnvVar(name=env_prefix + "OUTPUT_TOPIC", value="output-topic"),
    ]
    container = V1Container(name="test-container", env=envs)
    pod_spec = V1PodSpec(containers=[container])
    pod_template_spec = V1PodTemplateSpec(spec=pod_spec)
    job_spec = V1JobSpec(
        template=pod_template_spec,
        # V1JobSpec.selector expects a V1LabelSelector; leave it unset rather than
        # passing an empty string.
        selector=None,
    )
    job_template = V1beta1JobTemplateSpec(spec=job_spec)
    spec = V1beta1CronJobSpec(job_template=job_template, schedule="* * * * *")
    return [
        V1beta1CronJob(metadata=V1ObjectMeta(name="test-cronjob"), spec=spec)
    ]
def _create_kube_job(self, op_inst, podspec, namespace=KubernetesConfig.K8S_NAMESPACE):
    job_name = op_inst.guid + "-job"
    job_metadata = client.V1ObjectMeta(
        name=job_name,
        namespace=namespace,
        labels={KubernetesConfig.K8S_LABELS_OPGUID: op_inst.guid})  # Label for the service to bind to

    pod_name = op_inst.guid + "-pod"
    pod_metadata = client.V1ObjectMeta(
        name=pod_name,
        namespace=namespace,
        labels={KubernetesConfig.K8S_LABELS_OPGUID: op_inst.guid})  # Label for the service to bind to

    jobspec = V1JobSpec(
        template=V1PodTemplateSpec(metadata=pod_metadata, spec=podspec))
    kube_job = V1Job(metadata=job_metadata, spec=jobspec)
    return kube_job
def __init__(self,
             namespace: str = "default",
             job_spec_template: V1JobSpec = None,
             print_output: bool = False):
    """Initialize a DataMoverJob object.

    :param namespace: The namespace which applies to the job. Defaults to the default namespace.
    :param job_spec_template: A Kubernetes job spec object. This can be used to configure any of
        the optional properties of a job spec if desired. It is intended that this can be used as
        a template to provide optional parameters of the V1JobSpec object. Derived classes should
        use a copy of the job spec and replace the required template field with the appropriate
        pod template spec for the particular operation being performed.
    :param print_output: If True, enable information to be printed to the console. Default value
        is False.
    """
    if namespace is None:
        self.namespace = "default"
    else:
        self.namespace = namespace

    if job_spec_template is None:
        self.__job_spec = V1JobSpec(template=V1PodTemplateSpec())
    else:
        self.__job_spec = job_spec_template

    self.print_output = print_output
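# A usage sketch (assumption, not from the source): pass a partially populated V1JobSpec
# as the template so optional job-spec fields carry through, leaving the required
# template field as a placeholder for the derived class to replace, as the docstring
# above suggests. Field values are illustrative.
job_spec_template = V1JobSpec(
    backoff_limit=2,
    ttl_seconds_after_finished=600,
    template=V1PodTemplateSpec(),  # placeholder; the derived class supplies the real pod template
)
mover = DataMoverJob(namespace="data-mover", job_spec_template=job_spec_template, print_output=True)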
def launch(self, name, docker_config: DockerConfig, mounts, env, blocking: bool = True):
    name = (self.prefix + 'update-' + name.lower()).replace('_', '-')

    # If we have been given a username or password for the registry, we have to
    # update it; if we haven't been, make sure it has been cleaned up in the system
    # so we don't leave passwords lying around.
    pull_secret_name = f'{name}-job-pull-secret'
    use_pull_secret = False
    try:
        # Check if there is already a username/password defined for this job
        current_pull_secret = self.api.read_namespaced_secret(pull_secret_name, self.namespace,
                                                              _request_timeout=API_TIMEOUT)
    except ApiException as error:
        if error.status != 404:
            raise
        current_pull_secret = None

    if docker_config.registry_username or docker_config.registry_password:
        use_pull_secret = True
        # Build the secret we want to make
        new_pull_secret = V1Secret(
            metadata=V1ObjectMeta(name=pull_secret_name, namespace=self.namespace),
            type='kubernetes.io/dockerconfigjson',
            string_data={
                '.dockerconfigjson': create_docker_auth_config(
                    image=docker_config.image,
                    username=docker_config.registry_username,
                    password=docker_config.registry_password,
                )
            }
        )

        # Send it to the server
        if current_pull_secret:
            self.api.replace_namespaced_secret(pull_secret_name, namespace=self.namespace,
                                               body=new_pull_secret, _request_timeout=API_TIMEOUT)
        else:
            self.api.create_namespaced_secret(namespace=self.namespace, body=new_pull_secret,
                                              _request_timeout=API_TIMEOUT)
    elif current_pull_secret:
        # If there is a password set in kubernetes, but not in our configuration, clear it out
        self.api.delete_namespaced_secret(pull_secret_name, self.namespace, _request_timeout=API_TIMEOUT)

    try:
        # Remove any job left over from a previous run, then wait until the read
        # raises (404) to confirm it is gone before creating the new one.
        self.batch_api.delete_namespaced_job(name=name, namespace=self.namespace,
                                             propagation_policy='Background', _request_timeout=API_TIMEOUT)
        while True:
            self.batch_api.read_namespaced_job(namespace=self.namespace, name=name,
                                               _request_timeout=API_TIMEOUT)
            time.sleep(1)
    except ApiException:
        pass

    volumes = []
    volume_mounts = []

    for index, mnt in enumerate(mounts):
        volumes.append(V1Volume(
            name=f'mount-{index}',
            persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
                claim_name=mnt['volume'],
                read_only=False
            ),
        ))

        volume_mounts.append(V1VolumeMount(
            name=f'mount-{index}',
            mount_path=mnt['dest_path'],
            sub_path=mnt['source_path'],
            read_only=False,
        ))

    if CONFIGURATION_CONFIGMAP:
        volumes.append(V1Volume(
            name='mount-configuration',
            config_map=V1ConfigMapVolumeSource(
                name=CONFIGURATION_CONFIGMAP
            ),
        ))

        volume_mounts.append(V1VolumeMount(
            name='mount-configuration',
            mount_path='/etc/assemblyline/config.yml',
            sub_path="config",
            read_only=True,
        ))

    section = 'service'
    labels = {
        'app': 'assemblyline',
        'section': section,
        'privilege': 'core',
        'component': 'update-script',
    }
    labels.update(self.extra_labels)

    metadata = V1ObjectMeta(
        name=name,
        labels=labels
    )

    environment_variables = [V1EnvVar(name=_e.name, value=_e.value) for _e in docker_config.environment]
    environment_variables.extend([V1EnvVar(name=k, value=v) for k, v in env.items()])
    environment_variables.extend([V1EnvVar(name=k, value=os.environ[k])
                                  for k in INHERITED_VARIABLES if k in os.environ])
    environment_variables.append(V1EnvVar(name="LOG_LEVEL", value=self.log_level))

    cores = docker_config.cpu_cores
    memory = docker_config.ram_mb
    memory_min = min(docker_config.ram_mb_min, memory)

    container = V1Container(
        name=name,
        image=docker_config.image,
        command=docker_config.command,
        env=environment_variables,
        image_pull_policy='Always',
        volume_mounts=volume_mounts,
        resources=V1ResourceRequirements(
            limits={'cpu': cores, 'memory': f'{memory}Mi'},
            requests={'cpu': cores / 4, 'memory': f'{memory_min}Mi'},
        )
    )

    pod = V1PodSpec(
        volumes=volumes,
        restart_policy='Never',
        containers=[container],
        priority_class_name=self.priority_class,
    )

    if use_pull_secret:
        pod.image_pull_secrets = [V1LocalObjectReference(name=pull_secret_name)]

    job = V1Job(
        metadata=metadata,
        spec=V1JobSpec(
            backoff_limit=1,
            completions=1,
            template=V1PodTemplateSpec(
                metadata=metadata,
                spec=pod
            )
        )
    )

    status = self.batch_api.create_namespaced_job(namespace=self.namespace, body=job,
                                                  _request_timeout=API_TIMEOUT).status

    if blocking:
        try:
            while not (status.failed or status.succeeded):
                time.sleep(3)
                status = self.batch_api.read_namespaced_job(namespace=self.namespace, name=name,
                                                            _request_timeout=API_TIMEOUT).status

            self.batch_api.delete_namespaced_job(name=name, namespace=self.namespace,
                                                 propagation_policy='Background',
                                                 _request_timeout=API_TIMEOUT)
        except ApiException as error:
            if error.status != 404:
                raise
def create_simple_job_spec(
        self,
        experiment: Experiment,
        name: str,
        image: str,
        min_cpu: int,
        min_mem: int,
        command: Optional[List[str]] = None,
        env: Dict[str, str] = {},
        accelerator: Optional[Accelerator] = None,
        accelerator_count: int = 1,
        namespace: str = k.DEFAULT_NAMESPACE,
        machine_type: Optional[MachineType] = None,
        preemptible: bool = True,
        preemptible_tpu: bool = True,
        tpu_driver: str = k.DEFAULT_TPU_DRIVER) -> Optional[JobSpec]:
    """creates a simple kubernetes job (1 container, 1 pod) JobSpec for this cluster

    Args:
    experiment: experiment whose kwargs/args are converted into container args
    name: job name
    image: container image url (gcr.io/...)
    min_cpu: minimum cpu needed, in milli-cpu
    min_mem: minimum memory needed, in MB
    command: command to execute, None = container entrypoint
    env: environment vars for container
    accelerator: accelerator type, None=cpu only
    accelerator_count: accelerator count
    namespace: kubernetes namespace
    machine_type: machine type, None=default for mode (cpu/gpu)
    preemptible: use preemptible instance
    preemptible_tpu: use preemptible tpus
    tpu_driver: tpu driver to use

    Returns:
    JobSpec on success, None otherwise
    """

    args = conf.experiment_to_args(experiment.kwargs, experiment.args)

    # ------------------------------------------------------------------------
    # container

    # tpu/gpu resources
    container_resources = V1ResourceRequirements(
        requests=Cluster.container_requests(min_cpu, min_mem),
        limits=Cluster.container_limits(
            accelerator,
            accelerator_count,
            preemptible_tpu,
        ),
    )

    container_env = [V1EnvVar(name=k, value=v) for k, v in env.items()]

    # this is a simple 1-container, 1-pod job, so we just name the
    # container the same thing (minus the generated suffix) as the job itself
    container = V1Container(
        name=name,
        image=image,
        command=command,
        args=args,
        resources=container_resources,
        env=container_env,
        image_pull_policy='Always',
    )

    # ------------------------------------------------------------------------
    # template

    # todo: should we support anything other than a 'never' restart policy?
    # see this for discussion
    # https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#pod-backoff-failure-policy

    tolerations = Cluster.tolerations(preemptible=preemptible)

    # backoff count plus 'OnFailure' may be correct here
    template_spec = V1PodSpec(
        restart_policy='Never',
        containers=[container],
        tolerations=tolerations,
        node_selector=Cluster.node_selector(
            preemptible=preemptible,
            machine_type=machine_type,
            accelerator=accelerator,
        ),
        host_ipc=True,
    )

    template = V1PodTemplateSpec(
        metadata=Cluster.template_metadata(
            accelerator=accelerator,
            tpu_driver=tpu_driver,
        ),
        spec=template_spec,
    )

    # ------------------------------------------------------------------------
    # job
    job_spec = V1JobSpec(template=template, backoff_limit=4)

    return JobSpec.get_or_create(
        experiment=experiment,
        spec=ApiClient().sanitize_for_serialization(job_spec),
        platform=Platform.GKE,
    )
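# Illustrative call (an assumption about the surrounding Cluster/Experiment objects,
# not taken from the source): request a small CPU-only JobSpec for one experiment.
# `cluster` and `experiment` stand in for instances created elsewhere, and the image,
# command, and resource values are placeholders.
job_spec = cluster.create_simple_job_spec(
    experiment=experiment,
    name="cpu-test-job",
    image="gcr.io/my-project/my-image:latest",
    min_cpu=1000,      # milli-cpu
    min_mem=2048,      # MB
    command=["python", "train.py"],
    env={"SEED": "42"},
    accelerator=None,  # CPU-only
)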
def construct_job_spec(
    env: str,
    team: str,
    env_context: Dict[str, Any],
    podsetting_metadata: Dict[str, Any],
    orbit_job_spec: kopf.Spec,
    labels: kopf.Labels,
) -> V1JobSpec:
    compute = orbit_job_spec.get("compute", {"computeType": "eks", "nodeType": "fargate"})

    # Convert all the compute parameters to their SDK equivalents until we fix the SDK
    # and python-utils
    converted_compute = {}
    if "computeType" in compute:
        converted_compute["compute_type"] = compute["computeType"]
    if "nodeType" in compute:
        converted_compute["node_type"] = compute["nodeType"]
    if "env" in compute:
        converted_compute["env_vars"] = compute["env"]
    if "snsTopicName" in compute:
        converted_compute["sns.topic.name"] = compute["snsTopicName"]
    if "priorityClassName" in compute:
        converted_compute["priorityClassName"] = compute["priorityClassName"]
    if "podSetting" in compute:
        converted_compute["podsetting"] = compute["podSetting"]
    if "labels" in compute:
        converted_compute["labels"] = compute["labels"]
    if "container" in compute:
        if "concurrentProcesses" in compute["container"]:
            converted_compute["container"] = {
                "p_concurrent": compute["container"]["concurrentProcesses"]
            }

    pod_labels = {
        **labels,
        **orbit_job_spec.get("compute", {}).get("labels", {}),
    }
    pod_labels["app"] = "orbit-runner"
    pod_labels[f"orbit/{podsetting_metadata.get('name', None)}"] = ""

    pod_env = {
        "task_type": orbit_job_spec["taskType"],
        "tasks": json.dumps({"tasks": orbit_job_spec["tasks"]}),
        "compute": json.dumps({"compute": converted_compute}),
        "AWS_ORBIT_ENV": env,
        "AWS_ORBIT_TEAM_SPACE": team,
    }

    pod_image = (podsetting_metadata["image"]
                 if podsetting_metadata["image"] is not None
                 else (f"{env_context['Images']['JupyterUser']['Repository']}:"
                       f"{env_context['Images']['JupyterUser']['Version']}"))

    pod_params = {
        # "name": f"run-{orbit_job_spec['taskType']}",
        "cmd": ["bash", "-c", "python /opt/python-utils/notebook_cli.py"],
        "port": 22,
        "image": pod_image,
        "service_account": "default-editor",
        "run_privileged": False,
        "allow_privilege_escalation": True,
        "env": pod_env,
        "priority_class_name": orbit_job_spec.get("priorityClassName"),
        "labels": pod_labels,
        "run_as_uid": 1000,
        "run_as_gid": 100,
    }
    pod = _make_pod(**pod_params)
    pod.spec.restart_policy = "Never"

    return V1JobSpec(
        backoff_limit=0,
        template=pod,
        ttl_seconds_after_finished=int(os.environ.get("TTL_SECONDS_AFTER_FINISHED", 120)),
    )
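# Sketch of the kopf job spec shape this handler reads (inferred from the lookups above;
# the task payload and label values are illustrative assumptions only).
example_orbit_job_spec = {
    "taskType": "jupyter",
    "tasks": [{"notebookName": "demo.ipynb"}],
    "compute": {
        "computeType": "eks",
        "nodeType": "fargate",
        "labels": {"team": "demo"},
    },
    "priorityClassName": "orbit-default",
}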
def mock_job() -> V1Job:
    meta = V1ObjectMeta(namespace="default", name="some-job")
    # V1JobSpec.template expects a pod template spec (V1PodTemplateSpec), not a V1PodTemplate
    spec = V1JobSpec(template=V1PodTemplateSpec())
    status = V1JobStatus(conditions=[])
    return V1Job(metadata=meta, spec=spec, status=status)
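# A pytest-style sketch (assumption, not from the source) showing how mock_job might
# back a test that inspects job metadata and conditions.
def test_mock_job_defaults() -> None:
    job = mock_job()
    assert job.metadata.name == "some-job"
    assert job.status.conditions == []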