def kube_create_job_object(name, container_image, namespace="default", container_name="jobcontainer", env_vars=None):
    """Build a V1Job that runs *container_image* once and never restarts.

    :param name: job name — must be unique within the namespace
    :param container_image: image for the job container
    :param namespace: target namespace, defaults to "default"
    :param env_vars: dict of environment variables for the container
                     (defaults to none; fixed from a shared mutable ``{}`` default)
    :return: the constructed ``client.V1Job`` body (not submitted)
    """
    if env_vars is None:
        env_vars = {}
    # Body is the object Body
    body = client.V1Job(api_version="batch/v1", kind="Job")
    # Attention: each Job must have a different name!
    body.metadata = client.V1ObjectMeta(namespace=namespace, name=name)
    body.status = client.V1JobStatus()
    # V1Job.spec needs a V1PodTemplateSpec; build it via V1PodTemplate.template.
    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    # Passing arguments via Env:
    env_list = [client.V1EnvVar(name=env_name, value=env_value)
                for env_name, env_value in env_vars.items()]
    container = client.V1Container(name=container_name,
                                   image=container_image,
                                   env=env_list)
    template.template.spec = client.V1PodSpec(containers=[container],
                                              restart_policy='Never')
    # And finally we can create our V1JobSpec!
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=30,
                                 template=template.template)
    return body
def _create_job_object(self, name: str, container_image: str, namespace: str = None, container_name: str = "servicecontainer", env_vars: dict = None, command: list = None, active_deadline_seconds: int = 3600):
    """Build a V1Job for *container_image* with an execution deadline.

    :param name: unique job name
    :param container_image: image for the job container
    :param namespace: target namespace; resolved via ``self._get_namespace``
    :param container_name: name of the job container
    :param env_vars: environment variables (defaults to none; fixed from a
                     shared mutable ``{}`` default)
    :param command: container entrypoint command (fixed from a shared mutable
                    ``[]`` default)
    :param active_deadline_seconds: hard timeout for the job, defaults to 3600
    :return: the constructed ``client.V1Job`` body (not submitted)
    """
    if env_vars is None:
        env_vars = {}
    if command is None:
        command = []
    namespace = self._get_namespace(namespace)
    body = client.V1Job(api_version="batch/v1", kind="Job")
    body.metadata = client.V1ObjectMeta(namespace=namespace, name=name)
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    env_list = [client.V1EnvVar(name=env_name, value=env_value)
                for env_name, env_value in env_vars.items()]
    container = client.V1Container(name=container_name,
                                   image=container_image,
                                   env=env_list,
                                   command=command)
    template.template.spec = client.V1PodSpec(containers=[container],
                                              restart_policy='Never')
    # Set active_deadline_seconds
    body.spec = client.V1JobSpec(
        ttl_seconds_after_finished=600,
        template=template.template,
        active_deadline_seconds=active_deadline_seconds)
    return body
def getBody(namespace='couture-console', jobname='nifi-test', containername='nifi-test', containerimage='sidharthc/nifi-test:alpha', env_vars=ENV_LIST, containerargs=None):
    """Build a V1Job for the nifi-test container.

    :param namespace: target namespace
    :param jobname: job name — must be unique within the namespace
    :param containername: name of the job container
    :param containerimage: image for the job container
    :param env_vars: environment variables, defaults to module-level ENV_LIST
    :param containerargs: container args, defaults to ['SFTP_TO_HDFS.py']
                          (fixed from a shared mutable list default)
    :return: the constructed ``client.V1Job`` body (not submitted)
    """
    if containerargs is None:
        containerargs = ['SFTP_TO_HDFS.py']
    body = client.V1Job(api_version="batch/v1", kind="Job")
    # Body needs Metadata
    # Attention: Each JOB must have a different name!
    body.metadata = client.V1ObjectMeta(namespace=namespace, name=jobname)
    # And a Status
    body.status = client.V1JobStatus()
    # Now we start with the Template...
    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    env_list = [client.V1EnvVar(name=env_name, value=env_value)
                for env_name, env_value in env_vars.items()]
    container = client.V1Container(name=containername,
                                   image=containerimage,
                                   args=containerargs,
                                   env=env_list)
    template.template.spec = client.V1PodSpec(containers=[container],
                                              restart_policy='Never')
    # And finally we can create our V1JobSpec!
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=100,
                                 template=template.template)
    return body
def test():
    # Debug helper: instantiates a range of empty Kubernetes model objects and
    # prints each one, then reads a hard-coded pod and inspects/overwrites its
    # node_selector. The broad except keeps this helper from ever raising.
    try:
        V1NamespaceBody = client.V1Namespace()
        pprint("V1NamespaceBody={}".format(V1NamespaceBody))
        V1ObjectReferenceBody = client.V1ObjectReference()
        pprint("V1ObjectReferenceBody={}".format(V1ObjectReferenceBody))
        V1ObjectMetaBody = client.V1ObjectMeta()
        pprint("V1ObjectMetaBody={}".format(V1ObjectMetaBody))
        #V1BindingBody = client.V1Binding()
        #pprint("V1BindingBody={}".format(V1BindingBody))
        V1ConfigMapBody = client.V1ConfigMap()
        pprint("V1ConfigMapBody={}".format(V1ConfigMapBody))
        V1Pod = client.V1Pod()
        pprint("V1Pod={}".format(V1Pod))
        V1PodTemplate = client.V1PodTemplate()
        pprint("V1PodTemplate={}".format(V1PodTemplate))
        V1ReplicationController = client.V1ReplicationController()
        pprint("V1ReplicationController={}".format(V1ReplicationController))
        V1Service = client.V1Service()
        pprint("V1Service={}".format(V1Service))
        V1Node = client.V1Node()
        pprint("V1Node={}".format(V1Node))
        # Hard-coded pod/namespace used for the read test below.
        pod = 'nginx-no-split-655b866cfd-54xmg'
        namespace='default'
        read_pod = apis_api.read_namespaced_pod(name=pod,namespace=namespace)
        pprint("read_pod={}".format(read_pod))
        # NOTE(review): V1PodSpec.node_selector is normally a plain dict, so
        # attribute access (`.lifecycle`) here is suspicious — confirm whether
        # this relies on a customized client or is expected to hit the except.
        lifecycle=read_pod.spec.node_selector.lifecycle
        pprint("lifecycle={}".format(lifecycle))
        read_pod.spec.node_selector.lifecycle='OnDemand'
        pprint("read_pod={}".format(read_pod))
        #metadata = read_pod.metadata
        #pprint("metadata={}".format(metadata))
        #metadata.cluster_name = 'Ec2SpotEKS4'
        #pprint("metadata={}".format(metadata))
    except Exception as e:
        print("Exception when calling CoreV1Api->create_namespace: %s\n" % e)
def pod_template(containers, volumes=None, **kwargs):
    """Wrap *containers* in a V1PodTemplate whose pod never restarts.

    An empty ``volumes`` list is normalized to ``None`` so the field is
    omitted from the rendered spec. Extra keyword arguments are forwarded
    to ``V1PodSpec``.
    """
    # [] behaves like None here; normalize so the API omits the field.
    if not volumes:
        volumes = None
    wrapper = client.V1PodTemplate()
    wrapper.template = client.V1PodTemplateSpec()
    wrapper.template.spec = client.V1PodSpec(containers=containers,
                                             volumes=volumes,
                                             restart_policy='Never',
                                             **kwargs)
    return wrapper
def scheduleJobs():
    """Flask endpoint: create one Kubernetes Job per entry in the JSON body.

    Each entry is validated, wrapped in a uniquely-named Job that runs the
    R-script container with a shared PVC-backed volume, and submitted to the
    "default" namespace. Returns 201 with the created job names, aborts with
    422 on invalid parameters, or returns an error tuple on API failure.
    """
    jobNames = []
    for jobParameters in request.get_json(force=True):
        if not validateJobParameters(jobParameters):
            return abort(422, 'Invalid arguments')
        body = kubeClient.V1Job(api_version="batch/v1", kind="Job")
        # Body needs Metadata
        # Attention: Each JOB must have a different name!
        jobName = "r-job-" + str(uuid.uuid4())
        body.metadata = kubeClient.V1ObjectMeta(namespace="default", name=jobName)
        # And a Status
        body.status = kubeClient.V1JobStatus()
        # Now we start with the Template...
        template = kubeClient.V1PodTemplate()
        template.template = kubeClient.V1PodTemplateSpec()
        # Passing Arguments in Env:
        env_list = createJobEnv(jobParameters, jobName)
        # Shared volume mounted into the container at /mydata.
        volume_mounts = kubeClient.V1VolumeMount(mount_path="/mydata",
                                                 name="dose-volume")
        container = kubeClient.V1Container(
            name="r-container",
            image="monikeu/r-script-1:r-image-env",
            env=env_list,
            volume_mounts=[volume_mounts],
            image_pull_policy="Always")
        # Volume is backed by a pre-existing PVC.
        per_vol_claim = kubeClient.V1PersistentVolumeClaimVolumeSource(
            claim_name="dose-volume-claim")
        volume = kubeClient.V1Volume(name="dose-volume",
                                     persistent_volume_claim=per_vol_claim)
        template.template.spec = kubeClient.V1PodSpec(containers=[container],
                                                      restart_policy='Never',
                                                      volumes=[volume])
        # And finaly we can create our V1JobSpec!
        body.spec = kubeClient.V1JobSpec(ttl_seconds_after_finished=600,
                                         template=template.template)
        try:
            response = api_instance.create_namespaced_job("default", body, pretty=True)
            pprint(response)
            jobNames.append(jobName)
        except ApiException as e:
            return "Error occurred during an attempt to create a job", e.status
    return 'Created one or more jobs: {}'.format(",".join(jobNames)), 201
def make_job(item):
    """Build a V1Job named *item* whose container is derived from the local
    item service.

    Fetches the item's JSON description from ``http://localhost:8000`` and
    delegates container construction to ``make_container``.

    :param item: item identifier; also used as the job name
    :return: the constructed ``client.V1Job`` body (not submitted)
    """
    response = requests.get("http://localhost:8000/items/{}".format(item))
    obj = json.loads(response.text)
    job = client.V1Job()
    job.metadata = client.V1ObjectMeta()
    job.metadata.name = item
    job.spec = client.V1JobSpec()
    # Bug fix: JobSpec.template must be a V1PodTemplateSpec (the original
    # assigned a V1PodTemplate) and its .spec must be a V1PodSpec (the
    # original assigned a V1PodTemplateSpec), so restart_policy/containers
    # were being set on the wrong objects and lost at serialization.
    job.spec.template = client.V1PodTemplateSpec()
    job.spec.template.spec = client.V1PodSpec(
        containers=[make_container(item, obj)],
        restart_policy="Never")
    return job
def _get_job_object(self, algorithm):
    """Assemble the V1Job that runs *algorithm* for this resource set.

    The job name and the mounted input config map both use
    ``self.resources_identifier``; the image is optionally prefixed with the
    repository from the environment.
    """
    job_name = self.resources_identifier
    # The job name is handed to the container via the environment.
    env = [client.V1EnvVar(name=ENV_VAR_JOB_NAME, value=job_name)]
    # Prefix the image with the configured docker repository, if any.
    repo = os.environ.get(ENV_VAR_DOCKER_REPOSITORY, "")
    image_name = f"{repo}/{algorithm}:latest" if repo != "" else f"{algorithm}:latest"
    mounts = [
        client.V1VolumeMount(name=VOLUME_NAME_ALGORITHM_INPUT,
                             mount_path="/etc/config")
    ]
    runner = client.V1Container(name="algorithm",
                                image=image_name,
                                volume_mounts=mounts,
                                env=env,
                                image_pull_policy="Always")
    # The algorithm input config map is exposed as a volume.
    input_volume = client.V1Volume(
        name=VOLUME_NAME_ALGORITHM_INPUT,
        config_map=client.V1ConfigMapVolumeSource(name=job_name))
    pod_template = client.V1PodTemplateSpec()
    pod_template.spec = client.V1PodSpec(containers=[runner],
                                         restart_policy='Never',
                                         volumes=[input_volume])
    job = client.V1Job()
    job.metadata = client.V1ObjectMeta(namespace=NAMESPACE, name=job_name)
    job.spec = client.V1JobSpec(ttl_seconds_after_finished=1200,
                                template=pod_template)
    return job
def create_job_object(name: str,
                      container_image: str,
                      env_list: dict,
                      command: List[str],
                      command_args: List[str],
                      volumes: List[Dict],
                      init_containers: List[Dict],
                      output: Output,
                      namespace: str = "stackl",
                      container_name: str = "jobcontainer",
                      api_version: str = "batch/v1",
                      image_pull_policy: str = "Always",
                      ttl_seconds_after_finished: int = 3600,
                      restart_policy: str = "Never",
                      backoff_limit: int = 0,
                      active_deadline_seconds: int = 3600,
                      service_account: str = "stackl-agent-stackl-agent",
                      image_pull_secrets: List[str] = None,
                      labels=None) -> client.V1Job:  # pylint: disable=too-many-arguments,too-many-locals,too-many-branches,too-many-statements
    """Creates a Job object using the Kubernetes client

    :param name: Job name affix
    :type name: str
    :param container_image: automation container image
    :type container_image: str
    :param env_list: Dict with key/values for the environment inside the
        automation container
    :type env_list: dict
    :param command: entrypoint command
    :type command: List[str]
    :param command_args: command arguments
    :type command_args: List[str]
    :param volumes: volumes and volumemounts
    :type volumes: List[Dict]
    :param image_pull_secrets: secrets to pull images, defaults to None
        (no secrets; fixed from a shared mutable ``[]`` default)
    :type image_pull_secrets: List[str]
    :param init_containers: list with init_containers
    :type init_containers: List[Dict]
    :param output: output Object
    :type output: Output
    :param namespace: Kubernetes namespace, defaults to "stackl"
    :type namespace: str, optional
    :param container_name: name of automation container, defaults to
        "jobcontainer"
    :type container_name: str, optional
    :param api_version: Job api version, defaults to "batch/v1"
    :type api_version: str, optional
    :param image_pull_policy: always pull latest images, defaults to "Always"
    :type image_pull_policy: str, optional
    :param ttl_seconds_after_finished: Remove jobs after execution with ttl,
        defaults to 3600
    :type ttl_seconds_after_finished: int, optional
    :param restart_policy: Restart the pod on the same node after failure,
        defaults to "Never"
    :type restart_policy: str, optional
    :param backoff_limit: Retries after failure, defaults to 0
    :type backoff_limit: int, optional
    :param active_deadline_seconds: Timeout on a job, defaults to 3600 seconds
    :type active_deadline_seconds: int, optional
    :param service_account: Kubernetes service account, defaults to
        "stackl-agent-stackl-agent"
    :type service_account: str, optional
    :param labels: metadata labels, defaults to None
    :type labels: dict, optional
    :return: tuple of (automation Job object, list of created config maps)
    :rtype: client.V1Job
    """
    # Avoid the shared-mutable-default pitfall for image_pull_secrets.
    if image_pull_secrets is None:
        image_pull_secrets = []
    id_job = id_generator()
    name = name + "-" + id_job
    body = client.V1Job(api_version=api_version, kind="Job")
    body.metadata = client.V1ObjectMeta(namespace=namespace, name=name)
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    k8s_volumes = []
    cms = []
    logging.debug(f"volumes: {volumes}")
    # create a k8s volume for each element in volumes
    for vol in volumes:
        vol_name = name + "-" + vol["name"]
        k8s_volume = client.V1Volume(name=vol_name)
        if vol["type"] == "config_map":
            config_map = client.V1ConfigMapVolumeSource()
            config_map.name = vol_name
            k8s_volume.config_map = config_map
            # The backing config map is created alongside the job.
            cms.append(create_cm(vol_name, namespace, vol['data']))
            vol['name'] = vol_name
        if vol["type"] == "empty_dir":
            k8s_volume.empty_dir = client.V1EmptyDirVolumeSource(
                medium="Memory")
            vol['name'] = vol_name
        k8s_volumes.append(k8s_volume)
    logging.debug(f"Volumes created for job {name}: {k8s_volumes}")
    # create a volume mount for each element in volumes
    k8s_volume_mounts = []
    for vol in volumes:
        if vol["mount_path"]:
            volume_mount = client.V1VolumeMount(name=vol["name"],
                                                mount_path=vol["mount_path"])
            if "sub_path" in vol:
                volume_mount.sub_path = vol["sub_path"]
            k8s_volume_mounts.append(volume_mount)
    logging.debug(f"Volume mounts created for job {name}: {k8s_volume_mounts}")
    # create an environment list; dict values select valueFrom sources
    k8s_env_list = []
    if env_list:
        for key, value in env_list.items():
            if isinstance(value, dict):
                if 'config_map_key_ref' in value:
                    k8s_env_from = client.V1EnvVar(
                        name=key,
                        value_from=client.V1EnvVarSource(
                            config_map_key_ref=client.V1ConfigMapKeySelector(
                                name=value['config_map_key_ref']["name"],
                                key=value['config_map_key_ref']["key"])))
                    k8s_env_list.append(k8s_env_from)
                elif 'field_ref' in value:
                    k8s_env_from = client.V1EnvVar(
                        name=key,
                        value_from=client.V1EnvVarSource(
                            field_ref=client.V1ObjectFieldSelector(
                                field_path=value['field_ref'])))
                    k8s_env_list.append(k8s_env_from)
            else:
                k8s_env = client.V1EnvVar(name=key, value=value)
                k8s_env_list.append(k8s_env)
    # env_from is reserved for future config_map_ref / secret_ref support.
    k8s_env_from_list = []
    logging.debug(f"Environment list created for job {name}: {k8s_env_list}")
    container = client.V1Container(name=container_name,
                                   image=container_image,
                                   env=k8s_env_list,
                                   volume_mounts=k8s_volume_mounts,
                                   image_pull_policy=image_pull_policy,
                                   command=command,
                                   args=command_args,
                                   env_from=k8s_env_from_list)
    k8s_init_containers = []
    logging.debug(f"Init containers for job {name}: {init_containers}")
    for c in init_containers:
        # Init containers share the job's mounts and environment.
        k8s_c = client.V1Container(name=c['name'],
                                   image=c['image'],
                                   volume_mounts=k8s_volume_mounts,
                                   env=k8s_env_list)
        if 'args' in c:
            k8s_c.args = c['args']
        k8s_init_containers.append(k8s_c)
    k8s_secrets = []
    for secret in image_pull_secrets:
        k8s_secrets.append(client.V1LocalObjectReference(name=secret))
    logging.debug(f"Secret list created for job {name}: {k8s_secrets}")
    containers = [container]
    if output:
        # Output containers run beside the automation container and share
        # its mounts and environment.
        output.volume_mounts = k8s_volume_mounts
        output.env = k8s_env_list
        output_containers = output.containers
        containers = containers + output_containers
    template.template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=labels),
        spec=client.V1PodSpec(containers=containers,
                              restart_policy=restart_policy,
                              image_pull_secrets=k8s_secrets,
                              volumes=k8s_volumes,
                              init_containers=k8s_init_containers,
                              service_account_name=service_account))
    body.spec = client.V1JobSpec(
        ttl_seconds_after_finished=ttl_seconds_after_finished,
        template=template.template,
        backoff_limit=backoff_limit,
        active_deadline_seconds=active_deadline_seconds)
    return body, cms
def create_kb8s_job(workflow_id, minion_cmd, cluster):
    """Create a Kubernetes Job that runs a Juicer minion for *workflow_id*.

    ``cluster`` must contain 'address', 'auth_token' and a comma-separated
    'general_parameters' string carrying kubernetes.* settings.  A response
    of AlreadyExists from the API server is treated as success.
    """
    configuration = client.Configuration()
    configuration.host = cluster['address']
    # NOTE(review): TLS verification is disabled for the cluster endpoint.
    configuration.verify_ssl = False
    configuration.debug = False
    if 'general_parameters' not in cluster:
        raise ValueError('Incorrect cluster config.')
    # Keep only the kubernetes.* entries from the general parameters.
    cluster_params = {}
    for parameter in cluster['general_parameters'].split(','):
        key, value = parameter.split('=')
        if key.startswith('kubernetes'):
            cluster_params[key] = value
    env_vars = {
        'HADOOP_CONF_DIR': '/usr/local/juicer/conf',
    }
    # Bearer-token authentication against the cluster.
    token = cluster['auth_token']
    configuration.api_key = {"authorization": "Bearer " + token}
    # noinspection PyUnresolvedReferences
    client.Configuration.set_default(configuration)
    job = client.V1Job(api_version="batch/v1", kind="Job")
    name = 'job-{}'.format(workflow_id)
    container_name = 'juicer-job'
    container_image = cluster_params['kubernetes.container']
    namespace = cluster_params['kubernetes.namespace']
    pull_policy = cluster_params.get('kubernetes.pull_policy', 'Always')
    gpus = int(cluster_params.get('kubernetes.resources.gpus', 0))
    print('-' * 30)
    print('GPU KB8s specification: ' + str(gpus))
    print('-' * 30)
    log.info('GPU specification: %s', gpus)
    job.metadata = client.V1ObjectMeta(namespace=namespace, name=name)
    job.status = client.V1JobStatus()
    # Now we start with the Template...
    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    # Passing Arguments in Env:
    env_list = []
    for env_name, env_value in env_vars.items():
        env_list.append(client.V1EnvVar(name=env_name, value=env_value))
    # Subpath implies that the file is stored as a config map in kb8s
    volume_mounts = [
        client.V1VolumeMount(
            name='juicer-config',
            sub_path='juicer-config.yaml',
            mount_path='/usr/local/juicer/conf/juicer-config.yaml'),
        client.V1VolumeMount(
            name='hdfs-site',
            sub_path='hdfs-site.xml',
            mount_path='/usr/local/juicer/conf/hdfs-site.xml'),
        client.V1VolumeMount(name='hdfs-pvc', mount_path='/srv/storage/'),
    ]
    pvc_claim = client.V1PersistentVolumeClaimVolumeSource(
        claim_name='hdfs-pvc')
    # Request GPUs only when configured.
    if gpus:
        resources = {'limits': {'nvidia.com/gpu': gpus}}
    else:
        resources = {}
    container = client.V1Container(name=container_name,
                                   image=container_image,
                                   env=env_list,
                                   command=minion_cmd,
                                   image_pull_policy=pull_policy,
                                   volume_mounts=volume_mounts,
                                   resources=resources)
    volumes = [
        client.V1Volume(
            name='juicer-config',
            config_map=client.V1ConfigMapVolumeSource(name='juicer-config')),
        client.V1Volume(
            name='hdfs-site',
            config_map=client.V1ConfigMapVolumeSource(name='hdfs-site')),
        client.V1Volume(name='hdfs-pvc', persistent_volume_claim=pvc_claim),
    ]
    template.template.spec = client.V1PodSpec(containers=[container],
                                              restart_policy='Never',
                                              volumes=volumes)
    # And finally we can create our V1JobSpec!
    job.spec = client.V1JobSpec(ttl_seconds_after_finished=10,
                                template=template.template)
    api = client.ApiClient(configuration)
    batch_api = client.BatchV1Api(api)
    try:
        batch_api.create_namespaced_job(namespace, job, pretty=True)
    except ApiException as e:
        body = json.loads(e.body)
        # A job for this workflow already exists: treat as success.
        if body['reason'] == 'AlreadyExists':
            pass
        else:
            print("Exception when calling BatchV1Api->: {}\n".format(e))
def kube_create_job_object(
    self,
    container_image: str,
    command: t.List[str] = None,
    namespace: str = "default",
    env_vars: dict = None,
    afs_volume_name: str = None,
    azure_mount_path: str = None,
    volume_sub_path: str = None,
):
    """
    Create a k8 Job Object with a randomly generated name.

    The object chain the API requires is:
    V1Job -> V1ObjectMeta
          -> V1JobStatus
          -> V1JobSpec -> V1PodTemplateSpec -> V1PodSpec -> V1Container
    V1Job.spec.template must be a V1PodTemplateSpec (built here through a
    V1PodTemplate's ``.template`` attribute), and ``containers`` must be a
    list; anything else triggers an API error.

    Docs: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Job.md
    Docs2: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#writing-a-job-spec
    Docs3: https://github.com/kubernetes-client/python/issues/589

    :param container_image: image to run in the job container
    :param command: container entrypoint command
    :param namespace: target namespace, defaults to "default"
    :param env_vars: environment variables for the container
    :param afs_volume_name: Azure file share volume name (only used together
        with ``volume_sub_path``)
    :param azure_mount_path: mount path for the Azure volume
    :param volume_sub_path: sub path inside the Azure volume; when None, no
        volume is attached
    :return: the constructed ``client.V1Job`` body (not submitted)
    """
    # Body is the object Body
    body = client.V1Job(api_version="batch/v1", kind="Job")
    # Each Job must have a different name!
    name = id_generator()
    body.metadata = client.V1ObjectMeta(namespace=namespace, name=name)
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    # Passing Arguments in Env:
    env_list = []
    if env_vars:
        for env_name, env_value in env_vars.items():
            env_list.append(client.V1EnvVar(name=env_name, value=env_value))
    # Bug fix: `volumes`/`volume_mounts` were previously unbound when
    # volume_sub_path was None, raising UnboundLocalError below.
    volumes = None
    volume_mounts = None
    if volume_sub_path is not None:
        volumes, volume_mounts = self.prepare_azure_volumes(
            volume_sub_path=volume_sub_path,
            afs_volume_name=afs_volume_name,
            azure_mount_path=azure_mount_path)
    container = client.V1Container(name=name,
                                   image=container_image,
                                   env=env_list,
                                   command=command,
                                   volume_mounts=volume_mounts)
    template.template.spec = client.V1PodSpec(containers=[container],
                                              restart_policy='Never',
                                              volumes=volumes)
    # And finally we can create our V1JobSpec!
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=600,
                                 template=template.template)
    return body
def start_job(self, nuts_id):
    """Create and submit a raster-processing Job for the given NUTS region.

    Any existing job with the same name is deleted first, then the new job is
    created in ``self.namespace``.  Returns the job name.
    """
    input_path = self.input_path
    output_path = self.pers_path + '/' + self.experiment_id
    pers_path = self.pers_path
    resolution = self.resolution
    experiment_id = self.experiment_id
    # Job names must be lowercase (DNS-1123 label).
    nuts_id = nuts_id.lower()
    container_name = 'br-process-raster'
    container_image = 'harbor.tilyes.eu/eugis/br-process-raster'
    body = client.V1Job(api_version="batch/v1", kind="Job")
    body.metadata = client.V1ObjectMeta(
        namespace=self.namespace,
        name=f'{container_name}-{experiment_id}-{nuts_id}')
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    container = client.V1Container(
        name=container_name,
        image=container_image,
        resources=client.V1ResourceRequirements(requests={
            'cpu': '1',
            'memory': '4Gi',
        }, limits={
            'cpu': '2',
            'memory': '6Gi',
        }),
        volume_mounts=[
            client.V1VolumeMount(mount_path=self.input_path, name="source"),
            client.V1VolumeMount(mount_path=self.pers_path,
                                 name="persistence"),
            client.V1VolumeMount(mount_path="/app/config/",
                                 name="hu-raster-config")
        ],
        # NOTE(review): the trailing curl to localhost:15020/quitquitquit
        # looks like an Istio sidecar shutdown so the Job can complete —
        # confirm against the deployment's sidecar setup.
        command=[
            'sh', '-c',
            f'python3 worker.py --input-path {input_path} --resolution {resolution} --output-path {output_path} --pers-path {pers_path} --nuts-id {nuts_id} && curl -X POST http://localhost:15020/quitquitquit'
        ])
    template.template.spec = client.V1PodSpec(
        containers=[container],
        volumes=[
            client.V1Volume(name='source',
                            persistent_volume_claim=client.
                            V1PersistentVolumeClaimVolumeSource(
                                claim_name=self.source_pvc)),
            client.V1Volume(name='persistence',
                            persistent_volume_claim=client.
                            V1PersistentVolumeClaimVolumeSource(
                                claim_name=self.persistence_pvc)),
            client.V1Volume(
                name="hu-raster-config",
                config_map=client.V1ConfigMapVolumeSource(
                    name=f"kf-pipeline-hu-raster-config-{experiment_id}"))
        ],
        restart_policy='Never')
    body.spec = client.V1JobSpec(template=template.template,
                                 ttl_seconds_after_finished=10)
    # Replace any existing job with the same name before creating.
    self.delete_job(f'{container_name}-{experiment_id}-{nuts_id}')
    self.v1.create_namespaced_job(self.namespace, body, pretty='true')
    return f'{container_name}-{experiment_id}-{nuts_id}'
def create_job_object(name,
                      container_image,
                      command,
                      args=None,
                      namespace="default",
                      container_name="jobcontainer",
                      env_vars=None,
                      restart_policy='Never',
                      ttl_finished=180,
                      secret_names=None,
                      backoff_limit=0,
                      volume_mappings=None):
    """Build a V1Job running *command* in *container_image*.

    Depending on settings, finished pods are cleaned up first.  Supports
    plain env vars, whole secrets exposed via env_from, and host-path
    volume mappings.

    :param name: unique job name
    :param container_image: image for the job container
    :param command: container entrypoint command
    :param args: command arguments, defaults to []
    :param namespace: target namespace, defaults to "default"
    :param container_name: name of the job container
    :param env_vars: plain environment variables, defaults to {}
    :param restart_policy: pod restart policy, defaults to 'Never'
    :param ttl_finished: seconds to keep the job after completion
    :param secret_names: secrets exposed as environment via env_from
    :param backoff_limit: retries after failure, defaults to 0
    :param volume_mappings: dicts with 'host_path' and 'mount_path'
    :return: the constructed ``client.V1Job`` body (not submitted)
    """
    if settings.TASK_DELETE_SUCCESSFUL_PODS or settings.TASK_DELETE_FAILED_PODS:
        cleanup_pods(delete_succeeded=settings.TASK_DELETE_SUCCESSFUL_PODS,
                     delete_failed=settings.TASK_DELETE_FAILED_PODS,
                     namespace=namespace)
    if env_vars is None:
        env_vars = {}
    if secret_names is None:
        secret_names = []
    if args is None:
        args = []
    if volume_mappings is None:
        volume_mappings = []
    body = client.V1Job(api_version="batch/v1", kind="Job")
    # metadata and status are required
    body.metadata = client.V1ObjectMeta(namespace=namespace, name=name)
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()
    template.template = client.V1PodTemplateSpec()
    # (fixed: removed an unused BatchV1Api instance that was created here)
    # Set env variables
    env_list = []
    for env_name, env_value in env_vars.items():
        env_list.append(client.V1EnvVar(name=env_name, value=env_value))
    # Expose whole secrets as environment
    env_from = []
    for secret_name in secret_names:
        env_from.append(
            client.V1EnvFromSource(secret_ref=client.V1SecretEnvSource(
                name=secret_name)))
    # Host-path volumes and their container mounts
    volumes = []
    volume_mounts = []
    for i, volume_mapping in enumerate(volume_mappings):
        volume = client.V1Volume(name=f'volume-{i}',
                                 host_path=client.V1HostPathVolumeSource(
                                     path=volume_mapping['host_path']))
        volumes.append(volume)
        volume_mounts.append(
            client.V1VolumeMount(name=f'volume-{i}',
                                 mount_path=volume_mapping['mount_path']))
    # set container options
    container = client.V1Container(
        name=container_name,
        image=container_image,
        env=env_list,
        command=command,
        args=args,
        env_from=env_from,
        volume_mounts=volume_mounts,
        image_pull_policy=settings.TASK_IMAGE_PULL_POLICY)
    # set pod options
    template.template.spec = client.V1PodSpec(
        containers=[container],
        restart_policy=restart_policy,
        volumes=volumes,
        service_account_name='collabovid-sa')
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=ttl_finished,
                                 template=template.template,
                                 backoff_limit=backoff_limit)
    return body
def submit_job(args, command=None):
    """Build and submit a Job to the "default" namespace from CLI *args*.

    Supports optional GPU scheduling (toleration plus resource limit), an
    image pull secret, and a single ``key=value`` node selector expressed as
    required node affinity.  *command* overrides ``args.command`` when given.
    Failures are logged at critical level.
    """
    container_image = args.container
    container_name = args.name
    body = client.V1Job(api_version="batch/v1",
                        kind="Job",
                        metadata=client.V1ObjectMeta(name=container_name))
    body.status = client.V1JobStatus()
    template = client.V1PodTemplate()
    labels = {
        'hugin-job': "1",
        'hugin-job-name': f'{container_name}'
    }
    template.template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=labels)
    )
    tolerations = []
    env = []
    if args.environment:
        for env_spec in args.environment:
            env_name, env_value = env_spec.split("=", 1)
            env.append(client.V1EnvVar(name=env_name, value=env_value))
    # (fixed local typo: containe_args -> container_args)
    container_args = dict(
        name=f"container-{container_name}",
        image=container_image,
        env=env,
    )
    if args.gpu:
        # Tolerate GPU-tainted nodes and request a single GPU.
        tolerations.append(client.V1Toleration(
            key='nvidia.com/gpu', operator='Exists', effect='NoSchedule'))
        container_args['resources'] = client.V1ResourceRequirements(
            limits={"nvidia.com/gpu": 1})
    if command or args.command:
        container_args['command'] = command if command else args.command
    container = client.V1Container(**container_args)
    pull_secrets = []
    if args.pull_secret is not None:
        pull_secrets.append(
            client.V1LocalObjectReference(name=args.pull_secret))
    pod_args = dict(containers=[container],
                    restart_policy='Never',
                    image_pull_secrets=pull_secrets)
    if tolerations:
        pod_args['tolerations'] = tolerations
    if args.node_selector is not None:
        # Translate "key=value" into a required node-affinity term.
        parts = args.node_selector.split("=", 1)
        if len(parts) == 2:
            affinity = client.V1Affinity(
                node_affinity=client.V1NodeAffinity(
                    required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
                        node_selector_terms=[client.V1NodeSelectorTerm(
                            match_expressions=[client.V1NodeSelectorRequirement(
                                key=parts[0], operator='In', values=[parts[1]])]
                        )]
                    )
                )
            )
            pod_args['affinity'] = affinity
    template.template.spec = client.V1PodSpec(**pod_args)
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=1800,
                                 template=template.template)
    try:
        # Bug fix: a line-wrapped "#print" comment left a dangling
        # "(api_response)" expression statement here; dead code removed.
        batch_v1.create_namespaced_job("default", body, pretty=True)
    except client.exceptions.ApiException as e:
        logging.critical(f"Failed to start job: {e.reason}")
def create_job(name, configmap_name, container_name, container_image, container_command, namespace="default", env_vars=None):
    """ Create a k8 Job Object

    Args:
        name: job name — each Job must have a unique name
        configmap_name: name of the ConfigMap mounted at ``mount_path``
        container_name: name of the job container
        container_image: image for the job container
        container_command: list — command to execute, e.g. ['python', '/home/test.py']
        namespace: target namespace, defaults to "default"
        env_vars: environment variables for the container (defaults to none;
            fixed from a shared mutable ``{}`` default)
    Returns:
        (True, api_response) on success, (False, error message) on failure.
    """
    if env_vars is None:
        env_vars = {}
    try:
        # Body is the object body
        body = client.V1Job(api_version="batch/v1", kind="Job")
        # The object needs Metadata; every Job must have a unique name!
        body.metadata = client.V1ObjectMeta(namespace=namespace, name=name)
        # Add a Status
        body.status = client.V1JobStatus()
        # Start the Template...
        template = client.V1PodTemplate()
        template.template = client.V1PodTemplateSpec()
        # Pass arguments via Env:
        env_list = []
        for env_name, env_value in env_vars.items():
            env_list.append(client.V1EnvVar(name=env_name, value=env_value))
        container = client.V1Container(name=container_name,
                                       image=container_image,
                                       env=env_list)
        container.command = container_command
        container.image_pull_policy = "IfNotPresent"
        # Mount the config map at the module-level mount_path.
        volume_mount = client.V1VolumeMount(name="config-volume",
                                            mount_path=mount_path)
        container.volume_mounts = [volume_mount]
        config_map = client.V1ConfigMapVolumeSource(name=configmap_name)
        volumes = client.V1Volume(name="config-volume", config_map=config_map)
        # Schedule onto GPU-labelled nodes only.
        template.template.spec = client.V1PodSpec(
            containers=[container],
            restart_policy='Never',
            volumes=[volumes],
            node_selector={'gpu': 'true'})
        # Finally, create the V1JobSpec
        body.spec = client.V1JobSpec(ttl_seconds_after_finished=600,
                                     template=template.template)
        response = batch_v1_api.create_namespaced_job(namespace, body, pretty=True)
        return True, response
    except Exception as ex:
        print(ex)
        return False, "k8 Job Object creates Failed!"
def create_job(MODEL):
    """Spawn a one-shot speechlab worker Job for *MODEL* in NAMESPACE.

    Returns True when the job was created; logs and prints the error
    otherwise.
    """
    assert MODEL is not None, "model name is None, cannot spawn a new worker"
    job_name = 'speechlab-worker-job-{}-{}'.format(
        MODEL.lower().replace("_", "-"), id_generator())
    # Worker configuration handed to the container via the environment.
    worker_env = {
        "AZURE_STORAGE_ACCOUNT": AZURE_STORAGE_ACCOUNT,
        "AZURE_STORAGE_ACCESS_KEY": AZURE_STORAGE_ACCESS_KEY,
        "AZURE_CONTAINER": AZURE_CONTAINER,
        "MASTER": MASTER,
        "NAMESPACE": NAMESPACE,
        "RUN_FREQ": "ONCE",
        "MODEL_DIR": MODEL,  # important
        "MODELS_FILESHARE_SECRET": MODELS_FILESHARE_SECRET,
        "MODELS_SHARE_NAME": MODELS_SHARE_NAME
    }
    env = [client.V1EnvVar(name=key, value=val)
           for key, val in worker_env.items()]
    # Read-only Azure file share holding the models.
    models_volume = client.V1Volume(
        name="models-azurefiles",
        azure_file=client.V1AzureFileVolumeSource(
            read_only=True,
            secret_name=MODELS_FILESHARE_SECRET,
            share_name=MODELS_SHARE_NAME))
    worker = client.V1Container(
        name='{}-c'.format(job_name),
        image=IMAGE,
        image_pull_policy="IfNotPresent",
        command=[
            "/home/appuser/opt/tini", "--",
            "/home/appuser/opt/start_worker.sh"
        ],
        env=env,
        ports=[client.V1ContainerPort(container_port=8081,
                                      name="prometheus")],
        security_context=client.V1SecurityContext(
            privileged=True,
            capabilities=client.V1Capabilities(add=["SYS_ADMIN"])),
        resources=client.V1ResourceRequirements(
            limits={"memory": "5G", "cpu": "1"},
            requests={"memory": "5G", "cpu": "1"}),
        volume_mounts=[
            client.V1VolumeMount(mount_path="/home/appuser/opt/models",
                                 name="models-azurefiles",
                                 read_only=True)
        ])
    pod_template = client.V1PodTemplate()
    pod_template.template = client.V1PodTemplateSpec()
    # Annotations let Prometheus scrape worker metrics on port 8081.
    pod_template.template.metadata = client.V1ObjectMeta(
        annotations={
            "prometheus.io/scrape": "true",
            "prometheus.io/port": "8081"
        })
    # OnFailure restart policy: see
    # https://github.com/kubernetes/kubernetes/issues/20255
    pod_template.template.spec = client.V1PodSpec(
        containers=[worker],
        image_pull_secrets=[{
            "name": "azure-cr-secret"
        }],
        restart_policy="OnFailure",
        volumes=[models_volume])
    body = client.V1Job(api_version="batch/v1", kind="Job")
    body.metadata = client.V1ObjectMeta(namespace=NAMESPACE, name=job_name)
    body.status = client.V1JobStatus()
    body.spec = client.V1JobSpec(ttl_seconds_after_finished=100,
                                 template=pod_template.template)
    api = client.BatchV1Api()
    try:
        api_response = api.create_namespaced_job(NAMESPACE, body)
        print("api_response=" + str(api_response))
        return True
    except ApiException as e:
        logging.exception('error spawning new job')
        print("Exception when creating a job: %s\n" % e)