def run_kube_job(job_spec: dict, envs: dict, job_folder: str, timeout: int) -> str:
    # Derive a short, unique job name from the last two path components.
    job_tag = "-".join(job_folder.split("/")[-2:])
    job_uuid: str = f"ek-{str(uuid.uuid4())[:5]}-{job_tag}"
    job_spec["metadata"]["name"] = job_spec["metadata"]["name"].format(job_uuid)
    # DEPRECATED FOR USAGE WITH AZCOPY
    # job_spec["spec"]["template"]["spec"]["volumes"][0]["hostPath"]["path"] = job_folder
    job_spec["spec"]["template"]["spec"]["containers"][0]["env"] = to_kube_env(envs)

    job = pykube.Job(api, job_spec)
    job.create()
    start = datetime.datetime.now()
    status = "start"
    logging.info(f"JOB: {job_uuid} was started. Tag is {job_tag}")

    # Poll the job every 10 seconds until it succeeds or the timeout elapses.
    # total_seconds() is used instead of .seconds, which wraps at 24 hours.
    while (datetime.datetime.now() - start).total_seconds() < timeout:
        try:
            time.sleep(10)
            job.reload()
            status = status_checker(job=job)
            if status == "succeeded":
                logging.info(f"JOB: {job_uuid} finished. Output in {job_folder}")
                job.delete("Foreground")
                return status
        except requests.exceptions.HTTPError as exc:
            print(f"{exc} {traceback.format_exc()}")

    print(f"Timeout {timeout} was exceeded. Deleting the job {job_uuid}")
    job.delete("Foreground")
    return status
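# run_kube_job relies on two helpers, to_kube_env and status_checker, that are
# not defined in this module. A minimal sketch of what they could look like,
# assuming the usual pykube Job status layout and the 'wait' / 'succeeded' /
# 'failed' vocabulary used below; both shapes are assumptions, not the
# project's actual implementations.
def to_kube_env(envs: dict) -> list:
    """Convert a {name: value} mapping to the Kubernetes env-list format."""
    return [{"name": name, "value": str(value)} for name, value in envs.items()]


def status_checker(job: pykube.Job) -> str:
    """Map a Job's status block onto 'succeeded' / 'failed' / 'wait'."""
    status = job.obj.get("status", {})
    if status.get("succeeded"):
        return "succeeded"
    if status.get("failed"):
        return "failed"
    return "wait"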
def create_job(job_id, docker_img, cmd, volumes, env_vars, namespace):
    job = {
        'kind': 'Job',
        'apiVersion': 'batch/v1',
        'metadata': {
            'name': job_id,
            'namespace': namespace
        },
        'spec': {
            'autoSelector': True,
            'template': {
                'metadata': {
                    'name': job_id
                },
                'spec': {
                    'containers': [
                        {
                            'name': job_id,
                            'image': docker_img
                        },
                    ],
                    'restartPolicy': 'OnFailure'
                }
            }
        }
    }
    import shlex
    if cmd:
        # Split the command string into an argv list for the container.
        job['spec']['template']['spec']['containers'][0]['command'] = \
            shlex.split(cmd)
    if env_vars:
        job['spec']['template']['spec']['containers'][0]['env'] = [
            {'name': var, 'value': value} for var, value in env_vars.items()
        ]
    if volumes:
        job['spec']['template']['spec']['containers'][0]['volumeMounts'] = []
        job['spec']['template']['spec']['volumes'] = []
        for volume, mount_path in volumes:
            job['spec']['template']['spec']['containers'][0]['volumeMounts'].append({
                'name': volume['name'],
                'mountPath': mount_path
            })
            job['spec']['template']['spec']['volumes'].append(volume)
    # add better handling
    try:
        job_obj = pykube.Job(api, job)
        job_obj.create()
        return job_obj
    except pykube.exceptions.HTTPError:
        return None
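# A minimal usage sketch for create_job; the image, command, volume
# definition, and namespace below are illustrative placeholder values.
# Note that volumes is a list of (volume_definition, mount_path) pairs.
example_job = create_job(
    job_id='example-job',
    docker_img='busybox:latest',
    cmd='echo hello',
    volumes=[({'name': 'scratch', 'emptyDir': {}}, '/scratch')],
    env_vars={'GREETING': 'hello'},
    namespace='default',
)
if example_job is None:
    print('job creation failed')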
def run_caw(data="/work/apps/pipeline_test/data/", name="cawcl", pdName="caw",
            memory="5Gi", cpu="2", threads="2"):
    api = pykube.HTTPClient(pykube.KubeConfig.from_url("http://localhost:8001"))
    obj = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "name": name
        },
        "spec": {
            "template": {
                "metadata": {
                    "name": "caw"
                },
                "spec": {
                    "volumes": [
                        {
                            "name": "myapp-persistent-job",
                            "gcePersistentDisk": {
                                "pdName": pdName,
                                "fsType": "ext4"
                            }
                        }
                    ],
                    "containers": [
                        {
                            "name": "caw",
                            "image": "firog/ubuntujava",
                            "command": [
                                "bash",
                                "/work/apps/pipeline_test/flexible_location_pipeline.sh",
                                "/work/apps/pipeline_test/scratch/",
                                "/work/apps/",
                                "/work/apps/pipeline_test/ref/",
                                data,
                                threads
                            ],
                            "resources": {
                                "requests": {
                                    "memory": memory,
                                    "cpu": cpu
                                }
                            },
                            "volumeMounts": [
                                {
                                    "name": "myapp-persistent-job",
                                    "mountPath": "/work"
                                }
                            ]
                        }
                    ],
                    "restartPolicy": "Never"
                }
            }
        }
    }
    pykube.Job(api, obj).create()
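# run_caw reaches the API server via KubeConfig.from_url on localhost:8001,
# so it assumes a local API proxy (e.g. `kubectl proxy --port=8001`) is
# already running. An invocation sketch with illustrative argument values:
#
#   run_caw(name="cawcl-test", pdName="caw", memory="8Gi", cpu="4", threads="4")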
def create_job_with_message(self, input_blob_loc, output_blob_loc,
                            output_file_name):
    spec_file = os.getcwd() + '/workloads/' + os.environ.get('WORKLOAD')
    with open(spec_file, 'r') as spec:
        lines = spec.read()
    templated = self.inject_env(input_blob_loc, output_blob_loc,
                                output_file_name, lines)
    try:
        json_spec = json.loads(templated)
    except ValueError:
        # json.loads raises JSONDecodeError, a ValueError subclass.
        print("failed to parse JSON spec")
        return
    pykube.Job(self.api, json_spec).create()
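# create_job_with_message calls self.inject_env, which is not shown here.
# A plausible sketch, assuming it substitutes placeholder tokens in the
# workload template; the placeholder names are assumptions, not the
# project's actual template syntax.
def inject_env(self, input_blob_loc, output_blob_loc, output_file_name, lines):
    """Fill template placeholders with this job's blob locations."""
    return (lines
            .replace('{{INPUT_BLOB}}', input_blob_loc)
            .replace('{{OUTPUT_BLOB}}', output_blob_loc)
            .replace('{{OUTPUT_FILE}}', output_file_name))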
def launch_k8s():
    '''
    Launch a k8s job as specified by the above environment variables.
    '''
    kube_config = {
        "name": os.environ["kube_name"],
        "server": os.environ["kube_server"],
        "certificate-authority-data": os.environ["kube_cert_auth"].rstrip(),
        "user": os.environ["kube_user"],
        "pass": os.environ["kube_pass"]
    }

    kube_doc = """
apiVersion: v1
kind: Config
preferences: {{}}
clusters:
- cluster:
    certificate-authority-data: {certificate-authority-data}
    server: {server}
  name: {name}
contexts:
- context:
    cluster: {name}
    user: {name}
  name: {name}
current-context: {name}
users:
- name: {name}
  user:
    username: {user}
    password: {pass}
""".format(**kube_config)

    with open('/tmp/kube_config', 'w') as kube_config_file:
        kube_config_file.write(kube_doc)

    config = pykube.KubeConfig.from_file("/tmp/kube_config")
    api = pykube.HTTPClient(config)

    # Command arguments come from kube_cmd_* variables, sorted by key.
    cmd_prefix = 'kube_cmd_'
    cmd_array = [v for k, v in sorted(os.environ.items())
                 if k.startswith(cmd_prefix) and len(k) > len(cmd_prefix)]

    # AWS credentials are read from the 'aws-secrets' Kubernetes secret.
    default_env = [
        {"name": "AWS_ACCESS_KEY_ID",
         "valueFrom": {"secretKeyRef": {"name": "aws-secrets",
                                        "key": "access_key_id"}}},
        {"name": "AWS_SECRET_ACCESS_KEY",
         "valueFrom": {"secretKeyRef": {"name": "aws-secrets",
                                        "key": "secret_access_key"}}},
        {"name": "AWS_DEFAULT_REGION",
         "valueFrom": {"secretKeyRef": {"name": "aws-secrets",
                                        "key": "default_region"}}}
    ]

    # Forward any k8s_job_* variables into the container environment.
    env_forward_prefix = 'k8s_job_'
    forward_env = [
        {'name': k[len(env_forward_prefix):], 'value': v}
        for k, v in os.environ.items()
        if k.startswith(env_forward_prefix) and len(k) > len(env_forward_prefix)
    ]
    job_env = default_env + forward_env

    job_name = os.environ["kube_job_name"]
    job_container = {
        "name": job_name,
        "image": os.environ["kube_image"],
        "env": job_env
    }
    if len(cmd_array) > 0:
        job_container['command'] = cmd_array

    job_spec = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {"name": job_name},
        "spec": {
            "template": {
                "metadata": {"name": job_name},
                "spec": {
                    "containers": [job_container],
                    "restartPolicy": "Never"
                }
            }
        }
    }

    job = pykube.Job(api, job_spec)
    if job.exists():
        # Refresh the job execution metadata.
        checkJob = copy.deepcopy(job)
        checkJob.reload()
        reloadJob = False
        print('Current {} job status : {}'.format(
            job_name, json.dumps(checkJob.obj['status'])))
        if 'active' in checkJob.obj['status']:
            # jobStart = dateutil.parser.parse(j.obj['status']['startTime']).replace(tzinfo=None)
            # jobExpiry = datetime.utcnow() - timedelta(minutes=expiryMinutes)
            # reloadJob = jobStart <= jobExpiry
            reloadJob = False
        else:
            reloadJob = True

        if reloadJob:
            print('Reloading {} job'.format(job_name))
            # Clean up pods, leaving at most 10 stale containers on k8s.
            pods = pykube.Pod.objects(api).filter(
                namespace="default", selector={"job-name": job_name})
            if len(pods) > 10:
                sortedPods = sorted(
                    list(pods), key=lambda pod: pod.obj['status']['startTime'])
                while len(sortedPods) > 10:
                    pod = sortedPods.pop(0)
                    print('Deleting {} pod {}'.format(job_name, pod.name))
                    pod.delete()
            job.delete()
            job.create()
    else:
        print('Creating {} job'.format(job_name))
        job.create()
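# launch_k8s round-trips the kubeconfig through /tmp/kube_config. If writing
# to disk is undesirable, the same document can be fed to pykube directly --
# a sketch, assuming (as in pykube/pykube-ng) that KubeConfig's constructor
# accepts the parsed config dict:
import yaml


def api_from_doc(kube_doc: str) -> pykube.HTTPClient:
    """Build an HTTPClient from an in-memory kubeconfig YAML string."""
    return pykube.HTTPClient(pykube.KubeConfig(yaml.safe_load(kube_doc)))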
def handler(event, context):
    kube_config = {
        "name": os.environ["kube_name"],
        "server": os.environ["kube_server"],
        "certificate-authority-data": os.environ["kube_cert_auth"].rstrip(),
        "user": os.environ["kube_user"],
        "pass": os.environ["kube_pass"]
    }

    kube_doc = """
apiVersion: v1
kind: Config
preferences: {{}}
clusters:
- cluster:
    certificate-authority-data: {certificate-authority-data}
    server: {server}
  name: {name}
contexts:
- context:
    cluster: {name}
    user: {name}
  name: {name}
current-context: {name}
users:
- name: {name}
  user:
    username: {user}
    password: {pass}
""".format(**kube_config)

    with open('/tmp/kube_config', 'w') as kube_config_file:
        kube_config_file.write(kube_doc)

    config = pykube.KubeConfig.from_file("/tmp/kube_config")
    api = pykube.HTTPClient(config)

    # Command arguments come from kube_cmd_* variables, sorted by key.
    cmd_prefix = 'kube_cmd_'
    cmd_array = [v for k, v in sorted(os.environ.items())
                 if k.startswith(cmd_prefix) and len(k) > len(cmd_prefix)]

    default_env = [{
        "name": "AWS_ACCESS_KEY_ID",
        "valueFrom": {
            "secretKeyRef": {"name": "aws-secrets", "key": "access_key_id"}
        }
    }, {
        "name": "AWS_SECRET_ACCESS_KEY",
        "valueFrom": {
            "secretKeyRef": {"name": "aws-secrets", "key": "secret_access_key"}
        }
    }, {
        "name": "AWS_DEFAULT_REGION",
        "valueFrom": {
            "secretKeyRef": {"name": "aws-secrets", "key": "default_region"}
        }
    }]

    # Forward any k8s_job_* variables into the container environment.
    env_forward_prefix = 'k8s_job_'
    forward_env = [
        {'name': k[len(env_forward_prefix):], 'value': v}
        for k, v in os.environ.items()
        if k.startswith(env_forward_prefix) and len(k) > len(env_forward_prefix)
    ]
    job_env = default_env + forward_env

    job_name = os.environ["kube_job_name"]
    job_container = {
        "name": job_name,
        "image": os.environ["kube_image"],
        "env": job_env,
    }

    if "kube_privileged" in os.environ:
        p_mode = os.environ['kube_privileged']
        print('Job {} privileged mode: {}'.format(job_name, p_mode))
        # bool(p_mode) would be True for any non-empty string (even "false"),
        # so parse the flag explicitly.
        job_container['securityContext'] = {
            'privileged': p_mode.lower() in ('1', 'true', 'yes')
        }

    job_requests = {}
    if "kube_cpu_requests" in os.environ:
        job_requests['cpu'] = str(os.environ['kube_cpu_requests'])
    if "kube_mem_requests" in os.environ:
        job_requests['memory'] = str(os.environ['kube_mem_requests'])
    if job_requests:
        job_container['resources'] = {'requests': job_requests}
        print('Job {} resources: {}'.format(
            job_name, json.dumps(job_container['resources'])))

    if len(cmd_array) > 0:
        job_container['command'] = cmd_array

    pod_spec = {"containers": [job_container], "restartPolicy": "Never"}

    # Environment values are never None when the key exists, so a simple
    # truthiness check covers both the missing and the empty case.
    if os.environ.get('kube_nodegroup'):
        node_group = os.environ['kube_nodegroup']
        print('Job {} nodeGroup: {}'.format(job_name, node_group))
        pod_spec['nodeSelector'] = {'opsdx_nodegroup': node_group}

    batch_job_spec = {
        "template": {
            "metadata": {"name": job_name},
            "spec": pod_spec
        }
    }

    if "kube_active_deadline_seconds" in os.environ:
        try:
            deadline = int(os.environ['kube_active_deadline_seconds'])
            print('Job {} activeDeadlineSeconds: {}'.format(job_name, deadline))
            batch_job_spec['activeDeadlineSeconds'] = deadline
        except ValueError:
            print('Invalid kube_active_deadline_seconds environment variable, '
                  'skipping deadline.')

    job_spec = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {"name": job_name},
        "spec": batch_job_spec
    }
    print("job spec:")
    print(job_spec)

    job = pykube.Job(api, job_spec)
    if job.exists():
        # Refresh the job execution metadata.
        checkJob = copy.deepcopy(job)
        checkJob.reload()
        reloadJob = False
        print('Current {} job status : {}'.format(
            job_name, json.dumps(checkJob.obj['status'])))
        if 'active' in checkJob.obj['status']:
            # jobStart = dateutil.parser.parse(j.obj['status']['startTime']).replace(tzinfo=None)
            # jobExpiry = datetime.utcnow() - timedelta(minutes=expiryMinutes)
            # reloadJob = jobStart <= jobExpiry
            reloadJob = False
        else:
            reloadJob = True

        if reloadJob:
            print('Reloading {} job'.format(job_name))
            # Clean up pods, leaving at most 10 stale containers on k8s.
            pods = pykube.Pod.objects(api).filter(
                namespace="default", selector={"job-name": job_name})
            if len(pods) > 10:
                sortedPods = sorted(
                    list(pods), key=lambda pod: pod.obj['status']['startTime'])
                while len(sortedPods) > 10:
                    pod = sortedPods.pop(0)
                    print('Deleting {} pod {}'.format(job_name, pod.name))
                    pod.delete()
            job.delete()
            job.create()
    else:
        print('Creating {} job'.format(job_name))
        job.create()
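# A minimal local invocation sketch for handler; the environment values below
# are illustrative placeholders, not real cluster credentials.
if __name__ == '__main__':
    os.environ.update({
        'kube_name': 'example-cluster',
        'kube_server': 'https://example.invalid',
        'kube_cert_auth': '<base64-ca-data>',
        'kube_user': 'admin',
        'kube_pass': 'secret',
        'kube_job_name': 'example-job',
        'kube_image': 'busybox:latest',
        'kube_cmd_0': 'echo',
        'kube_cmd_1': 'hello',
    })
    handler(event=None, context=None)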
def create_job(self, file_name):
    with open(file_name) as stream:
        # safe_load avoids executing arbitrary YAML tags and does not need
        # the explicit Loader argument that plain yaml.load() requires.
        content = yaml.safe_load(stream)
    pykube.Job(self.api, content).create()
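# Usage sketch for the file-based create_job above; the manifest is an
# illustrative minimal batch/v1 Job written to a temporary file, and `runner`
# is assumed to be an instance of the class that defines create_job.
import tempfile

manifest = """
apiVersion: batch/v1
kind: Job
metadata:
  name: example-job
spec:
  template:
    spec:
      containers:
      - name: example-job
        image: busybox:latest
        command: ["echo", "hello"]
      restartPolicy: Never
"""

with tempfile.NamedTemporaryFile('w', suffix='.yaml', delete=False) as f:
    f.write(manifest)
# runner.create_job(f.name)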
def run_simulation(magnet_config, job_uuid, n_jobs, n_events, input_file,
                   const_field):
    # make random directory for ship docker
    # to store input files and output files
    input_dir = 'input_dir_{}'.format(job_uuid)
    flask_host_dir = '{}/{}'.format(config.FLASK_CONTAINER_DIRECTORY, input_dir)
    flask_host_dir = os.path.abspath(flask_host_dir)
    # host_outer_dir = '{}/{}'.format(config.HOST_DIRECTORY, input_dir)
    az_outer_dir = '{}/{}'.format(config.AZ_DIRECTORY, input_dir)
    os.mkdir(flask_host_dir)
    sys.stdout = open(os.path.join(flask_host_dir, "out.txt"), "w", buffering=1)
    sys.stderr = open(os.path.join(flask_host_dir, "err.txt"), "w", buffering=1)

    # save magnet config for ship in host directory
    magnet_config_path = os.path.join(flask_host_dir, "magnet_params.json")
    with open(magnet_config_path, 'w', encoding='utf-8') as f:
        json.dump(magnet_config, f, ensure_ascii=False, indent=4)

    result = {
        'uuid': None,
        'container_id': None,
        'container_status': 'starting',
        'message': None
    }
    redis.set(job_uuid, json.dumps(result))

    jobs = []
    uuids = []
    N_EVENTS = config.EVENTS_TOTAL if n_events is None else min(
        n_events, config.EVENTS_TOTAL)
    chunk_size = N_EVENTS // n_jobs
    for part_number in range(n_jobs):
        start_event_num = chunk_size * part_number
        # The last chunk absorbs the remainder of the division.
        if part_number + 1 == n_jobs:
            chunk_size += N_EVENTS % n_jobs - 1

        JOB_SPEC = deepcopy(config.JOB_SPEC)
        flask_host_dir_part = '{}/part_{}'.format(flask_host_dir, part_number)
        az_outer_dir_part = '{}/part_{}'.format(az_outer_dir, part_number)
        os.mkdir(flask_host_dir_part)

        magnet_config_path_part = os.path.join(flask_host_dir_part,
                                               "magnet_params.json")
        with open(magnet_config_path_part, 'w', encoding='utf-8') as f:
            json.dump(magnet_config, f, ensure_ascii=False, indent=4)

        per_job_uuid = "{}-{}".format(job_uuid, part_number)
        JOB_SPEC["metadata"]["name"] = "ship-job-{}".format(per_job_uuid)
        command = JOB_SPEC["spec"]["template"]["spec"]["containers"][0]["command"]
        command.append(",".join(map(str, magnet_config)))
        command.append(str(chunk_size))
        command.append(str(start_event_num))
        command.append(input_file)
        command.append(str(config.STEP_GEO))
        command.append(az_outer_dir_part)
        # print(JOB_SPEC)

        job_spec_config_file = os.path.join(flask_host_dir_part, "job_spec.json")
        with open(job_spec_config_file, 'w', encoding='utf-8') as f:
            json.dump(JOB_SPEC, f, ensure_ascii=False, indent=4)

        job = pykube.Job(api, JOB_SPEC)
        job.create()
        jobs.append(job)
        uuids.append(per_job_uuid)

        result = {
            'uuid': uuids,
            'container_id': [job.obj['metadata']['name'] for job in jobs],
            'container_status': job_status(
                [status_checker(job) for job in jobs]),
            'message': None
        }
        redis.set(job_uuid, json.dumps(result))
        time.sleep(1.)

    for job in jobs:
        job.reload()
    print(os.listdir(flask_host_dir))
    print([job.obj for job in jobs])

    failed_jobs = set()
    try:
        # Poll all jobs until none is still waiting; a job that exceeds the
        # per-job time limit is deleted and marked as failed.
        finished = False
        while not finished:
            statuses = []
            time.sleep(10)
            for index, job in enumerate(jobs):
                if index in failed_jobs:
                    statuses.append('failed')
                    continue
                status = 'wait'
                try:
                    job.reload()
                    status = status_checker(job=job)
                    if status == "succeeded":
                        print("JOB: {} finished".format(index))
                    elif status == "wait":
                        job_start_time = parser.parse(
                            job.obj["metadata"]["creationTimestamp"])
                        # total_seconds() instead of .seconds, which wraps
                        # at 24 hours.
                        dt = (datetime.datetime.now(tz.tzlocal()) -
                              job_start_time).total_seconds() / 60
                        print("JOB: {}, DT {}:".format(index, dt))
                        if dt > config.TIME_LIMIT:
                            status = "failed"
                            print("TIME LIMIT per job exceeded, deleting: {}"
                                  .format(job.obj['metadata']['name']))
                            job.delete()
                            failed_jobs.add(index)
                except requests.exceptions.HTTPError as e:
                    # catch only network/API errors
                    print(e, traceback.format_exc())
                statuses.append(status)
            finished = 'wait' not in statuses
        time.sleep(20)
        # print(os.listdir(flask_host_dir))

        # collect data from successfully finished jobs
        optimise_inputs = []
        for part_number, job in enumerate(jobs):
            if part_number in failed_jobs:
                continue
            part_dir = "{}/part_{}".format(flask_host_dir, part_number)
            with open(os.path.join(part_dir, 'job_status.json'), 'w') as j:
                json.dump(job.obj, j)
            optimise_input_path = os.path.join(part_dir, 'optimise_input.json')
            if status_checker(job=job) == 'succeeded' and \
                    os.path.exists(optimise_input_path):
                with open(optimise_input_path, 'r') as j:
                    optimise_inputs.append(json.loads(j.read()))

        kinematics = sum([optimise_input["kinematics"]
                          for optimise_input in optimise_inputs], [])
        params = [optimise_input["params"]
                  for optimise_input in optimise_inputs]
        params = params[0] if params else None
        veto_points = sum([optimise_input["veto_points"]
                           for optimise_input in optimise_inputs], [])
        l = [optimise_input["l"] for optimise_input in optimise_inputs]
        l = l[0] if l else None
        w = [optimise_input["w"] for optimise_input in optimise_inputs]
        w = w[0] if w else None

        status = job_status([status_checker(job) for job in jobs])
        if not optimise_inputs:
            status = "failed"
        result = {
            'uuid': uuids,
            'container_id': [job.obj['metadata']['name'] for job in jobs],
            'container_status': status,
            'kinematics': kinematics,
            "params": params,
            "veto_points": veto_points,
            "l": l,
            "w": w,
            'message': None
        }
        redis.set(job_uuid, json.dumps(result))
    except Exception as e:
        print(e, traceback.format_exc())
        result = {
            'uuid': uuids,
            'container_id': [job.obj['metadata']['name'] for job in jobs],
            'container_status': 'failed',
            'muons_momentum': None,
            'veto_points': None,
            'message': traceback.format_exc()
        }
        redis.set(job_uuid, json.dumps(result))
    # shutil.rmtree(flask_host_dir)
    # print(os.listdir(flask_host_dir))
    return result
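# run_simulation aggregates per-job states through job_status, which is not
# defined in this module. A plausible sketch, assuming the 'wait' /
# 'succeeded' / 'failed' vocabulary used above; the exact aggregation rules
# are an assumption.
def job_status(statuses: list) -> str:
    """Collapse individual job states into a single overall state."""
    if any(s == 'wait' for s in statuses):
        return 'running'
    if all(s == 'succeeded' for s in statuses):
        return 'succeeded'
    return 'failed'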