Example 1
import datetime
import logging
import time
import traceback
import uuid

import pykube
import requests

# Assumes module-level `api`, `to_kube_env`, and `status_checker`
# (sketched after this example).
def run_kube_job(job_spec: dict, envs: dict, job_folder: str,
                 timeout: int) -> str:
    # Derive a short, unique job name from the folder path and a UUID prefix.
    job_tag = "-".join(job_folder.split("/")[-2:])
    job_uuid: str = f"ek-{str(uuid.uuid4())[:5]}-{job_tag}"
    job_spec["metadata"]["name"] = job_spec["metadata"]["name"].format(
        job_uuid)

    # DEPRECATED FOR USAGE WITH AZCOPY
    # job_spec["spec"]["template"]["spec"]["volumes"][0]["hostPath"]["path"] = job_folder

    job_spec["spec"]["template"]["spec"]["containers"][0]["env"] = to_kube_env(
        envs)

    job = pykube.Job(api, job_spec)
    job.create()
    start = datetime.datetime.now()
    status = "start"
    logging.info(f"JOB: {job_uuid} was started. Tag is {job_tag}")
    # total_seconds() avoids the day-wrapping behavior of timedelta.seconds.
    while (datetime.datetime.now() - start).total_seconds() < timeout:
        try:
            time.sleep(10)
            job.reload()
            status = status_checker(job=job)
            if status == "succeeded":
                logging.info(
                    f"JOB: {job_uuid} finished. Output in {job_folder}")
                job.delete("Foreground")
                return status
        except requests.exceptions.HTTPError as exc:
            # print_exc() returns None; format_exc() yields the traceback text.
            print(f"{exc}\n{traceback.format_exc()}")
    print(f"Timeout {timeout} was exceeded. Deleting the job {job_uuid}")
    job.delete("Foreground")
    return status
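
This example assumes a module-level pykube HTTPClient named `api` plus two helpers, `to_kube_env` and `status_checker`, that are not shown. A minimal sketch of what they might look like (hypothetical implementations, not the original project's code; Example 8 relies on the same `status_checker` contract):

import os

import pykube

# Hypothetical: a module-level client built from the default kubeconfig.
api = pykube.HTTPClient(
    pykube.KubeConfig.from_file(os.path.expanduser("~/.kube/config")))

def to_kube_env(envs: dict) -> list:
    # Convert {"NAME": "value"} into the Kubernetes container env-list format.
    return [{"name": name, "value": str(value)} for name, value in envs.items()]

def status_checker(job: pykube.Job) -> str:
    # Map the Job's status block onto the states the polling loop expects.
    status = job.obj.get("status", {})
    if status.get("succeeded"):
        return "succeeded"
    if status.get("failed"):
        return "failed"
    return "wait"
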
Example 2
import shlex

import pykube

# Assumes a module-level pykube.HTTPClient named `api`.
def create_job(job_id, docker_img, cmd, volumes, env_vars, namespace):
    job = {
        'kind': 'Job',
        'apiVersion': 'batch/v1',
        'metadata': {
            'name': job_id,
            'namespace': namespace
        },
        'spec': {
            'autoSelector': True,
            'template': {
                'metadata': {
                    'name': job_id
                },
                'spec': {
                    'containers': [
                        {
                            'name': job_id,
                            'image': docker_img
                        },
                    ],
                    'restartPolicy': 'OnFailure'
                }
            }
        }
    }

    container = job['spec']['template']['spec']['containers'][0]

    if cmd:
        container['command'] = shlex.split(cmd)

    if env_vars:
        container['env'] = [{'name': var, 'value': value}
                            for var, value in env_vars.items()]

    if volumes:
        container['volumeMounts'] = []
        job['spec']['template']['spec']['volumes'] = []
        for volume, mount_path in volumes:
            container['volumeMounts'].append({
                'name': volume['name'],
                'mountPath': mount_path
            })
            job['spec']['template']['spec']['volumes'].append(volume)

    # TODO: handle creation failures more gracefully than returning None.
    try:
        job_obj = pykube.Job(api, job)
        job_obj.create()
        return job_obj
    except pykube.exceptions.HTTPError:
        return None
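
A hypothetical invocation, assuming a module-level `api` client; the volume definition, image, and names are illustrative:

volume = {'name': 'workdir', 'emptyDir': {}}
job = create_job(
    job_id='demo-job',
    docker_img='busybox:latest',
    cmd='sh -c "echo hello"',
    volumes=[(volume, '/work')],
    env_vars={'GREETING': 'hello'},
    namespace='default',
)
if job is None:
    print('job creation failed')
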
Example 3
import pykube

def run_caw(data="/work/apps/pipeline_test/data/", name="cawcl", pdName="caw",
            memory="5Gi", cpu="2", threads="2"):
    # Talks to the API server through a local `kubectl proxy` on port 8001.
    api = pykube.HTTPClient(pykube.KubeConfig.from_url("http://localhost:8001"))

    obj = {
      "apiVersion": "batch/v1",
      "kind": "Job",
      "metadata": {
        "name": name
      },
      "spec": {
        "template": {
          "metadata": {
            "name": "caw"
          },
          "spec": {
            "volumes": [
              {
                "name": "myapp-persistent-job",
                "gcePersistentDisk": {
                  "pdName": pdName,
                  "fsType": "ext4"
                }
              }
            ],
            "containers": [
              {
                "name": "caw",
                "image": "firog/ubuntujava",
                "command": [
                  "bash",
                  "/work/apps/pipeline_test/flexible_location_pipeline.sh",
                  "/work/apps/pipeline_test/scratch/",
                  "/work/apps/",
                  "/work/apps/pipeline_test/ref/",
                  data,
                  threads
                ],
                "resources": {
                  "requests": {
                    "memory": memory,
                    "cpu": cpu
                  }
                },
                "volumeMounts": [
                  {
                    "name": "myapp-persistent-job",
                    "mountPath": "/work"
                  }
                ]
              }
            ],
            "restartPolicy": "Never"
          }
        }
      }
    }

    pykube.Job(api, obj).create()
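
The container mounts the GCE persistent disk named by `pdName` at /work, so the default paths assume the pipeline scripts already live on that disk, and `KubeConfig.from_url("http://localhost:8001")` expects a running `kubectl proxy` on its default port. An illustrative call:

run_caw(name="cawcl-test", threads="4")
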
Example 4
    def create_job_with_message(self, input_blob_loc, output_blob_loc,
                                output_file_name):
        # Fail loudly (KeyError) if WORKLOAD is unset, rather than raising a
        # confusing TypeError from concatenating None.
        spec_file = os.path.join(os.getcwd(), 'workloads',
                                 os.environ['WORKLOAD'])

        with open(spec_file, 'r') as spec:
            lines = spec.read()

        templated = self.inject_env(input_blob_loc, output_blob_loc,
                                    output_file_name, lines)

        try:
            json_spec = json.loads(templated)
        except json.JSONDecodeError as exc:
            print("failed to parse JSON spec: {}".format(exc))
            return

        pykube.Job(self.api, json_spec).create()
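
`inject_env` is a templating helper on the same class that is not shown. A plausible sketch, assuming the workload spec contains simple placeholder tokens (the token names are illustrative):

    def inject_env(self, input_blob_loc, output_blob_loc, output_file_name,
                   lines):
        # Substitute placeholder tokens in the raw spec text.
        return (lines
                .replace('$INPUT_BLOB', input_blob_loc)
                .replace('$OUTPUT_BLOB', output_blob_loc)
                .replace('$OUTPUT_FILE', output_file_name))
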
Example 5
import copy
import json
import os

import pykube

def launch_k8s():
  '''
  Launch a k8s job as specified by the kube_* environment variables.
  '''
  kube_config = {
    "name": os.environ["kube_name"],
    "server": os.environ["kube_server"],
    "certificate-authority-data": os.environ["kube_cert_auth"].rstrip(),
    "user": os.environ["kube_user"],
    "pass": os.environ["kube_pass"]
  }

  kube_doc = """
apiVersion: v1
kind: Config
preferences: {{}}
clusters:
- cluster:
    certificate-authority-data: {certificate-authority-data}
    server: {server}
  name: {name}
contexts:
- context:
    cluster: {name}
    user: {name}
  name: {name}
current-context: {name}
users:
- name: {name}
  user:
    username: {user}
    password: {pass}
""".format(**kube_config)

  with open('/tmp/kube_config', 'w') as kube_config_file:
    kube_config_file.write(kube_doc)

  config = pykube.KubeConfig.from_file("/tmp/kube_config")
  api = pykube.HTTPClient(config)

  cmd_prefix = 'kube_cmd_'
  cmd_array = [v for k, v in sorted(os.environ.items())
               if k.startswith(cmd_prefix) and len(k) > len(cmd_prefix)]

  default_env = [
    {"name": "AWS_ACCESS_KEY_ID",     "valueFrom": { "secretKeyRef": { "name": "aws-secrets", "key": "access_key_id"     } } },
    {"name": "AWS_SECRET_ACCESS_KEY", "valueFrom": { "secretKeyRef": { "name": "aws-secrets", "key": "secret_access_key" } } },
    {"name": "AWS_DEFAULT_REGION",    "valueFrom": { "secretKeyRef": { "name": "aws-secrets", "key": "default_region"    } } }
  ]

  env_forward_prefix = 'k8s_job_'
  forward_env = [{'name': k[len(env_forward_prefix):], 'value': v}
                 for k, v in os.environ.items()
                 if k.startswith(env_forward_prefix)
                 and len(k) > len(env_forward_prefix)]

  job_env = default_env + forward_env

  job_name = os.environ["kube_job_name"]
  job_container = {
    "name"  : job_name,
    "image" : os.environ["kube_image"],
    "env"   : job_env
  }

  if len(cmd_array) > 0:
    job_container['command'] = cmd_array

  job_spec = {
    "apiVersion": "batch/v1",
    "kind": "Job",
    "metadata": {
      "name": job_name
    },
    "spec": {
      "template": {
        "metadata": {
          "name": job_name
        },
        "spec": {
          "containers": [job_container],
          "restartPolicy": "Never"
        }
      }
    }
  }

  job = pykube.Job(api, job_spec)
  if job.exists():
    # Refresh the job execution metadata.
    checkJob = copy.deepcopy(job)
    checkJob.reload()

    reloadJob = False
    print('Current {} job status : {}'.format(job_name, json.dumps(checkJob.obj['status'])))

    if 'active' in checkJob.obj['status']:
      # jobStart = dateutil.parser.parse(j.obj['status']['startTime']).replace(tzinfo=None)
      # jobExpiry = datetime.utcnow() - timedelta(minutes=expiryMinutes)
      # reloadJob = jobStart <= jobExpiry:
      reloadJob = False
    else:
      reloadJob = True

    if reloadJob:
      print('Reloading {} job'.format(job_name))

      # Clean up pods, leaving at most 10 stale containers on k8s.
      pods = pykube.Pod.objects(api).filter(namespace="default", selector={"job-name": job_name})
      if len(pods) > 10:
        sortedPods = sorted(list(pods), key=lambda pod: pod.obj['status']['startTime'])
        while len(sortedPods) > 10:
          pod = sortedPods.pop(0)
          print('Deleting {} pod {}'.format(job_name, pod.name))
          pod.delete()

      job.delete()
      job.create()

  else:
    print('Creating {} job'.format(job_name))
    job.create()
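
A hypothetical driver that sets the expected environment variables before calling `launch_k8s` (all values illustrative; `kube_cert_auth` must hold the base64-encoded CA bundle):

os.environ.update({
    "kube_name": "demo-cluster",
    "kube_server": "https://k8s.example.com",
    "kube_cert_auth": "LS0tLS1CRUdJTi",  # placeholder, not a real CA bundle
    "kube_user": "admin",
    "kube_pass": "secret",
    "kube_job_name": "demo-job",
    "kube_image": "busybox:latest",
    "kube_cmd_1": "echo",
    "kube_cmd_2": "hello",
})
launch_k8s()
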
Example 6
import copy
import json
import os

import pykube

def handler(event, context):
    kube_config = {
        "name": os.environ["kube_name"],
        "server": os.environ["kube_server"],
        "certificate-authority-data": os.environ["kube_cert_auth"].rstrip(),
        "user": os.environ["kube_user"],
        "pass": os.environ["kube_pass"]
    }

    kube_doc = """
apiVersion: v1
kind: Config
preferences: {{}}
clusters:
- cluster:
    certificate-authority-data: {certificate-authority-data}
    server: {server}
  name: {name}
contexts:
- context:
    cluster: {name}
    user: {name}
  name: {name}
current-context: {name}
users:
- name: {name}
  user:
    username: {user}
    password: {pass}
""".format(**kube_config)

    with open('/tmp/kube_config', 'w') as kube_config_file:
        kube_config_file.write(kube_doc)

    config = pykube.KubeConfig.from_file("/tmp/kube_config")
    api = pykube.HTTPClient(config)

    cmd_prefix = 'kube_cmd_'
    cmd_array = [v for k, v in sorted(os.environ.items())
                 if k.startswith(cmd_prefix) and len(k) > len(cmd_prefix)]

    default_env = [{
        "name": "AWS_ACCESS_KEY_ID",
        "valueFrom": {
            "secretKeyRef": {
                "name": "aws-secrets",
                "key": "access_key_id"
            }
        }
    }, {
        "name": "AWS_SECRET_ACCESS_KEY",
        "valueFrom": {
            "secretKeyRef": {
                "name": "aws-secrets",
                "key": "secret_access_key"
            }
        }
    }, {
        "name": "AWS_DEFAULT_REGION",
        "valueFrom": {
            "secretKeyRef": {
                "name": "aws-secrets",
                "key": "default_region"
            }
        }
    }]

    env_forward_prefix = 'k8s_job_'
    forward_env = [{'name': k[len(env_forward_prefix):], 'value': v}
                   for k, v in os.environ.items()
                   if k.startswith(env_forward_prefix)
                   and len(k) > len(env_forward_prefix)]

    job_env = default_env + forward_env

    job_name = os.environ["kube_job_name"]
    job_container = {
        "name": job_name,
        "image": os.environ["kube_image"],
        "env": job_env,
    }

    if "kube_privileged" in os.environ:
        p_mode = os.environ['kube_privileged']
        print('Job {} privileged mode: {}'.format(job_name, p_mode))
        job_container['securityContext'] = {'privileged': bool(p_mode)}

    job_requests = {}
    if "kube_cpu_requests" in os.environ:
        cpu_resources = os.environ['kube_cpu_requests']
        job_requests['cpu'] = str(cpu_resources)

    if "kube_mem_requests" in os.environ:
        mem_resources = os.environ['kube_mem_requests']
        job_requests['memory'] = str(mem_resources)

    if job_requests:
        job_container['resources'] = {'requests': job_requests}
        print('Job {} resources: {}'.format(
            job_name, str(json.dumps(job_container['resources']))))

    if len(cmd_array) > 0:
        job_container['command'] = cmd_array

    pod_spec = {"containers": [job_container], "restartPolicy": "Never"}

    if "kube_nodegroup" in os.environ and os.environ[
            'kube_nodegroup'] is not None:
        node_group = os.environ['kube_nodegroup']
        print('Job {} nodeGroup: {}'.format(job_name, node_group))
        pod_spec['nodeSelector'] = {'opsdx_nodegroup': node_group}

    batch_job_spec = {
        "template": {
            "metadata": {
                "name": job_name
            },
            "spec": pod_spec
        }
    }

    if "kube_active_deadline_seconds" in os.environ:
        try:
            deadline = int(os.environ['kube_active_deadline_seconds'])
            print('Job {} activeDeadlineSeconds: {}'.format(
                job_name, deadline))
            batch_job_spec['activeDeadlineSeconds'] = deadline
        except ValueError:
            print(
                'Invalid kube_active_deadline_seconds environment variable, skipping deadline.'
            )

    job_spec = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "name": job_name
        },
        "spec": batch_job_spec
    }

    print("job spec:")
    print(job_spec)

    job = pykube.Job(api, job_spec)
    if job.exists():
        # Refresh the job execution metadata.
        checkJob = copy.deepcopy(job)
        checkJob.reload()

        reloadJob = False
        print('Current {} job status : {}'.format(
            job_name, json.dumps(checkJob.obj['status'])))

        if 'active' in checkJob.obj['status']:
            # jobStart = dateutil.parser.parse(j.obj['status']['startTime']).replace(tzinfo=None)
            # jobExpiry = datetime.utcnow() - timedelta(minutes=expiryMinutes)
            # reloadJob = jobStart <= jobExpiry:
            reloadJob = False
        else:
            reloadJob = True

        if reloadJob:
            print('Reloading {} job'.format(job_name))

            # Clean up pods, leaving at most 10 stale containers on k8s.
            pods = pykube.Pod.objects(api).filter(
                namespace="default", selector={"job-name": job_name})
            if len(pods) > 10:
                sortedPods = sorted(
                    list(pods), key=lambda pod: pod.obj['status']['startTime'])
                while len(sortedPods) > 10:
                    pod = sortedPods.pop(0)
                    print('Deleting {} pod {}'.format(job_name, pod.name))
                    pod.delete()

            job.delete()
            job.create()

    else:
        print('Creating {} job'.format(job_name))
        job.create()
Example 7
    def create_job(self, file_name):
        with open(file_name) as stream:
            # yaml.load without an explicit Loader is deprecated;
            # safe_load avoids constructing arbitrary Python objects.
            content = yaml.safe_load(stream)
        pykube.Job(self.api, content).create()
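
A minimal Job manifest this method could load, written out and submitted through a hypothetical `runner` instance (file name and contents are illustrative):

spec = """
apiVersion: batch/v1
kind: Job
metadata:
  name: demo-job
spec:
  template:
    spec:
      containers:
      - name: demo
        image: busybox:latest
        command: ["echo", "hello"]
      restartPolicy: Never
"""
with open("demo_job.yaml", "w") as f:
    f.write(spec)

runner.create_job("demo_job.yaml")  # `runner` is a hypothetical instance
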
Example 8
import datetime
import json
import os
import sys
import time
import traceback
from copy import deepcopy

import pykube
import requests
from dateutil import parser, tz

# Assumes module-level `api`, `config`, `redis`, `status_checker`,
# and `job_status` objects, as in the original project.

def run_simulation(magnet_config, job_uuid, n_jobs, n_events, input_file,
                   const_field):
    # make random directory for ship docker
    # to store input files and output files
    input_dir = 'input_dir_{}'.format(job_uuid)
    flask_host_dir = '{}/{}'.format(config.FLASK_CONTAINER_DIRECTORY,
                                    input_dir)
    flask_host_dir = os.path.abspath(flask_host_dir)
    # host_outer_dir = '{}/{}'.format(config.HOST_DIRECTORY, input_dir)
    az_outer_dir = '{}/{}'.format(config.AZ_DIRECTORY, input_dir)
    os.mkdir(flask_host_dir)
    sys.stdout = open(os.path.join(flask_host_dir, "out.txt"),
                      "w",
                      buffering=1)
    sys.stderr = open(os.path.join(flask_host_dir, "err.txt"),
                      "w",
                      buffering=1)

    # save magnet config for ship
    # in host directory
    magnet_config_path = os.path.join(flask_host_dir, "magnet_params.json")
    with open(magnet_config_path, 'w', encoding='utf-8') as f:
        json.dump(magnet_config, f, ensure_ascii=False, indent=4)
    result = {
        'uuid': None,
        'container_id': None,
        'container_status': 'starting',
        'message': None
    }
    redis.set(job_uuid, json.dumps(result))

    job_spec_config_file = os.path.join(flask_host_dir, "job_spec.json")
    jobs = []
    uuids = []

    N_EVENTS = config.EVENTS_TOTAL if n_events is None else min(
        n_events, config.EVENTS_TOTAL)
    chunk_size = N_EVENTS // n_jobs
    for part_number in range(n_jobs):
        start_event_num = chunk_size * part_number
        if part_number + 1 == n_jobs:
            chunk_size += N_EVENTS % n_jobs - 1
        JOB_SPEC = deepcopy(config.JOB_SPEC)
        flask_host_dir_part = '{}/part_{}'.format(flask_host_dir, part_number)
        az_outer_dir_part = '{}/part_{}'.format(az_outer_dir, part_number)
        os.mkdir(flask_host_dir_part)
        magnet_config_path_part = os.path.join(flask_host_dir_part,
                                               "magnet_params.json")
        with open(magnet_config_path_part, 'w', encoding='utf-8') as f:
            json.dump(magnet_config, f, ensure_ascii=False, indent=4)
        per_job_uuid = "{}-{}".format(job_uuid, part_number)
        JOB_SPEC["metadata"]["name"] = "ship-job-{}".format(per_job_uuid)
        JOB_SPEC["spec"]["template"]["spec"]["containers"][0][
            "command"].append(",".join(map(str, magnet_config)))
        JOB_SPEC["spec"]["template"]["spec"]["containers"][0][
            "command"].append(str(chunk_size))
        JOB_SPEC["spec"]["template"]["spec"]["containers"][0][
            "command"].append(str(start_event_num))
        JOB_SPEC["spec"]["template"]["spec"]["containers"][0][
            "command"].append(input_file)
        JOB_SPEC["spec"]["template"]["spec"]["containers"][0][
            "command"].append(str(config.STEP_GEO))
        JOB_SPEC["spec"]["template"]["spec"]["containers"][0][
            "command"].append(az_outer_dir_part)
        # print(JOB_SPEC)
        job_spec_config_file = os.path.join(flask_host_dir_part,
                                            "job_spec.json")
        with open(job_spec_config_file, 'w', encoding='utf-8') as f:
            json.dump(JOB_SPEC, f, ensure_ascii=False, indent=4)
        job = pykube.Job(api, JOB_SPEC)
        job.create()
        jobs.append(job)
        uuids.append(per_job_uuid)

    result = {
        'uuid': uuids,
        'container_id': [job.obj['metadata']['name'] for job in jobs],
        'container_status': job_status([status_checker(job) for job in jobs]),
        'message': None
    }
    redis.set(job_uuid, json.dumps(result))
    time.sleep(1.)
    for job in jobs:
        job.reload()
    print(os.listdir(flask_host_dir))
    finished = False
    print([job.obj for job in jobs])
    start_time = time.time()
    failed_jobs = set()
    try:
        while not finished:
            statuses = []
            time.sleep(10)
            for index, job in enumerate(jobs):
                if index in failed_jobs:
                    statuses.append('failed')
                    continue
                status = 'wait'
                try:
                    job.reload()
                    status = status_checker(job=job)
                    if status == "succeeded":
                        print("JOB: {} finished".format(index))
                    elif status == "wait":
                        job_start_time = parser.parse(
                            job.obj["metadata"]["creationTimestamp"])
                        # total_seconds() avoids timedelta.seconds day-wrap.
                        dt = (datetime.datetime.now(tz.tzlocal()) -
                              job_start_time).total_seconds() / 60
                        print("JOB: {}, DT {}:".format(index, dt))
                        if dt > config.TIME_LIMIT:
                            status = "failed"
                            print("TIME LIMIT per job exceeded, deleting: {}".
                                  format(job.obj['metadata']['name']))
                            job.delete()
                            failed_jobs.add(index)
                except requests.exceptions.HTTPError as e:
                    # Swallow only HTTP/network errors; the loop will retry.
                    print(e, traceback.format_exc())
                statuses.append(status)
            if 'wait' in statuses:
                finished = False
            else:
                finished = True
        time.sleep(20)
        # print(os.listdir(flask_host_dir))

        # collect data from successfully finished jobs
        optimise_inputs = []
        for part_number, job in enumerate(jobs):
            if part_number in failed_jobs:
                continue
            part_dir = os.path.join(flask_host_dir,
                                    "part_{}".format(part_number))
            with open(os.path.join(part_dir, 'job_status.json'), 'w') as j:
                json.dump(job.obj, j)
            optimise_input_path = os.path.join(part_dir,
                                               'optimise_input.json')
            if status_checker(job=job) == 'succeeded' and \
               os.path.exists(optimise_input_path):
                with open(optimise_input_path, 'r') as j:
                    optimise_input = json.loads(j.read())
                optimise_inputs.append(optimise_input)

        kinematics = sum([
            optimise_input["kinematics"] for optimise_input in optimise_inputs
        ], [])
        params = [
            optimise_input["params"] for optimise_input in optimise_inputs
        ]
        params = params[0] if params else None
        veto_points = sum([
            optimise_input["veto_points"] for optimise_input in optimise_inputs
        ], [])
        l = [optimise_input["l"] for optimise_input in optimise_inputs]
        l = l[0] if l else None
        w = [optimise_input["w"] for optimise_input in optimise_inputs]
        w = w[0] if w else None

        status = job_status([status_checker(job) for job in jobs])
        if not optimise_inputs:
            status = "failed"
        result = {
            'uuid': uuids,
            'container_id': [job.obj['metadata']['name'] for job in jobs],
            'container_status': status,
            'kinematics': kinematics,
            "params": params,
            "veto_points": veto_points,
            "l": l,
            "w": w,
            'message': None
        }
        redis.set(job_uuid, json.dumps(result))

    except Exception as e:
        print(e, traceback.format_exc())
        result = {
            'uuid': uuids,
            'container_id': [job.obj['metadata']['name'] for job in jobs],
            'container_status': 'failed',
            'muons_momentum': None,
            'veto_points': None,
            'message': traceback.format_exc()
        }
        redis.set(job_uuid, json.dumps(result))
    # shutil.rmtree(flask_host_dir)
    # print(os.listdir(flask_host_dir))
    return result
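
Both this example and Example 1 rely on a module-level `status_checker` (sketched after Example 1); this one additionally aggregates the per-job states with a `job_status` helper that is not shown. A plausible sketch (hypothetical, not the original project's code; the aggregate state names are assumptions):

def job_status(statuses):
    # Reduce a list of per-job states to a single aggregate state.
    if any(s == 'failed' for s in statuses):
        return 'failed'
    if any(s == 'wait' for s in statuses):
        return 'running'
    return 'succeeded'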