def _required_delete_job_arguments(cluster, job):
    required_cluster_fields = {"name": str}
    validate_dict_values(cluster, required_cluster_fields, verbose=True, throw=True)

    required_job_fields = {"meta": dict}
    validate_dict_values(job, required_job_fields, verbose=True, throw=True)

    either_meta_fields = {"name": str, "all": str}
    validate_either_values(job["meta"], either_meta_fields, verbose=True, throw=True)
def _required_get_result_arguments(job, storage, s3):
    required_job_fields = {"meta": dict}
    validate_dict_values(job, required_job_fields, verbose=True, throw=True)

    required_meta_fields = {"name": str}
    validate_dict_values(job["meta"], required_meta_fields, verbose=True, throw=True)

    required_storage_fields = {"endpoint": str}
    validate_dict_values(storage, required_storage_fields, verbose=True, throw=True)

    validate_dict_values(s3, required_s3_fields, verbose=True, throw=True)
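# Note: required_s3_fields above, and the required_*_fields / valid_*_config
# names used by the functions below, are assumed to be module-level constants
# defined elsewhere in this file; they are not defined locally in this section.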
def run(provider, provider_kwargs, cluster=None, job=None, storage=None):
    # TODO, temp fix
    s3 = storage["s3"]

    _validate_fields(
        provider=provider_kwargs, cluster=cluster, job=job, storage=storage, s3=s3
    )
    _required_run_arguments(provider_kwargs, cluster, job, storage, s3)

    response = {"job": {}}
    if "name" not in job["meta"] or not job["meta"]["name"]:
        since_epoch = int(time.time())
        job["meta"]["name"] = "{}-{}".format(JOB_DEFAULT_NAME, since_epoch)

    if "bucket_name" not in s3 or not s3["bucket_name"]:
        s3["bucket_name"] = job["meta"]["name"]

    container_engine_client = new_client(
        ContainerEngineClient,
        composite_class=ContainerEngineClientCompositeOperations,
        name=provider_kwargs["profile"]["name"],
    )

    compute_cluster = get_cluster_by_name(
        container_engine_client,
        provider_kwargs["profile"]["compartment_id"],
        name=cluster["name"],
    )
    if not compute_cluster:
        response["msg"] = "Failed to find a cluster with name: {}".format(
            cluster["name"]
        )
        return False, response

    refreshed = refresh_kube_config(
        compute_cluster.id, name=provider_kwargs["profile"]["name"]
    )
    if not refreshed:
        response["msg"] = "Failed to refresh the kubernetes config"
        return False, response

    node_manager = NodeManager()
    if not node_manager.discover():
        response["msg"] = "Failed to discover any nodes to schedule jobs on"
        return False, response

    node = node_manager.select()
    if not node:
        response["msg"] = "Failed to select a node to schedule on"
        return False, response

    # Ensure we have the newest config
    scheduler = KubenetesScheduler()

    jobio_args = [
        "jobio",
        "run",
    ]
    jobio_args.extend(job["commands"])
    jobio_args.extend(["--job-meta-name", job["meta"]["name"]])

    if "output_path" in job:
        jobio_args.extend(["--job-output-path", job["output_path"]])

    if "capture" in job and job["capture"]:
        jobio_args.append("--job-capture")

    if "debug" in job["meta"]:
        jobio_args.append("--job-meta-debug")

    if "env_override" in job["meta"]:
        jobio_args.append("--job-meta-env-override")

    # Maintained by the pod
    volumes = []
    # Maintained by the container
    volume_mounts = []
    # Environment to pass to the container
    envs = []

    # Prepare config for the scheduler
    scheduler_config = {}

    if storage and storage["enable"]:
        validate_dict_values(storage, required_storage_fields, throw=True)
        jobio_args.append("--storage-enable")

        # Means that results should be exported to the specified storage
        # Create kubernetes secrets
        core_api = client.CoreV1Api()
        # storage_api = client.StorageV1Api()

        # Storage endpoint credentials secret (Tied to a profile and job)
        secret_profile_name = "{}-{}-{}".format(
            STORAGE_CREDENTIALS_NAME, s3["name"], job["meta"]["name"]
        )
        try:
            storage_credentials_secret = core_api.read_namespaced_secret(
                secret_profile_name, KUBERNETES_NAMESPACE
            )
        except ApiException:
            storage_credentials_secret = None

        # volumes
        secret_volume_source = V1SecretVolumeSource(secret_name=secret_profile_name)
        secret_volume = V1Volume(name=secret_profile_name, secret=secret_volume_source)
        volumes.append(secret_volume)

        # Where the storage credentials should be mounted
        # in the compute unit
        secret_mount = V1VolumeMount(
            name=secret_profile_name,
            mount_path=storage["credentials_path"],
            read_only=True,
        )
        volume_mounts.append(secret_mount)

        if s3:
            validate_dict_values(s3, required_staging_values, verbose=True, throw=True)
            jobio_args.append("--storage-s3")

            # S3 storage
            # Look for s3 credentials and config files
            s3_config = load_aws_config(
                s3["config_file"], s3["credentials_file"], profile_name=s3["name"],
            )
            s3_config["endpoint_url"] = storage["endpoint"]

            if not storage_credentials_secret:
                secret_data = dict(
                    aws_access_key_id=s3_config["aws_access_key_id"],
                    aws_secret_access_key=s3_config["aws_secret_access_key"],
                )
                secret_metadata = V1ObjectMeta(name=secret_profile_name)
                secrets_config = dict(
                    metadata=secret_metadata, string_data=secret_data
                )
                scheduler_config.update(dict(secret_kwargs=secrets_config))

            # If `access_key`
            # TODO, unify argument endpoint, with s3 config endpoint'
            s3_resource = boto3.resource("s3", **s3_config)

            bucket = bucket_exists(s3_resource.meta.client, s3["bucket_name"])
            if not bucket:
                bucket = s3_resource.create_bucket(
                    Bucket=s3["bucket_name"],
                    CreateBucketConfiguration={
                        "LocationConstraint": s3_config["region_name"]
                    },
                )

            if "upload_path" in storage and storage["upload_path"]:
                # Upload local path to the bucket as designated input for the job
                uploaded = None
                if os.path.exists(storage["upload_path"]):
                    if os.path.isdir(storage["upload_path"]):
                        uploaded = upload_directory_to_s3(
                            s3_resource.meta.client,
                            storage["upload_path"],
                            s3["bucket_name"],
                            s3_prefix=s3["bucket_input_prefix"],
                        )
                    elif os.path.isfile(storage["upload_path"]):
                        s3_path = os.path.basename(storage["upload_path"])
                        if s3["bucket_input_prefix"]:
                            s3_path = os.path.join(s3["bucket_input_prefix"], s3_path)
                        # Upload
                        uploaded = upload_to_s3(
                            s3_resource.meta.client,
                            storage["upload_path"],
                            s3_path,
                            s3["bucket_name"],
                        )

                if not uploaded:
                    response["msg"] = "Failed to upload local path: {} to s3".format(
                        storage["upload_path"]
                    )
                    return False, response

            jobio_args.extend(
                [
                    "--s3-region-name",
                    s3_config["region_name"],
                    "--storage-secrets-dir",
                    storage["credentials_path"],
                    "--storage-endpoint",
                    storage["endpoint"],
                    "--storage-input-path",
                    storage["input_path"],
                    "--storage-output-path",
                    storage["output_path"],
                    "--bucket-name",
                    s3["bucket_name"],
                    "--bucket-input-prefix",
                    s3["bucket_input_prefix"],
                    "--bucket-output-prefix",
                    s3["bucket_output_prefix"],
                ]
            )

            # Provide a way to allow pod specific output prefixes
            field_ref = client.V1ObjectFieldSelector(field_path="metadata.name")
            env_var_source = client.V1EnvVarSource(field_ref=field_ref)
            # HACK, Set the output prefix in the bucket to the name of the pod
            env_output_prefix = client.V1EnvVar(
                name="JOBIO_BUCKET_OUTPUT_PREFIX", value_from=env_var_source
            )
            envs.append(env_output_prefix)

    if scheduler_config:
        prepared = scheduler.prepare(**scheduler_config)
        if not prepared:
            response["msg"] = "Failed to prepare the scheduler"
            return False, response

    container_spec = dict(
        name=job["meta"]["name"],
        image=cluster["image"],
        env=envs,
        args=jobio_args,
        volume_mounts=volume_mounts,
    )
    # If the working directory does not exist inside the container
    # it will set permissions where it will be unable to expand the
    # s3 bucket if the user doesn't have root permissions
    if "working_dir" in job:
        container_spec.update({"working_dir": job["working_dir"]})

    # If the container requires a specific set of resources
    resources = {}
    if "min_cores" in job:
        resources["requests"] = {"cpu": job["min_cores"]}
    if "max_cores" in job:
        resources["limits"] = {"cpu": job["max_cores"]}
    if "min_memory" in job:
        # setdefault guards against a missing "requests" entry when only
        # a memory request is specified
        resources.setdefault("requests", {}).update({"memory": job["min_memory"]})
    if "max_memory" in job:
        resources.setdefault("limits", {}).update({"memory": job["max_memory"]})

    if resources:
        resource_req = client.V1ResourceRequirements(**resources)
        container_spec.update({"resources": resource_req})

    pod_spec = dict(
        node_name=node.metadata.name, volumes=volumes, dns_policy="Default"
    )

    job_spec = dict(
        backoff_limit=2,
        parallelism=job["meta"]["num_parallel"],
        completions=job["meta"]["num_jobs"],
    )

    task = dict(
        container_kwargs=container_spec,
        pod_spec_kwargs=pod_spec,
        job_spec_kwargs=job_spec,
    )

    job = scheduler.submit(**task)
    if not job:
        response["msg"] = "Failed to submit the job"
        return False, response

    response["job"] = job
    response["msg"] = "Job submitted"
    return True, response
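# Illustrative sketch (not part of the original module): the minimal shape of
# the dictionaries that run() reads from, with storage disabled. All values
# below are placeholders and would normally come from the corc configuration.
#
#   provider_kwargs = {
#       "profile": {"name": "DEFAULT", "compartment_id": "<compartment ocid>"}
#   }
#   cluster = {"name": "<cluster name>", "image": "<container image>"}
#   job = {
#       "meta": {"name": "", "num_parallel": 1, "num_jobs": 1},
#       "commands": ["<command>", "<argument>"],
#   }
#   storage = {"enable": False, "s3": {}}
#
#   submitted, response = run(
#       provider, provider_kwargs, cluster=cluster, job=job, storage=storage
#   )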
def _required_run_arguments(provider_kwargs, cluster, job, storage, s3):
    validate_dict_values(
        provider_kwargs["profile"], valid_profile_config, verbose=True, throw=True
    )
    validate_dict_values(
        cluster, required_run_cluster_fields, verbose=True, throw=True
    )
    validate_dict_values(job, required_run_job_fields, verbose=True, throw=True)
def validate_options(cls, options):
    if not isinstance(options, dict):
        raise TypeError("options is not a dictionary")

    validate_dict_fields(
        options["profile"], valid_profile_config, verbose=True, throw=True
    )
    validate_dict_values(
        options["profile"], valid_profile_config, verbose=True, throw=True
    )

    validate_dict_fields(
        options["cluster"], valid_cluster_config, verbose=True, throw=True
    )
    required_cluster_fields = {"name": str}
    validate_dict_values(
        options["cluster"], required_cluster_fields, verbose=True, throw=True
    )

    required_node_fields = {
        "availability_domain": str,
        "name": str,
        "size": int,
        "node_shape": str,
        "image": (str, dict),
    }
    validate_dict_fields(
        options["cluster"]["node"],
        valid_cluster_node_config,
        verbose=True,
        throw=True,
    )
    validate_dict_values(
        options["cluster"]["node"], required_node_fields, verbose=True, throw=True
    )

    required_vcn_fields = {"dns_label": str, "cidr_block": str}
    validate_dict_fields(options["vcn"], valid_vcn_config, verbose=True, throw=True)
    validate_dict_values(
        options["vcn"], required_vcn_fields, verbose=True, throw=True
    )

    required_subnet_fields = {"dns_label": str, "cidr_block": str}
    validate_dict_fields(
        options["subnet"], valid_subnet_config, verbose=True, throw=True
    )
    validate_dict_values(
        options["subnet"], required_subnet_fields, verbose=True, throw=True
    )

    required_internetgateway_fields = {"is_enabled": bool}
    validate_dict_fields(
        options["internetgateway"],
        valid_internet_gateway_config,
        verbose=True,
        throw=True,
    )
    validate_dict_values(
        options["internetgateway"],
        required_internetgateway_fields,
        verbose=True,
        throw=True,
    )

    required_route_table_fields = {"routerules": list}
    validate_dict_fields(
        options["routetable"], valid_route_table_config, verbose=True, throw=True
    )
    validate_dict_values(
        options["routetable"], required_route_table_fields, verbose=True, throw=True
    )

    required_routerules_fields = {
        "destination": str,
        "destination_type": str,
    }
    # Check each routerule
    if isinstance(options["routetable"]["routerules"], list):
        for route_rule in options["routetable"]["routerules"]:
            validate_dict_fields(
                route_rule, valid_route_rule_config, verbose=True, throw=True,
            )
            validate_dict_values(
                route_rule, required_routerules_fields, verbose=True, throw=True,
            )
    else:
        validate_dict_fields(
            options["routetable"]["routerules"],
            valid_route_rule_config,
            verbose=True,
            throw=True,
        )
        validate_dict_values(
            options["routetable"]["routerules"],
            required_routerules_fields,
            verbose=True,
            throw=True,
        )
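# Illustrative sketch (placeholder values, not from the original source): the
# nested options layout that validate_options() checks above. Only the keys
# required by this function are shown; the full valid_*_config field sets are
# defined elsewhere in the module.
#
#   options = {
#       "profile": {...},
#       "cluster": {
#           "name": "<cluster name>",
#           "node": {
#               "availability_domain": "<availability domain>",
#               "name": "<node pool name>",
#               "size": 1,
#               "node_shape": "<node shape>",
#               "image": "<image name or image dict>",
#           },
#       },
#       "vcn": {"dns_label": "<dns label>", "cidr_block": "10.0.0.0/16"},
#       "subnet": {"dns_label": "<dns label>", "cidr_block": "10.0.1.0/24"},
#       "internetgateway": {"is_enabled": True},
#       "routetable": {
#           "routerules": [
#               {"destination": "0.0.0.0/0", "destination_type": "<type>"}
#           ]
#       },
#   }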