def run(provider, provider_kwargs, cluster=None, job=None, storage=None):
    # TODO, temp fix
    s3 = storage["s3"]

    _validate_fields(
        provider=provider_kwargs, cluster=cluster, job=job, storage=storage, s3=s3
    )
    _required_run_arguments(provider_kwargs, cluster, job, storage, s3)

    response = {"job": {}}
    if "name" not in job["meta"] or not job["meta"]["name"]:
        since_epoch = int(time.time())
        job["meta"]["name"] = "{}-{}".format(JOB_DEFAULT_NAME, since_epoch)

    if "bucket_name" not in s3 or not s3["bucket_name"]:
        s3["bucket_name"] = job["meta"]["name"]

    container_engine_client = new_client(
        ContainerEngineClient,
        composite_class=ContainerEngineClientCompositeOperations,
        name=provider_kwargs["profile"]["name"],
    )
    compute_cluster = get_cluster_by_name(
        container_engine_client,
        provider_kwargs["profile"]["compartment_id"],
        name=cluster["name"],
    )
    if not compute_cluster:
        response["msg"] = "Failed to find a cluster with name: {}".format(
            cluster["name"]
        )
        return False, response

    refreshed = refresh_kube_config(
        compute_cluster.id, name=provider_kwargs["profile"]["name"]
    )
    if not refreshed:
        response["msg"] = "Failed to refresh the kubernetes config"
        return False, response

    node_manager = NodeManager()
    if not node_manager.discover():
        response["msg"] = "Failed to discover any nodes to schedule jobs on"
        return False, response

    node = node_manager.select()
    if not node:
        response["msg"] = "Failed to select a node to schedule on"
        return False, response

    # Ensure we have the newest config
    scheduler = KubenetesScheduler()

    jobio_args = [
        "jobio",
        "run",
    ]
    jobio_args.extend(job["commands"])
    jobio_args.extend(["--job-meta-name", job["meta"]["name"]])

    if "output_path" in job:
        jobio_args.extend(["--job-output-path", job["output_path"]])

    if "capture" in job and job["capture"]:
        jobio_args.append("--job-capture")

    if "debug" in job["meta"]:
        jobio_args.append("--job-meta-debug")

    if "env_override" in job["meta"]:
        jobio_args.append("--job-meta-env-override")

    # Maintained by the pod
    volumes = []
    # Maintained by the container
    volume_mounts = []
    # Environment to pass to the container
    envs = []

    # Prepare config for the scheduler
    scheduler_config = {}

    if storage and storage["enable"]:
        validate_dict_values(storage, required_storage_fields, throw=True)
        jobio_args.append("--storage-enable")

        # Means that results should be exported to the specified storage
        # Create kubernetes secrets
        core_api = client.CoreV1Api()
        # storage_api = client.StorageV1Api()

        # Storage endpoint credentials secret (tied to a profile and job)
        secret_profile_name = "{}-{}-{}".format(
            STORAGE_CREDENTIALS_NAME, s3["name"], job["meta"]["name"]
        )
        try:
            storage_credentials_secret = core_api.read_namespaced_secret(
                secret_profile_name, KUBERNETES_NAMESPACE
            )
        except ApiException:
            storage_credentials_secret = None

        # Volumes
        secret_volume_source = V1SecretVolumeSource(secret_name=secret_profile_name)
        secret_volume = V1Volume(name=secret_profile_name, secret=secret_volume_source)
        volumes.append(secret_volume)

        # Where the storage credentials should be mounted
        # in the compute unit
        secret_mount = V1VolumeMount(
            name=secret_profile_name,
            mount_path=storage["credentials_path"],
            read_only=True,
        )
        volume_mounts.append(secret_mount)

        if s3:
            validate_dict_values(s3, required_staging_values, verbose=True, throw=True)
            jobio_args.append("--storage-s3")

            # S3 storage
            # Look for s3 credentials and config files
            s3_config = load_aws_config(
                s3["config_file"], s3["credentials_file"], profile_name=s3["name"],
            )
            s3_config["endpoint_url"] = storage["endpoint"]

            if not storage_credentials_secret:
                secret_data = dict(
                    aws_access_key_id=s3_config["aws_access_key_id"],
                    aws_secret_access_key=s3_config["aws_secret_access_key"],
                )
                secret_metadata = V1ObjectMeta(name=secret_profile_name)
                secrets_config = dict(
                    metadata=secret_metadata, string_data=secret_data
                )
                scheduler_config.update(dict(secret_kwargs=secrets_config))

            # TODO, unify the argument endpoint with the s3 config endpoint
            s3_resource = boto3.resource("s3", **s3_config)

            bucket = bucket_exists(s3_resource.meta.client, s3["bucket_name"])
            if not bucket:
                bucket = s3_resource.create_bucket(
                    Bucket=s3["bucket_name"],
                    CreateBucketConfiguration={
                        "LocationConstraint": s3_config["region_name"]
                    },
                )

            if "upload_path" in storage and storage["upload_path"]:
                # Upload the local path to the bucket as designated input for the job
                uploaded = None
                if os.path.exists(storage["upload_path"]):
                    if os.path.isdir(storage["upload_path"]):
                        uploaded = upload_directory_to_s3(
                            s3_resource.meta.client,
                            storage["upload_path"],
                            s3["bucket_name"],
                            s3_prefix=s3["bucket_input_prefix"],
                        )
                    elif os.path.isfile(storage["upload_path"]):
                        s3_path = os.path.basename(storage["upload_path"])
                        if s3["bucket_input_prefix"]:
                            s3_path = os.path.join(s3["bucket_input_prefix"], s3_path)
                        # Upload
                        uploaded = upload_to_s3(
                            s3_resource.meta.client,
                            storage["upload_path"],
                            s3_path,
                            s3["bucket_name"],
                        )

                if not uploaded:
                    response["msg"] = "Failed to upload the local path: {} to s3".format(
                        storage["upload_path"]
                    )
                    return False, response

            jobio_args.extend(
                [
                    "--s3-region-name",
                    s3_config["region_name"],
                    "--storage-secrets-dir",
                    storage["credentials_path"],
                    "--storage-endpoint",
                    storage["endpoint"],
                    "--storage-input-path",
                    storage["input_path"],
                    "--storage-output-path",
                    storage["output_path"],
                    "--bucket-name",
                    s3["bucket_name"],
                    "--bucket-input-prefix",
                    s3["bucket_input_prefix"],
                    "--bucket-output-prefix",
                    s3["bucket_output_prefix"],
                ]
            )

            # Provide a way to allow pod specific output prefixes
            field_ref = client.V1ObjectFieldSelector(field_path="metadata.name")
            env_var_source = client.V1EnvVarSource(field_ref=field_ref)
            # HACK, set the output prefix in the bucket to the name of the pod
            env_output_prefix = client.V1EnvVar(
                name="JOBIO_BUCKET_OUTPUT_PREFIX", value_from=env_var_source
            )
            envs.append(env_output_prefix)

    if scheduler_config:
        prepared = scheduler.prepare(**scheduler_config)
        if not prepared:
            response["msg"] = "Failed to prepare the scheduler"
            return False, response

    container_spec = dict(
        name=job["meta"]["name"],
        image=cluster["image"],
        env=envs,
        args=jobio_args,
        volume_mounts=volume_mounts,
    )
    # If the working directory does not exist inside the container,
    # the permissions will prevent the s3 bucket from being expanded
    # unless the user has root permissions
    if "working_dir" in job:
        container_spec.update({"working_dir": job["working_dir"]})

    # If the container requires a specific set of resources
    resources = {}
    if "min_cores" in job:
        resources["requests"] = {"cpu": job["min_cores"]}
    if "max_cores" in job:
        resources["limits"] = {"cpu": job["max_cores"]}
    if "min_memory" in job:
        resources.setdefault("requests", {}).update({"memory": job["min_memory"]})
    if "max_memory" in job:
        resources.setdefault("limits", {}).update({"memory": job["max_memory"]})

    if resources:
        resource_req = client.V1ResourceRequirements(**resources)
        container_spec.update({"resources": resource_req})

    pod_spec = dict(
        node_name=node.metadata.name, volumes=volumes, dns_policy="Default"
    )

    job_spec = dict(
        backoff_limit=2,
        parallelism=job["meta"]["num_parallel"],
        completions=job["meta"]["num_jobs"],
    )
    task = dict(
        container_kwargs=container_spec,
        pod_spec_kwargs=pod_spec,
        job_spec_kwargs=job_spec,
    )

    job = scheduler.submit(**task)
    if not job:
        response["msg"] = "Failed to submit the job"
        return False, response

    response["job"] = job
    response["msg"] = "Job submitted"
    return True, response
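

# Illustrative sketch (assumption, not part of the original module): the helper
# below shows the nested option dictionaries that run() reads directly. The
# function name, the "oci" provider string, and every concrete value are
# placeholders, and _validate_fields/_required_run_arguments may require
# additional fields beyond the ones listed here.
def _example_run_invocation():
    provider_kwargs = {
        "profile": {
            "name": "DEFAULT",
            "compartment_id": "ocid1.compartment.oc1..example",
        }
    }
    cluster = {"name": "example-cluster", "image": "example-registry/jobio:latest"}
    job = {
        "meta": {"name": "", "num_jobs": 1, "num_parallel": 1},
        "commands": ["echo", "hello"],
    }
    # With storage disabled, only the s3 name/bucket_name defaults are touched
    storage = {"enable": False, "s3": {"name": "default", "bucket_name": ""}}
    # run() returns (bool, response); on success response["job"] holds the
    # scheduled job description
    return run("oci", provider_kwargs, cluster=cluster, job=job, storage=storage)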
def delete_job(provider_kwargs, cluster={}, job={}):
    _validate_fields(provider=provider_kwargs, job=job, cluster=cluster)
    _required_delete_job_arguments(cluster, job)

    response = {}
    # Ensure we have the newest config
    container_engine_client = new_client(
        ContainerEngineClient,
        composite_class=ContainerEngineClientCompositeOperations,
        name=provider_kwargs["profile"]["name"],
    )
    compute_cluster = get_cluster_by_name(
        container_engine_client,
        provider_kwargs["profile"]["compartment_id"],
        name=cluster["name"],
    )
    if not compute_cluster:
        response["msg"] = "Failed to find a cluster with name: {}".format(
            cluster["name"]
        )
        return False, response

    refreshed = refresh_kube_config(
        compute_cluster.id, name=provider_kwargs["profile"]["name"]
    )
    if not refreshed:
        response["msg"] = "Failed to refresh the kubernetes config"
        return False, response

    scheduler = KubenetesScheduler()

    if "name" in job["meta"] and job["meta"]["name"]:
        removed = scheduler.remove(job["meta"]["name"])
        if removed:
            response["msg"] = "Removed: {}".format(job["meta"]["name"])
            return True, response
        response["msg"] = "Failed to remove: {}".format(job["meta"]["name"])
        return False, response

    if "all" in job["meta"] and job["meta"]["all"]:
        jobs = scheduler.list_scheduled()
        if not jobs:
            response["msg"] = "Failed to retrieve scheduled jobs"
            return False, response

        failed = []
        # Kubernetes jobs
        for scheduled_job in jobs:
            removed = scheduler.remove(scheduled_job["metadata"]["name"])
            if not removed:
                failed.append(scheduled_job)

        if failed:
            response["msg"] = "Failed to remove: {}".format(
                [failed_job["metadata"]["name"] for failed_job in failed]
            )
            return False, response

        response["msg"] = "Removed all jobs"
        return True, response

    response["msg"] = "Neither a single name nor all jobs were specified to be removed"
    return False, response
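

# Illustrative sketch (assumption, not part of the original module): delete_job()
# removes either one named job or every scheduled job, depending on job["meta"].
# The function name and all values are placeholders, and
# _required_delete_job_arguments may expect more fields than shown here.
def _example_delete_job_invocations():
    provider_kwargs = {
        "profile": {
            "name": "DEFAULT",
            "compartment_id": "ocid1.compartment.oc1..example",
        }
    }
    cluster = {"name": "example-cluster"}
    # Remove a single job by its generated name
    removed_one, response_one = delete_job(
        provider_kwargs, cluster=cluster, job={"meta": {"name": "job-1600000000"}}
    )
    # Remove every job known to the scheduler
    removed_all, response_all = delete_job(
        provider_kwargs, cluster=cluster, job={"meta": {"all": True}}
    )
    return removed_one, response_one, removed_all, response_all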
from oci.core import VirtualNetworkClient, VirtualNetworkClientCompositeOperations

from corc.providers.oci.helpers import new_client
from corc.providers.oci.network import delete_compartment_vcns


if __name__ == "__main__":
    compartment_id = ""
    network_client = new_client(
        VirtualNetworkClient,
        composite_class=VirtualNetworkClientCompositeOperations,
        name="",
    )
    deleted_vcns = delete_compartment_vcns(network_client, compartment_id)
    print(deleted_vcns)
        dns_label="xnovotech",
    )
    subnet_options = dict(
        cidr_block="10.0.1.0/24", display_name="workers", dns_label="workers"
    )

    options = dict(
        profile=oci_profile_options, vcn=vcn_options, subnet=subnet_options,
    )
    return options


if __name__ == "__main__":
    options = prepare_options()
    network_client = new_client(
        VirtualNetworkClient,
        composite_class=VirtualNetworkClientCompositeOperations,
        name=options["profile"]["name"],
    )
    stack = new_vcn_stack(
        network_client,
        options["profile"]["compartment_id"],
        vcn_kwargs=options["vcn"],
        subnet_kwargs=options["subnet"],
    )
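
    # Illustrative follow-up (assumption, not part of the original script): print
    # the created stack for inspection. If the stack should be torn down again,
    # the delete_compartment_vcns helper used in the companion cleanup script
    # could be applied to the same compartment; it is left commented out here
    # because it removes every VCN in the compartment.
    print(stack)
    # from corc.providers.oci.network import delete_compartment_vcns
    # deleted_vcns = delete_compartment_vcns(
    #     network_client, options["profile"]["compartment_id"]
    # )
    # print(deleted_vcns)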
def setUp(self):
    # Load compartment_id from the env
    prefix = ("oci",)
    oci_compartment_id = load_from_env_or_config(
        {"profile": {"compartment_id": {}}},
        prefix=gen_config_provider_prefix(prefix),
        throw=True,
    )
    oci_name = load_from_env_or_config(
        {"profile": {"name": {}}},
        prefix=gen_config_provider_prefix(prefix),
        throw=True,
    )
    self.oci_profile_options = {
        "compartment_id": oci_compartment_id,
        "name": oci_name,
    }

    test_name = "Test_VCN"
    vcn_name = test_name + "_Network"
    internet_gateway_name = test_name + "_Internet_Gateway"
    subnet_name = test_name + "_Subnet"

    # Add unique test postfix
    test_id = load_from_env_or_config(
        {"test": {"id": {}}}, prefix=gen_config_provider_prefix(prefix)
    )
    if test_id:
        vcn_name += test_id
        internet_gateway_name += test_id
        subnet_name += test_id

    internet_gateway_options = dict(
        display_name=internet_gateway_name, is_enabled=True
    )
    route_table_options = dict(
        routerules=[
            dict(
                cidr_block=None,
                destination="0.0.0.0/0",
                destination_type="CIDR_BLOCK",
            )
        ]
    )
    self.vcn_options = dict(
        cidr_block="10.0.0.0/16", display_name=vcn_name, dns_label="ku",
    )
    self.subnet_options = dict(display_name=subnet_name, dns_label="workers")

    self.options = dict(
        profile=self.oci_profile_options,
        vcn=self.vcn_options,
        internetgateway=internet_gateway_options,
        routetable=route_table_options,
        subnet=self.subnet_options,
    )

    self.network_client = new_client(
        VirtualNetworkClient,
        composite_class=VirtualNetworkClientCompositeOperations,
        name=self.options["profile"]["name"],
    )
def setUp(self):
    # Load compartment_id from the env
    prefix = ("oci",)
    oci_compartment_id = load_from_env_or_config(
        {"profile": {"compartment_id": {}}},
        prefix=gen_config_provider_prefix(prefix),
        throw=True,
    )
    oci_name = load_from_env_or_config(
        {"profile": {"name": {}}},
        prefix=gen_config_provider_prefix(prefix),
        throw=True,
    )
    oci_profile_options = {"compartment_id": oci_compartment_id, "name": oci_name}

    test_name = "Test_Cluster"
    cluster_name = test_name
    node_name = test_name + "_Node"
    vcn_name = test_name + "_Network"
    internet_gateway_name = test_name + "_Internet_Gateway"
    subnet_name = test_name + "_Subnet"

    # Add unique test postfix
    test_id = load_from_env_or_config(
        {"test": {"id": {}}}, prefix=gen_config_provider_prefix(prefix)
    )
    if test_id:
        cluster_name += test_id
        node_name += test_id
        vcn_name += test_id
        internet_gateway_name += test_id
        subnet_name += test_id

    # Sort in ascending order to ensure that complex images,
    # such as GPU powered shapes, are not selected.
    # These are typically not supported by the cluster
    image_options = dict(
        operating_system="Oracle Linux",
        operating_system_version="7.8",
        limit="1",
        sort_order="ASC",
    )
    node_options = dict(
        availability_domain="lfcb:EU-FRANKFURT-1-AD-1",
        name=node_name,
        size=1,
        node_shape="VM.Standard1.1",
        image=image_options,
    )
    internet_gateway_options = dict(
        display_name=internet_gateway_name, is_enabled=True
    )
    route_table_options = dict(
        routerules=[
            dict(
                cidr_block=None,
                destination="0.0.0.0/0",
                destination_type="CIDR_BLOCK",
            )
        ]
    )
    vcn_options = dict(
        cidr_block="10.0.0.0/16", display_name=vcn_name, dns_label="ku",
    )
    subnet_options = dict(
        cidr_block="10.0.2.0/24", display_name=subnet_name, dns_label="workers"
    )

    self.container_engine_client = new_client(
        ContainerEngineClient,
        composite_class=ContainerEngineClientCompositeOperations,
        name=oci_name,
    )
    cluster_options = dict(
        name=cluster_name,
        kubernetes_version=get_kubernetes_version(self.container_engine_client),
        node=node_options,
    )
    self.compute_client = new_client(
        ComputeClient,
        composite_class=ComputeClientCompositeOperations,
        name=oci_name,
    )
    self.network_client = new_client(
        VirtualNetworkClient,
        composite_class=VirtualNetworkClientCompositeOperations,
        name=oci_name,
    )
    self.options = dict(
        profile=oci_profile_options,
        cluster=cluster_options,
        vcn=vcn_options,
        internetgateway=internet_gateway_options,
        routetable=route_table_options,
        subnet=subnet_options,
    )