Example 1
File: job.py Project: rasmunk/corc
def run(provider, provider_kwargs, cluster=None, job=None, storage=None):
    # TODO: temporary fix, pull the s3 settings out of the storage dict
    s3 = storage["s3"]
    _validate_fields(
        provider=provider_kwargs, cluster=cluster, job=job, storage=storage, s3=s3
    )
    _required_run_arguments(provider_kwargs, cluster, job, storage, s3)

    response = {"job": {}}
    if "name" not in job["meta"] or not job["meta"]["name"]:
        since_epoch = int(time.time())
        job["meta"]["name"] = "{}-{}".format(JOB_DEFAULT_NAME, since_epoch)

    if "bucket_name" not in s3 or not s3["bucket_name"]:
        s3["bucket_name"] = job["meta"]["name"]

    container_engine_client = new_client(
        ContainerEngineClient,
        composite_class=ContainerEngineClientCompositeOperations,
        name=provider_kwargs["profile"]["name"],
    )

    compute_cluster = get_cluster_by_name(
        container_engine_client,
        provider_kwargs["profile"]["compartment_id"],
        name=cluster["name"],
    )

    if not compute_cluster:
        response["msg"] = "Failed to find a cluster with name: {}".format(
            cluster["name"]
        )
        return False, response

    refreshed = refresh_kube_config(
        compute_cluster.id, name=provider_kwargs["profile"]["name"]
    )
    if not refreshed:
        response["msg"] = "Failed to refresh the kubernetes config"
        return False, response

    node_manager = NodeManager()
    if not node_manager.discover():
        response["msg"] = "Failed to discover any nodes to schedule jobs on"
        return False, response

    node = node_manager.select()
    if not node:
        response["msg"] = "Failed to select a node to schedule on"
        return False, response

    # Schedule the job via the Kubernetes scheduler
    scheduler = KubenetesScheduler()

    jobio_args = [
        "jobio",
        "run",
    ]
    jobio_args.extend(job["commands"])
    jobio_args.extend(["--job-meta-name", job["meta"]["name"]])

    if "output_path" in job:
        jobio_args.extend(
            ["--job-output-path", job["output_path"],]
        )

    if "capture" in job and job["capture"]:
        jobio_args.append("--job-capture")

    if "debug" in job["meta"]:
        jobio_args.append("--job-meta-debug")

    if "env_override" in job["meta"]:
        jobio_args.append("--job-meta-env-override")

    # Maintained by the pod
    volumes = []
    # Maintained by the container
    volume_mounts = []
    # Environment to pass to the container
    envs = []

    # Prepare config for the scheduler
    scheduler_config = {}

    if storage and storage["enable"]:
        validate_dict_values(storage, required_storage_fields, throw=True)
        jobio_args.append("--storage-enable")

        # Means that results should be exported to the specified storage
        # Create kubernetes secrets
        core_api = client.CoreV1Api()

        # Storage endpoint credentials secret (Tied to a profile and job)
        secret_profile_name = "{}-{}-{}".format(
            STORAGE_CREDENTIALS_NAME, s3["name"], job["meta"]["name"]
        )
        try:
            storage_credentials_secret = core_api.read_namespaced_secret(
                secret_profile_name, KUBERNETES_NAMESPACE
            )
        except ApiException:
            storage_credentials_secret = None

        # volumes
        secret_volume_source = V1SecretVolumeSource(secret_name=secret_profile_name)
        secret_volume = V1Volume(name=secret_profile_name, secret=secret_volume_source)
        volumes.append(secret_volume)

        # Where the storage credentials should be mounted
        # in the compute unit
        secret_mount = V1VolumeMount(
            name=secret_profile_name,
            mount_path=storage["credentials_path"],
            read_only=True,
        )
        volume_mounts.append(secret_mount)

        if s3:
            validate_dict_values(s3, required_staging_values, verbose=True, throw=True)
            jobio_args.append("--storage-s3")
            # S3 storage
            # Look for s3 credentials and config files
            s3_config = load_aws_config(
                s3["config_file"], s3["credentials_file"], profile_name=s3["name"]
            )
            s3_config["endpoint_url"] = storage["endpoint"]

            if not storage_credentials_secret:
                secret_data = dict(
                    aws_access_key_id=s3_config["aws_access_key_id"],
                    aws_secret_access_key=s3_config["aws_secret_access_key"],
                )
                secret_metadata = V1ObjectMeta(name=secret_profile_name)
                secrets_config = dict(metadata=secret_metadata, string_data=secret_data)
                scheduler_config.update(dict(secret_kwargs=secrets_config))

            # TODO: unify the endpoint argument with the s3 config endpoint
            s3_resource = boto3.resource("s3", **s3_config)

            bucket = bucket_exists(s3_resource.meta.client, s3["bucket_name"])
            if not bucket:
                bucket = s3_resource.create_bucket(
                    Bucket=s3["bucket_name"],
                    CreateBucketConfiguration={
                        "LocationConstraint": s3_config["region_name"]
                    },
                )

            if "upload_path" in storage and storage["upload_path"]:
                # Upload local path to the bucket as designated input for the job
                uploaded = None
                if os.path.exists(storage["upload_path"]):
                    if os.path.isdir(storage["upload_path"]):
                        uploaded = upload_directory_to_s3(
                            s3_resource.meta.client,
                            storage["upload_path"],
                            s3["bucket_name"],
                            s3_prefix=s3["bucket_input_prefix"],
                        )
                    elif os.path.isfile(storage["upload_path"]):
                        s3_path = os.path.basename(storage["upload_path"])
                        if s3["bucket_input_prefix"]:
                            s3_path = os.path.join(s3["bucket_input_prefix"], s3_path)
                        # Upload
                        uploaded = upload_to_s3(
                            s3_resource.meta.client,
                            storage["upload_path"],
                            s3_path,
                            s3["bucket_name"],
                        )

                if not uploaded:
                    response["msg"] = "Failed to upload local path: {} to s3".format(
                        storage["upload_path"]
                    )
                    return False, response

            jobio_args.extend(
                [
                    "--s3-region-name",
                    s3_config["region_name"],
                    "--storage-secrets-dir",
                    storage["credentials_path"],
                    "--storage-endpoint",
                    storage["endpoint"],
                    "--storage-input-path",
                    storage["input_path"],
                    "--storage-output-path",
                    storage["output_path"],
                    "--bucket-name",
                    s3["bucket_name"],
                    "--bucket-input-prefix",
                    s3["bucket_input_prefix"],
                    "--bucket-output-prefix",
                    s3["bucket_output_prefix"],
                ]
            )

            # Provide a way to allow pod specific output prefixes
            field_ref = client.V1ObjectFieldSelector(field_path="metadata.name")
            env_var_source = client.V1EnvVarSource(field_ref=field_ref)
            # HACK: set the bucket output prefix to the name of the pod
            env_output_prefix = client.V1EnvVar(
                name="JOBIO_BUCKET_OUTPUT_PREFIX", value_from=env_var_source
            )
            envs.append(env_output_prefix)

    if scheduler_config:
        prepared = scheduler.prepare(**scheduler_config)
        if not prepared:
            response["msg"] = "Failed to prepare the scheduler"
            return False, response

    container_spec = dict(
        name=job["meta"]["name"],
        image=cluster["image"],
        env=envs,
        args=jobio_args,
        volume_mounts=volume_mounts,
    )

    # If the working directory does not exist inside the container,
    # it is created with permissions that prevent a non-root user
    # from expanding the s3 bucket into it, so pass it through explicitly
    if "working_dir" in job:
        container_spec.update({"working_dir": job["working_dir"]})

    # If the container requires a specific set of resources
    resources = {}
    if "min_cores" in job:
        resources["requests"] = {"cpu": job["min_cores"]}
    if "max_cores" in job:
        resources["limits"] = {"cpu": job["max_cores"]}
    if "min_memory" in job:
        resources["requests"].update({"memory": job["min_memory"]})
    if "max_memory" in job:
        resources["limits"].update({"memory": job["max_memory"]})

    if resources:
        resource_req = client.V1ResourceRequirements(**resources)
        container_spec.update({"resources": resource_req})

    pod_spec = dict(node_name=node.metadata.name, volumes=volumes, dns_policy="Default")

    job_spec = dict(
        backoff_limit=2,
        parallelism=job["meta"]["num_parallel"],
        completions=job["meta"]["num_jobs"],
    )

    task = dict(
        container_kwargs=container_spec,
        pod_spec_kwargs=pod_spec,
        job_spec_kwargs=job_spec,
    )

    scheduled = scheduler.submit(**task)
    if not scheduled:
        response["msg"] = "Failed to submit the job"
        return False, response

    response["job"] = scheduled
    response["msg"] = "Job submitted"
    return True, response
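
Usage note: a minimal sketch of how run() might be invoked from the same module. Every concrete value below is a placeholder assumption; only the nested dictionary shape mirrors the keys the function body actually reads.

# Hypothetical invocation of run(); all values are placeholders and only
# the dictionary layout follows the function above.
provider_kwargs = {
    "profile": {"name": "DEFAULT", "compartment_id": "ocid1.compartment..."}
}
cluster = {"name": "my-cluster", "image": "my-registry/jobio:latest"}
job = {
    "meta": {"name": "", "num_jobs": 1, "num_parallel": 1},
    "commands": ["echo", "hello"],
}
storage = {"enable": False, "s3": {"bucket_name": ""}}

submitted, response = run(
    "oci", provider_kwargs, cluster=cluster, job=job, storage=storage
)
print(submitted, response["msg"])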
Example 2
File: job.py Project: rasmunk/corc
def delete_job(provider_kwargs, cluster=None, job=None):
    # Avoid the mutable default argument pitfall
    if cluster is None:
        cluster = {}
    if job is None:
        job = {}
    _validate_fields(provider=provider_kwargs, job=job, cluster=cluster)
    _required_delete_job_arguments(cluster, job)

    response = {}
    # Ensure we have the newest config
    container_engine_client = new_client(
        ContainerEngineClient,
        composite_class=ContainerEngineClientCompositeOperations,
        name=provider_kwargs["profile"]["name"],
    )

    compute_cluster = get_cluster_by_name(
        container_engine_client,
        provider_kwargs["profile"]["compartment_id"],
        name=cluster["name"],
    )

    if not compute_cluster:
        response["msg"] = "Failed to find a cluster with name: {}".format(
            cluster["name"]
        )
        return False, response

    refreshed = refresh_kube_config(
        compute_cluster.id, name=provider_kwargs["profile"]["name"]
    )
    if not refreshed:
        response["msg"] = "Failed to refresh the kubernetes config"
        return False, response

    scheduler = KubenetesScheduler()

    if "name" in job["meta"] and job["meta"]["name"]:
        removed = scheduler.remove(job["meta"]["name"])
        if removed:
            response["msg"] = "Removed: {}".format(job["meta"]["name"])
            return True, response

        response["msg"] = "Failed to remove: {}".format(job["meta"]["name"])
        return False, response

    if "all" in job["meta"] and job["meta"]["all"]:
        jobs = scheduler.list_scheduled()

        if not jobs:
            response["msg"] = "Failed to retrieve scheduled jobs"
            return False, response

        failed = []
        # Remove each scheduled Kubernetes job
        for scheduled in jobs:
            removed = scheduler.remove(scheduled["metadata"]["name"])
            if not removed:
                failed.append(scheduled)

        if failed:
            response["msg"] = "Failed to remove: {}".format(
                [scheduled["metadata"]["name"] for scheduled in failed]
            )
            return False, response

        response["msg"] = "Removed all jobs"
        return True, response

    response["msg"] = "Neither a single name or all jobs were specified to be removed"
    return False, response
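
Usage note: a similar hedged sketch for delete_job(), removing a single job by name; the profile values and the job name below are placeholders.

# Hypothetical call removing one job by name; all values are placeholders.
provider_kwargs = {
    "profile": {"name": "DEFAULT", "compartment_id": "ocid1.compartment..."}
}
deleted, response = delete_job(
    provider_kwargs,
    cluster={"name": "my-cluster"},
    job={"meta": {"name": "job-1600000000"}},
)
print(deleted, response["msg"])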
Example 3
from oci.core import VirtualNetworkClient, VirtualNetworkClientCompositeOperations
from corc.providers.oci.helpers import new_client
from corc.providers.oci.network import delete_compartment_vcns

if __name__ == "__main__":
    compartment_id = ""
    network_client = new_client(
        VirtualNetworkClient,
        composite_class=VirtualNetworkClientCompositeOperations,
        name="",
    )

    deleted_vcns = delete_compartment_vcns(network_client, compartment_id)
    print(deleted_vcns)
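
Note that compartment_id and the profile name are left blank here; before the script can run, they must be filled in with a valid compartment OCID and the name of a configured OCI profile.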
Example 4
        dns_label="xnovotech",
    )

    subnet_options = dict(cidr_block="10.0.1.0/24",
                          display_name="workers",
                          dns_label="workers")

    options = dict(
        profile=oci_profile_options,
        vcn=vcn_options,
        subnet=subnet_options,
    )
    return options


if __name__ == "__main__":
    options = prepare_options()

    network_client = new_client(
        VirtualNetworkClient,
        composite_class=VirtualNetworkClientCompositeOperations,
        name=options["oci"]["name"],
    )

    stack = new_vcn_stack(
        network_client,
        options["oci"]["compartment_id"],
        vcn_kwargs=options["vcn"],
        subnet_kwargs=options["subnet"],
    )
Example 5
    def setUp(self):
        # Load compartment_id from the env
        prefix = ("oci", )
        oci_compartment_id = load_from_env_or_config(
            {"profile": {
                "compartment_id": {}
            }},
            prefix=gen_config_provider_prefix(prefix),
            throw=True,
        )

        oci_name = load_from_env_or_config(
            {"profile": {
                "name": {}
            }},
            prefix=gen_config_provider_prefix(prefix),
            throw=True,
        )

        self.oci_profile_options = {
            "compartment_id": oci_compartment_id,
            "name": oci_name,
        }

        test_name = "Test_VCN"
        vcn_name = test_name + "_Network"
        internet_gateway_name = test_name + "_Internet_Gateway"
        subnet_name = test_name + "_Subnet"

        # Add unique test postfix
        test_id = load_from_env_or_config(
            {"test": {"id": {}}},
            prefix=gen_config_provider_prefix(prefix),
        )
        if test_id:
            vcn_name += test_id
            internet_gateway_name += test_id
            subnet_name += test_id

        internet_gateway_options = dict(
            display_name=internet_gateway_name,
            is_enabled=True,
        )
        route_table_options = dict(
            routerules=[
                dict(
                    cidr_block=None,
                    destination="0.0.0.0/0",
                    destination_type="CIDR_BLOCK",
                )
            ]
        )

        self.vcn_options = dict(
            cidr_block="10.0.0.0/16",
            display_name=vcn_name,
            dns_label="ku",
        )

        self.subnet_options = dict(
            display_name=subnet_name,
            dns_label="workers",
        )

        self.options = dict(
            profile=self.oci_profile_options,
            vcn=self.vcn_options,
            internetgateway=internet_gateway_options,
            routetable=route_table_options,
            subnet=self.subnet_options,
        )

        self.network_client = new_client(
            VirtualNetworkClient,
            composite_class=VirtualNetworkClientCompositeOperations,
            name=self.options["profile"]["name"],
        )
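
The options assembled in this setUp mirror the dictionary that prepare_options() builds in Example 4, and in the test body they would presumably be passed to a VCN stack helper such as new_vcn_stack; the route rule with destination 0.0.0.0/0 sends all outbound traffic through the internet gateway.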
Example 6
    def setUp(self):
        # Load compartment_id from the env
        prefix = ("oci", )
        oci_compartment_id = load_from_env_or_config(
            {"profile": {
                "compartment_id": {}
            }},
            prefix=gen_config_provider_prefix(prefix),
            throw=True,
        )

        oci_name = load_from_env_or_config(
            {"profile": {
                "name": {}
            }},
            prefix=gen_config_provider_prefix(prefix),
            throw=True,
        )
        oci_profile_options = {
            "compartment_id": oci_compartment_id,
            "name": oci_name
        }

        test_name = "Test_Cluster"
        cluster_name = test_name
        node_name = test_name + "_Node"
        vcn_name = test_name + "_Network"
        internet_gateway_name = test_name + "_Internet_Gateway"
        subnet_name = test_name + "_Subnet"

        # Add unique test postfix
        test_id = load_from_env_or_config(
            {"test": {"id": {}}},
            prefix=gen_config_provider_prefix(prefix),
        )
        if test_id:
            cluster_name += test_id
            node_name += test_id
            vcn_name += test_id
            internet_gateway_name += test_id
            subnet_name += test_id

        # Sort in ascending order to avoid selecting complex images,
        # such as GPU-powered shapes, which the cluster typically
        # does not support.
        image_options = dict(
            operating_system="Oracle Linux",
            operating_system_version="7.8",
            limit="1",
            sort_order="ASC",
        )

        node_options = dict(
            availability_domain="lfcb:EU-FRANKFURT-1-AD-1",
            name=node_name,
            size=1,
            node_shape="VM.Standard1.1",
            image=image_options,
        )

        internet_gateway_options = dict(
            display_name=internet_gateway_name,
            is_enabled=True,
        )
        route_table_options = dict(
            routerules=[
                dict(
                    cidr_block=None,
                    destination="0.0.0.0/0",
                    destination_type="CIDR_BLOCK",
                )
            ]
        )

        vcn_options = dict(
            cidr_block="10.0.0.0/16",
            display_name=vcn_name,
            dns_label="ku",
        )
        subnet_options = dict(cidr_block="10.0.2.0/24",
                              display_name=subnet_name,
                              dns_label="workers")

        self.container_engine_client = new_client(
            ContainerEngineClient,
            composite_class=ContainerEngineClientCompositeOperations,
            name=oci_name,
        )

        cluster_options = dict(
            name=cluster_name,
            kubernetes_version=get_kubernetes_version(self.container_engine_client),
            node=node_options,
        )

        self.compute_client = new_client(
            ComputeClient,
            composite_class=ComputeClientCompositeOperations,
            name=oci_name,
        )

        self.network_client = new_client(
            VirtualNetworkClient,
            composite_class=VirtualNetworkClientCompositeOperations,
            name=oci_name,
        )

        self.options = dict(
            profile=oci_profile_options,
            cluster=cluster_options,
            vcn=vcn_options,
            internetgateway=internet_gateway_options,
            routetable=route_table_options,
            subnet=subnet_options,
        )
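
This setUp extends the VCN fixture from Example 5 with cluster, node, and image options. Note that the availability domain and node shape are region- and tenancy-specific, so the hard-coded values above presumably have to be adapted before the test can run in another tenancy.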