Example #1
    def _transaction(
        self,
        project_uuid: str,
        run_config: Dict[str, Any],
        pipeline: Pipeline,
    ):
        # specify the task_id beforehand to avoid race conditions
        # between the task and its presence in the db
        task_id = str(uuid.uuid4())

        # NOTE: we set the status of the run ourselves instead of
        # relying on Celery to report task status. This way we do not
        # have to configure a result backend (the default "rpc://" does
        # not give the results we would want).
        run = {
            "uuid": task_id,
            "pipeline_uuid": pipeline.properties["uuid"],
            "project_uuid": project_uuid,
            "status": "PENDING",
        }
        db.session.add(models.InteractivePipelineRun(**run))
        # need to flush because otherwise the bulk insertion of pipeline
        # steps will lead to foreign key errors
        # https://docs.sqlalchemy.org/en/13/orm/persistence_techniques.html#bulk-operations-caveats
        db.session.flush()

        # Set an initial value for the status of the pipeline steps that
        # will be run.
        step_uuids = [s.properties["uuid"] for s in pipeline.steps]

        pipeline_steps = []
        for step_uuid in step_uuids:
            pipeline_steps.append(
                models.PipelineRunStep(
                    run_uuid=task_id,
                    step_uuid=step_uuid,
                    status="PENDING",
                )
            )
        db.session.bulk_save_objects(pipeline_steps)
        run["pipeline_steps"] = pipeline_steps

        self.collateral_kwargs["project_uuid"] = project_uuid
        self.collateral_kwargs["task_id"] = task_id
        self.collateral_kwargs["pipeline"] = pipeline
        self.collateral_kwargs["run_config"] = run_config
        self.collateral_kwargs["env_variables"] = get_proj_pip_env_variables(
            project_uuid, pipeline.properties["uuid"]
        )
        return run
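
A hypothetical sketch of the collateral phase that consumes the kwargs stored above; the Celery app helper, task name and Pipeline.to_dict() call are assumptions, not taken from the snippet. The point mirrors the first comment: the task is dispatched with the pre-generated task_id so the Celery task and its database record share the same UUID.

    # Hypothetical collateral phase (helper and task names are assumptions).
    def _collateral(
        self,
        project_uuid: str,
        task_id: str,
        pipeline: Pipeline,
        run_config: Dict[str, Any],
        env_variables: Dict[str, Any],
    ):
        celery = make_celery(current_app)  # assumed app-level Celery factory
        run_config["env_variables"] = env_variables
        celery.send_task(
            "app.core.tasks.run_pipeline",  # assumed task name
            kwargs={
                "pipeline_definition": pipeline.to_dict(),  # assumed helper
                "project_uuid": project_uuid,
                "run_config": run_config,
            },
            task_id=task_id,  # the UUID that was stored in the DB above
        )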
Example #2
def _get_container_specs(
    uuid: str,
    project_uuid: str,
    pipeline_path: str,
    project_dir: str,
    host_userdir: str,
    network: str,
) -> Dict[str, dict]:
    """Constructs the container specifications for all resources.

    These specifications can be unpacked into the
    ``docker.client.DockerClient.containers.run`` method.

    Args:
        uuid: Some UUID to identify the session with. For interactive
            runs, using the pipeline UUID is required; for
            non-interactive runs we recommend using the pipeline run
            UUID.
        project_uuid: UUID of the project.
        pipeline_path: Path to the pipeline file. Passed to the
            containers through the ``ORCHEST_PIPELINE_PATH`` environment
            variable.
        project_dir: Project directory w.r.t. the host. Needed to
            construct the mounts.
        host_userdir: Path to the userdir on the host.
        network: Docker network. This is put directly into the specs, so
            that the containers are started on the specified network.

    Returns:
        Mapping from container name to container specification for the
        run method. The return dict looks as follows:
            container_specs = {
                'memory-server': spec dict,
                'jupyter-EG': spec dict,
                'jupyter-server': spec dict,
            }

    """
    # TODO: possibly add ``auto_remove=True`` to the specs.
    container_specs = {}
    mounts = _get_mounts(uuid, project_uuid, project_dir, host_userdir)

    container_specs["memory-server"] = {
        "image": "orchest/memory-server:latest",
        "detach": True,
        "mounts": [mounts["project_dir"], mounts["temp_volume"]],
        # TODO: name not unique... and uuid cannot be used.
        "name": f"memory-server-{project_uuid}-{uuid}",
        "network": network,
        # Set a ridiculous shm size and let plasma determine how much
        # it wants to consume (according to the setting in the pipeline
        # definition). Mounting `/dev/shm` directly is not supported on
        # Mac.
        "shm_size": "1000G",
        "environment": [
            f"ORCHEST_PIPELINE_PATH={pipeline_path}",
        ],
        # Labels are used to keep track of the container's attributes
        # through ``Session.from_container_IDs``.
        "labels": {
            "session_identity_uuid": uuid,
            "project_uuid": project_uuid
        },
    }

    # Run EG container, where EG_DOCKER_NETWORK ensures that kernels
    # started by the EG are on the same docker network as the EG.
    gateway_hostname = _config.JUPYTER_EG_SERVER_NAME.format(
        project_uuid=project_uuid[:_config.TRUNCATED_UUID_LENGTH],
        pipeline_uuid=uuid[:_config.TRUNCATED_UUID_LENGTH],
    )

    # Get user-configured environment variables for the EG to pass to
    # Jupyter kernels.
    try:
        env_variables = utils.get_proj_pip_env_variables(project_uuid, uuid)
    except Exception:
        # TODO: refactor _get_container_specs to be split up into
        # non-interactive and interactive container specs. In Celery no
        # app context is available, so user-defined environment
        # variables cannot be retrieved.
        env_variables = {}

    user_defined_env_vars = [
        f"{key}={value}" for key, value in env_variables.items()
    ]

    process_env_whitelist = ("EG_ENV_PROCESS_WHITELIST=ORCHEST_PIPELINE_UUID,"
                             "ORCHEST_PIPELINE_PATH,"
                             "ORCHEST_PROJECT_UUID,"
                             "ORCHEST_HOST_PROJECT_DIR,"
                             "ORCHEST_HOST_GID,")
    process_env_whitelist += ",".join(env_variables.keys())

    container_specs["jupyter-EG"] = {
        "image":
        "orchest/jupyter-enterprise-gateway",  # TODO: make not static.
        "detach":
        True,
        "mounts": [mounts.get("docker_sock"),
                   mounts.get("kernelspec")],
        "name":
        gateway_hostname,
        "environment": [
            f"EG_DOCKER_NETWORK={network}",
            "EG_MIRROR_WORKING_DIRS=True",
            "EG_LIST_KERNELS=True",
            "EG_KERNEL_WHITELIST=[]",
            "EG_PROHIBITED_UIDS=[]",
            'EG_UNAUTHORIZED_USERS=["dummy"]',
            'EG_UID_BLACKLIST=["-1"]',
            "EG_ALLOW_ORIGIN=*",
            process_env_whitelist,
            f"ORCHEST_PIPELINE_UUID={uuid}",
            f"ORCHEST_PIPELINE_PATH={pipeline_path}",
            f"ORCHEST_PROJECT_UUID={project_uuid}",
            f"ORCHEST_HOST_PROJECT_DIR={project_dir}",
            f'ORCHEST_HOST_GID={os.environ.get("ORCHEST_HOST_GID")}',
        ] + user_defined_env_vars,
        "user":
        "******",
        "network":
        network,
        # Labels are used to have a way of keeping track of the
        # containers attributes through ``Session.from_container_IDs``
        "labels": {
            "session_identity_uuid": uuid,
            "project_uuid": project_uuid
        },
    }

    jupyter_hostname = _config.JUPYTER_SERVER_NAME.format(
        project_uuid=project_uuid[:_config.TRUNCATED_UUID_LENGTH],
        pipeline_uuid=uuid[:_config.TRUNCATED_UUID_LENGTH],
    )
    # Run Jupyter server container.
    container_specs["jupyter-server"] = {
        "image":
        "orchest/jupyter-server:latest",  # TODO: make not static.
        "detach":
        True,
        "mounts": [
            mounts["project_dir"],
            mounts["jupyterlab"].get("lab"),
            mounts["jupyterlab"].get("user-settings"),
            mounts["jupyterlab"].get("data"),
        ],
        "name":
        jupyter_hostname,
        "network":
        network,
        "group_add": [os.environ.get("ORCHEST_HOST_GID")],
        "command": [
            "--allow-root",
            "--port=8888",
            "--no-browser",
            f"--gateway-url={'http://' + gateway_hostname}:8888",
            f"--notebook-dir={_config.PROJECT_DIR}",
            f"--ServerApp.base_url=/{jupyter_hostname}",
        ],
        # Labels are used to have a way of keeping track of the
        # containers attributes through ``Session.from_container_IDs``
        "labels": {
            "session_identity_uuid": uuid,
            "project_uuid": project_uuid
        },
    }

    return container_specs
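
Since the docstring states that these specs can be unpacked into ``docker.client.DockerClient.containers.run``, a minimal consumption sketch could look as follows; the client setup and the argument values are placeholders, not taken from the snippet.

import docker

client = docker.from_env()
specs = _get_container_specs(
    uuid="pipeline-or-run-uuid",
    project_uuid="project-uuid",
    pipeline_path="pipeline.orchest",
    project_dir="/host/path/to/project",
    host_userdir="/host/path/to/userdir",
    network="orchest",
)
# Start every session resource; each spec maps directly onto the kwargs
# of DockerClient.containers.run.
session_containers = {
    name: client.containers.run(**spec) for name, spec in specs.items()
}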
Example #3
    def post(self):
        """Queues a new job."""
        # TODO: possibly use marshal() on the post_data to make sure the
        # default values etc. are filled in. Note that we have moved
        # over to using flask_restx:
        # https://flask-restx.readthedocs.io/en/stable/api.html#flask_restx.marshal

        try:
            post_data = request.get_json()

            scheduled_start = post_data.get("scheduled_start", None)
            cron_schedule = post_data.get("cron_schedule", None)

            # To be scheduled ASAP and to be run once.
            if cron_schedule is None and scheduled_start is None:
                next_scheduled_time = None

            # To be scheduled according to argument, to be run once.
            elif cron_schedule is None:
                # Expected to be UTC.
                next_scheduled_time = datetime.fromisoformat(scheduled_start)

            # To follow a cron schedule. To be run an indefinite amount
            # of times.
            elif cron_schedule is not None and scheduled_start is None:
                if not croniter.is_valid(cron_schedule):
                    raise ValueError(f"Invalid cron schedule: {cron_schedule}")

                # Determine the next time the job should be scheduled,
                # starting from now.
                next_scheduled_time = croniter(
                    cron_schedule, datetime.now(timezone.utc)
                ).get_next(datetime)

            else:
                raise ValueError(
                    "Can't define both cron_schedule and scheduled_start.")

            job = {
                "uuid": post_data["uuid"],
                "name": post_data["name"],
                "project_uuid": post_data["project_uuid"],
                "pipeline_uuid": post_data["pipeline_uuid"],
                "pipeline_name": post_data["pipeline_name"],
                "schedule": cron_schedule,
                "parameters": post_data["parameters"],
                "env_variables": get_proj_pip_env_variables(
                    post_data["project_uuid"], post_data["pipeline_uuid"]
                ),
                "pipeline_definition": post_data["pipeline_definition"],
                "pipeline_run_spec": post_data["pipeline_run_spec"],
                "total_scheduled_executions": 0,
                "next_scheduled_time": next_scheduled_time,
                "status": "DRAFT",
                "strategy_json": post_data.get("strategy_json", {}),
                "created_time": datetime.now(timezone.utc),
            }
            db.session.add(models.Job(**job))
            db.session.commit()

        except Exception as e:
            db.session.rollback()
            current_app.logger.error(e)
            return {"message": str(e)}, 500

        return marshal(job, schema.job), 201
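
The cron branch above relies on croniter to compute the job's next fire time. A standalone sketch of that computation, with an assumed example schedule:

from datetime import datetime, timezone

from croniter import croniter

cron_schedule = "*/5 * * * *"  # assumed example: every five minutes

if not croniter.is_valid(cron_schedule):
    raise ValueError(f"Invalid cron schedule: {cron_schedule}")

# Next scheduled time starting from now (UTC), i.e. the value stored in
# the job's "next_scheduled_time" field above.
next_scheduled_time = croniter(
    cron_schedule, datetime.now(timezone.utc)
).get_next(datetime)
print(next_scheduled_time)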
Example #4
def _get_user_service_deployment_service_manifest(
    session_uuid: str,
    session_config: SessionConfig,
    service_config: Dict[str, Any],
    session_type: SessionType,
) -> Tuple[Dict[str, Any], Dict[str, Any], Optional[Dict[str, Any]]]:
    """Get deployment and service manifest for a user service.

    Args:
        session_uuid:
        session_config: See `Args` section in class :class:`Session`
            __init__ method.
        service_config: See `Args` section in class :class:`Session`
            __init__ method.
        session_type: Type of session: interactive, or
            noninteractive.

    Returns:
        Tuple of k8s deployment, service and ingress manifests to deploy
        this user service in the session. The ingress is None if
        service.exposed is False.

    """
    project_uuid = session_config["project_uuid"]
    pipeline_uuid = session_config["pipeline_uuid"]
    pipeline_path = session_config["pipeline_path"]
    project_dir = session_config["project_dir"]
    userdir_pvc = session_config["userdir_pvc"]
    img_mappings = session_config["env_uuid_to_image"]
    session_type = session_type.value

    # Template section
    is_pbp_enabled = service_config.get("preserve_base_path", False)
    ingress_url = "service-" + service_config["name"] + "-" + session_uuid
    if is_pbp_enabled:
        ingress_url = "pbp-" + ingress_url

    # Replace $BASE_PATH_PREFIX with the ingress URL. NOTE: this
    # substitution happens after service_config["name"] is read, so that
    # JSON entry does not support $BASE_PATH_PREFIX substitution. This
    # allows the user to specify $BASE_PATH_PREFIX as the value of an
    # env variable, so that the base path can be passed dynamically to
    # the service.
    service_str = json.dumps(service_config)
    service_str = service_str.replace("$BASE_PATH_PREFIX", ingress_url)
    service_config = json.loads(service_str)
    # End template section

    # Get user configured environment variables
    try:
        if session_type == "noninteractive":
            # Get job environment variable overrides
            user_env_variables = session_config["user_env_variables"]
        else:
            user_env_variables = utils.get_proj_pip_env_variables(
                project_uuid, pipeline_uuid)
    except Exception as e:
        logger.error(
            "Failed to fetch user_env_variables: %s [%s]" % (e, type(e))
        )
        traceback.print_exc()
        user_env_variables = {}

    environment = service_config.get("env_variables", {})
    # Inherited env vars supersede the ones defined in the service
    # config.
    for inherited_key in service_config.get("env_variables_inherit", []):
        if inherited_key in user_env_variables:
            environment[inherited_key] = user_env_variables[inherited_key]

    # These are all required for the Orchest SDK to work.
    environment["ORCHEST_PROJECT_UUID"] = project_uuid
    environment["ORCHEST_PIPELINE_UUID"] = pipeline_uuid
    # So that the SDK can access the pipeline file.
    environment["ORCHEST_PIPELINE_PATH"] = _config.PIPELINE_FILE
    environment["ORCHEST_SESSION_UUID"] = session_uuid
    environment["ORCHEST_SESSION_TYPE"] = session_type
    env = []
    for k, v in environment.items():
        env.append({"name": k, "value": v})

    volume_mounts = []
    volumes = []
    sbinds = service_config.get("binds", {})
    volumes_dict, volume_mounts_dict = _get_common_volumes_and_volume_mounts(
        userdir_pvc,
        project_dir,
        pipeline_path,
        container_project_dir=sbinds.get("/project-dir", _config.PROJECT_DIR),
        container_data_dir=sbinds.get("/data", _config.DATA_DIR),
    )
    # Can later be extended to add a mount for every "custom" key, i.e.
    # keys other than "/data" and "/project-dir".
    if "/data" in sbinds:
        volume_mounts.append(volume_mounts_dict["data"])
    if "/project-dir" in sbinds:
        volume_mounts.append(volume_mounts_dict["project-dir"])
    if "/data" in sbinds or "/project-dir" in sbinds:
        volumes.append(volumes_dict["userdir-pvc"])

    # To support orchest environments as services.
    image = service_config["image"]
    prefix = _config.ENVIRONMENT_AS_SERVICE_PREFIX
    if image.startswith(prefix):
        # Need to reference the ip because the local docker engine will
        # run the container, and if the image is missing it will prompt
        # a pull which will fail because the FQDN can't be resolved by
        # the local engine on the node. K8S_TODO: fix this.
        registry_ip = k8s_core_api.read_namespaced_service(
            _config.REGISTRY, _config.ORCHEST_NAMESPACE).spec.cluster_ip

        image = image.replace(prefix, "")
        image = img_mappings[image]
        image = registry_ip + "/" + image

    metadata = {
        "name": service_config["name"] + "-" + session_uuid,
        "labels": {
            "app": service_config["name"],
            "project_uuid": project_uuid,
            "session_uuid": session_uuid,
        },
    }

    deployment_manifest = {
        "apiVersion": "apps/v1",
        "kind": "Deployment",
        "metadata": metadata,
        "spec": {
            "replicas": 1,
            "selector": {
                "matchLabels": metadata["labels"]
            },
            "template": {
                "metadata": metadata,
                "spec": {
                    "terminationGracePeriodSeconds":
                    5,
                    "securityContext": {
                        "runAsUser": 0,
                        "runAsGroup": int(os.environ.get("ORCHEST_HOST_GID")),
                        "fsGroup": int(os.environ.get("ORCHEST_HOST_GID")),
                    },
                    "resources": {
                        "requests": {
                            "cpu": _config.USER_CONTAINERS_CPU_SHARES
                        }
                    },
                    "volumes":
                    volumes,
                    "containers": [{
                        "name":
                        metadata["name"],
                        "image":
                        image,
                        "imagePullPolicy":
                        "IfNotPresent",
                        "env":
                        env,
                        "volumeMounts":
                        volume_mounts,
                        "ports": [{
                            "containerPort": port
                        } for port in service_config["ports"]],
                    }],
                },
            },
        },
    }

    # K8S doesn't like empty commands.
    if service_config.get("command", ""):
        deployment_manifest["spec"]["template"]["spec"]["containers"][0][
            "command"] = [service_config["command"]]

    if "args" in service_config:
        deployment_manifest["spec"]["template"]["spec"]["containers"][0][
            "args"] = shlex.split(service_config["args"])

    service_manifest = {
        "apiVersion": "v1",
        "kind": "Service",
        "metadata": metadata,
        "spec": {
            "type": "ClusterIP",
            "selector": metadata["labels"],
            "ports": [{
                "port": port
            } for port in service_config["ports"]],
        },
    }

    if service_config["exposed"]:
        ingress_paths = []
        for port in service_config.get("ports", []):
            ingress_paths.append({
                "backend": {
                    "service": {
                        "name": metadata["name"],
                        "port": {
                            "number": port
                        },
                    }
                },
                "path":
                f"/({ingress_url}_{port}.*)"
                if is_pbp_enabled else f"/{ingress_url}_{port}(/|$)(.*)",
                "pathType":
                "Prefix",
            })

        ingress_metadata = copy.deepcopy(metadata)

        # Decide the rewrite target based on whether the base path is
        # preserved (pbp).
        ingress_metadata["annotations"] = {
            "nginx.ingress.kubernetes.io/rewrite-target": (
                "/$1" if is_pbp_enabled else "/$2"
            ),
        }

        if service_config.get("requires_authentication", True):
            # Needs to be the FQDN since the ingress nginx pod lives in
            # a different namespace.
            auth_url = (
                f"http://auth-server.{_config.ORCHEST_NAMESPACE}.svc.cluster.local/auth"
            )
            ingress_metadata["annotations"][
                "nginx.ingress.kubernetes.io/auth-url"] = auth_url
            ingress_metadata["annotations"][
                "nginx.ingress.kubernetes.io/auth-signin"] = "/login"

        ingress_rule = {}
        if _config.ORCHEST_FQDN is not None:
            ingress_rule["host"] = _config.ORCHEST_FQDN
        ingress_rule["http"] = {"paths": ingress_paths}

        ingress_manifest = {
            "apiVersion": "networking.k8s.io/v1",
            "kind": "Ingress",
            "metadata": ingress_metadata,
            "spec": {
                "ingressClassName": "nginx",
                "rules": [ingress_rule],
            },
        }
    else:
        ingress_manifest = None

    return deployment_manifest, service_manifest, ingress_manifest
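
A minimal sketch of how the returned manifests might be applied with the official kubernetes Python client; the client configuration and target namespace are assumptions, not taken from the snippet, and the function arguments are assumed to be in scope.

from kubernetes import client, config

config.load_incluster_config()  # or config.load_kube_config() outside the cluster
namespace = "orchest"  # assumed target namespace

deployment, service, ingress = _get_user_service_deployment_service_manifest(
    session_uuid, session_config, service_config, session_type
)
client.AppsV1Api().create_namespaced_deployment(namespace, deployment)
client.CoreV1Api().create_namespaced_service(namespace, service)
if ingress is not None:
    # Only exposed services come with an ingress manifest.
    client.NetworkingV1Api().create_namespaced_ingress(namespace, ingress)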
Example #5
def _get_jupyter_enterprise_gateway_deployment_service_manifest(
    session_uuid: str,
    session_config: SessionConfig,
    session_type: SessionType,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    project_uuid = session_config["project_uuid"]
    pipeline_uuid = session_config["pipeline_uuid"]
    pipeline_path = session_config["pipeline_path"]
    project_dir = session_config["project_dir"]
    userdir_pvc = session_config["userdir_pvc"]
    session_type = session_type.value

    metadata = {
        "name": f"jupyter-eg-{session_uuid}",
        "labels": {
            "app": "jupyter-eg",
            "project_uuid": project_uuid,
            "session_uuid": session_uuid,
        },
    }

    # Get user environment variables to pass to Jupyter kernels.
    try:
        user_defined_env_vars = utils.get_proj_pip_env_variables(
            project_uuid, pipeline_uuid)

        # NOTE: Don't allow users to change the `PATH` as it could break
        # user code execution. The `PATH` var is removed when starting
        # kernels through the jupyter-EG as well.
        user_defined_env_vars.pop("PATH", None)
    except Exception:
        user_defined_env_vars = {}

    process_env_whitelist = [
        "ORCHEST_PIPELINE_UUID",
        "ORCHEST_PIPELINE_PATH",
        "ORCHEST_PROJECT_UUID",
        "ORCHEST_USERDIR_PVC",
        "ORCHEST_PROJECT_DIR",
        "ORCHEST_PIPELINE_FILE",
        "ORCHEST_HOST_GID",
        "ORCHEST_SESSION_UUID",
        "ORCHEST_SESSION_TYPE",
        "ORCHEST_GPU_ENABLED_INSTANCE",
        "ORCHEST_REGISTRY",
    ]
    process_env_whitelist.extend(list(user_defined_env_vars.keys()))
    process_env_whitelist = ",".join(process_env_whitelist)

    # Need to reference the ip because the local docker engine will
    # run the container, and if the image is missing it will prompt
    # a pull which will fail because the FQDN can't be resolved by
    # the local engine on the node. K8S_TODO: fix this.
    registry_ip = k8s_core_api.read_namespaced_service(
        _config.REGISTRY, _config.ORCHEST_NAMESPACE).spec.cluster_ip
    environment = {
        "EG_MIRROR_WORKING_DIRS": "True",
        "EG_LIST_KERNELS": "True",
        "EG_KERNEL_WHITELIST": "[]",
        "EG_PROHIBITED_UIDS": "[]",
        "EG_UNAUTHORIZED_USERS": '["dummy"]',
        "EG_UID_BLACKLIST": '["-1"]',
        "EG_ALLOW_ORIGIN": "*",
        "EG_BASE_URL": f"/jupyter-server-{session_uuid}",
        # This is because images might need to be pulled on the node and
        # we aren't using a daemon or similar to pull images on the
        # node. See kernel-image-puller (KIP) for such an example.
        "EG_KERNEL_LAUNCH_TIMEOUT": "600",
        "EG_ENV_PROCESS_WHITELIST": process_env_whitelist,
        # Note: the docs say to use a string, but the script in charge
        # of launching the kernel expects an integer value and will fail
        # otherwise.
        "EG_LOG_LEVEL": "10",
        # "All kernels reside in the EG namespace if true, otherwise
        # KERNEL_NAMESPACE must be provided or one will be created for
        # each kernel."
        "EG_NAMESPACE": _config.ORCHEST_NAMESPACE,
        "EG_SHARED_NAMESPACE": "True",
        "ORCHEST_PIPELINE_UUID": pipeline_uuid,
        "ORCHEST_PIPELINE_PATH": _config.PIPELINE_FILE,
        "ORCHEST_PROJECT_UUID": project_uuid,
        "ORCHEST_USERDIR_PVC": userdir_pvc,
        "ORCHEST_PROJECT_DIR": project_dir,
        "ORCHEST_PIPELINE_FILE": pipeline_path,
        "ORCHEST_HOST_GID": os.environ.get("ORCHEST_HOST_GID"),
        "ORCHEST_SESSION_UUID": session_uuid,
        "ORCHEST_SESSION_TYPE": session_type,
        "ORCHEST_GPU_ENABLED_INSTANCE": str(CONFIG_CLASS.GPU_ENABLED_INSTANCE),
        "ORCHEST_REGISTRY": registry_ip,
    }
    environment = [{"name": k, "value": v} for k, v in environment.items()]
    user_defined_env_vars = [
        {"name": key, "value": value}
        for key, value in user_defined_env_vars.items()
    ]
    environment.extend(user_defined_env_vars)

    volumes_dict, volume_mounts_dict = _get_jupyter_volumes_and_volume_mounts(
        project_uuid, userdir_pvc, project_dir, pipeline_path)

    deployment_manifest = {
        "apiVersion": "apps/v1",
        "kind": "Deployment",
        "metadata": metadata,
        "spec": {
            "replicas": 1,
            "selector": {
                "matchLabels": metadata["labels"]
            },
            "template": {
                "metadata": metadata,
                "spec": {
                    "securityContext": {
                        "runAsUser": 0,
                        "runAsGroup": int(os.environ.get("ORCHEST_HOST_GID")),
                        "fsGroup": int(os.environ.get("ORCHEST_HOST_GID")),
                    },
                    "serviceAccount":
                    f"jupyter-eg-sa-{session_uuid}",
                    "serviceAccountName":
                    f"jupyter-eg-sa-{session_uuid}",
                    "terminationGracePeriodSeconds":
                    5,
                    "resources": {
                        "requests": {
                            "cpu": _config.USER_CONTAINERS_CPU_SHARES
                        }
                    },
                    "volumes": [
                        volumes_dict["userdir-pvc"],
                    ],
                    "containers": [{
                        "name":
                        metadata["name"],
                        "image": ("orchest/jupyter-enterprise-gateway:" +
                                  CONFIG_CLASS.ORCHEST_VERSION),
                        "imagePullPolicy":
                        "IfNotPresent",
                        "env":
                        environment,
                        "volumeMounts": [
                            volume_mounts_dict["kernelspec"],
                        ],
                        "ports": [{
                            "containerPort": 8888
                        }],
                    }],
                },
            },
        },
    }

    service_manifest = {
        "apiVersion": "v1",
        "kind": "Service",
        "metadata": metadata,
        "spec": {
            "type": "ClusterIP",
            "selector": metadata["labels"],
            "ports": [{
                "port": 8888
            }],
        },
    }
    return deployment_manifest, service_manifest
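
For context, a companion jupyter-server container (as in Example #2) would reach this gateway through the ClusterIP service defined above. A hedged sketch of the relevant flags, reusing the service name from `metadata` and the base URL set via `EG_BASE_URL`:

# Hedged sketch: the gateway host is the Service name from `metadata`
# above; the port matches the Service/container port (8888).
gateway_host = f"jupyter-eg-{session_uuid}"
jupyter_server_args = [
    "--allow-root",
    "--port=8888",
    "--no-browser",
    f"--gateway-url=http://{gateway_host}:8888",
    f"--ServerApp.base_url=/jupyter-server-{session_uuid}",
]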