def _transaction(
    self,
    project_uuid: str,
    run_config: Dict[str, Any],
    pipeline: Pipeline,
):
    # Specify the task_id beforehand to avoid race conditions
    # between the task and its presence in the db.
    task_id = str(uuid.uuid4())

    # NOTE: we are setting the status of the run ourselves without
    # using the option of celery to get the status of tasks. This
    # way we do not have to configure a backend (where the default
    # of "rpc://" does not give the results we would want).
    run = {
        "uuid": task_id,
        "pipeline_uuid": pipeline.properties["uuid"],
        "project_uuid": project_uuid,
        "status": "PENDING",
    }
    db.session.add(models.InteractivePipelineRun(**run))
    # Need to flush because otherwise the bulk insertion of pipeline
    # steps will lead to foreign key errors.
    # https://docs.sqlalchemy.org/en/13/orm/persistence_techniques.html#bulk-operations-caveats
    db.session.flush()

    # Set an initial value for the status of the pipeline steps that
    # will be run.
    step_uuids = [s.properties["uuid"] for s in pipeline.steps]
    pipeline_steps = []
    for step_uuid in step_uuids:
        pipeline_steps.append(
            models.PipelineRunStep(
                **{
                    "run_uuid": task_id,
                    "step_uuid": step_uuid,
                    "status": "PENDING",
                }
            )
        )
    db.session.bulk_save_objects(pipeline_steps)
    run["pipeline_steps"] = pipeline_steps

    self.collateral_kwargs["project_uuid"] = project_uuid
    self.collateral_kwargs["task_id"] = task_id
    self.collateral_kwargs["pipeline"] = pipeline
    self.collateral_kwargs["run_config"] = run_config
    self.collateral_kwargs["env_variables"] = get_proj_pip_env_variables(
        project_uuid, pipeline.properties["uuid"]
    )
    return run
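# The pre-generated ``task_id`` only prevents the race condition if the
# collateral step reuses it when dispatching the Celery task, so the task id
# and the InteractivePipelineRun row share the same uuid. A minimal sketch of
# what that dispatch could look like is below; ``celery``, the task name and
# ``pipeline.to_dict()`` are illustrative assumptions, not necessarily the
# names used by the actual collateral effect.
def _collateral_sketch(
    celery, task_id, project_uuid, pipeline, run_config, env_variables
):
    celery.send_task(
        "app.core.tasks.run_pipeline",  # assumed task name
        kwargs={
            "project_uuid": project_uuid,
            "pipeline_definition": pipeline.to_dict(),
            "run_config": run_config,
            "env_variables": env_variables,
        },
        # Reuse the uuid that was already committed to the db in the
        # transactional phase.
        task_id=task_id,
    )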
def _get_container_specs(
    uuid: str,
    project_uuid: str,
    pipeline_path: str,
    project_dir: str,
    host_userdir: str,
    network: str,
) -> Dict[str, dict]:
    """Constructs the container specifications for all resources.

    These specifications can be unpacked into the
    ``docker.client.DockerClient.containers.run`` method.

    Args:
        uuid: Some UUID to identify the session with. For interactive
            runs the pipeline UUID is required; for non-interactive
            runs we recommend using the pipeline run UUID.
        project_uuid: UUID of the project.
        pipeline_path: Path to the pipeline definition file.
        project_dir: Project directory w.r.t. the host. Needed to
            construct the mounts.
        host_userdir: Path to the userdir on the host.
        network: Docker network. This is put directly into the specs,
            so that the containers are started on the specified
            network.

    Returns:
        Mapping from container name to container specification for the
        run method. The return dict looks as follows:
            container_specs = {
                'memory-server': spec dict,
                'jupyter-EG': spec dict,
                'jupyter-server': spec dict,
            }

    """
    # TODO: possibly add ``auto_remove=True`` to the specs.
    container_specs = {}
    mounts = _get_mounts(uuid, project_uuid, project_dir, host_userdir)

    container_specs["memory-server"] = {
        "image": "orchest/memory-server:latest",
        "detach": True,
        "mounts": [mounts["project_dir"], mounts["temp_volume"]],
        # TODO: name not unique... and uuid cannot be used.
        "name": f"memory-server-{project_uuid}-{uuid}",
        "network": network,
        # Set a ridiculous shm size and let plasma determine how much
        # it wants to consume (according to the setting in the pipeline
        # definition). Mounting `/dev/shm` directly is not supported on
        # Mac.
        "shm_size": "1000G",
        "environment": [
            f"ORCHEST_PIPELINE_PATH={pipeline_path}",
        ],
        # Labels are used to have a way of keeping track of the
        # container's attributes through ``Session.from_container_IDs``.
        "labels": {
            "session_identity_uuid": uuid,
            "project_uuid": project_uuid,
        },
    }

    # Run EG container, where EG_DOCKER_NETWORK ensures that kernels
    # started by the EG are on the same docker network as the EG.
    gateway_hostname = _config.JUPYTER_EG_SERVER_NAME.format(
        project_uuid=project_uuid[:_config.TRUNCATED_UUID_LENGTH],
        pipeline_uuid=uuid[:_config.TRUNCATED_UUID_LENGTH],
    )

    # Get user configured environment variables for the EG, to pass to
    # the Jupyter kernels.
    try:
        env_variables = utils.get_proj_pip_env_variables(project_uuid, uuid)
    except Exception:
        # TODO: refactor _get_container_specs to be split up into
        # noninteractive and interactive container_specs. In Celery no
        # app context is available, so user defined environment
        # variables cannot be retrieved.
        env_variables = {}
    user_defined_env_vars = [
        f"{key}={value}" for key, value in env_variables.items()
    ]

    process_env_whitelist = (
        "EG_ENV_PROCESS_WHITELIST=ORCHEST_PIPELINE_UUID,"
        "ORCHEST_PIPELINE_PATH,"
        "ORCHEST_PROJECT_UUID,"
        "ORCHEST_HOST_PROJECT_DIR,"
        "ORCHEST_HOST_GID,"
    )
    process_env_whitelist += ",".join([key for key in env_variables.keys()])

    container_specs["jupyter-EG"] = {
        "image": "orchest/jupyter-enterprise-gateway",  # TODO: make not static.
        "detach": True,
        "mounts": [mounts.get("docker_sock"), mounts.get("kernelspec")],
        "name": gateway_hostname,
        "environment": [
            f"EG_DOCKER_NETWORK={network}",
            "EG_MIRROR_WORKING_DIRS=True",
            "EG_LIST_KERNELS=True",
            "EG_KERNEL_WHITELIST=[]",
            "EG_PROHIBITED_UIDS=[]",
            'EG_UNAUTHORIZED_USERS=["dummy"]',
            'EG_UID_BLACKLIST=["-1"]',
            "EG_ALLOW_ORIGIN=*",
            process_env_whitelist,
            f"ORCHEST_PIPELINE_UUID={uuid}",
            f"ORCHEST_PIPELINE_PATH={pipeline_path}",
            f"ORCHEST_PROJECT_UUID={project_uuid}",
            f"ORCHEST_HOST_PROJECT_DIR={project_dir}",
            f'ORCHEST_HOST_GID={os.environ.get("ORCHEST_HOST_GID")}',
        ] + user_defined_env_vars,
        "user": "******",
        "network": network,
        # Labels are used to have a way of keeping track of the
        # container's attributes through ``Session.from_container_IDs``.
        "labels": {
            "session_identity_uuid": uuid,
            "project_uuid": project_uuid,
        },
    }

    jupyter_hostname = _config.JUPYTER_SERVER_NAME.format(
        project_uuid=project_uuid[:_config.TRUNCATED_UUID_LENGTH],
        pipeline_uuid=uuid[:_config.TRUNCATED_UUID_LENGTH],
    )

    # Run Jupyter server container.
    container_specs["jupyter-server"] = {
        "image": "orchest/jupyter-server:latest",  # TODO: make not static.
        "detach": True,
        "mounts": [
            mounts["project_dir"],
            mounts["jupyterlab"].get("lab"),
            mounts["jupyterlab"].get("user-settings"),
            mounts["jupyterlab"].get("data"),
        ],
        "name": jupyter_hostname,
        "network": network,
        "group_add": [os.environ.get("ORCHEST_HOST_GID")],
        "command": [
            "--allow-root",
            "--port=8888",
            "--no-browser",
            f"--gateway-url=http://{gateway_hostname}:8888",
            f"--notebook-dir={_config.PROJECT_DIR}",
            f"--ServerApp.base_url=/{jupyter_hostname}",
        ],
        # Labels are used to have a way of keeping track of the
        # container's attributes through ``Session.from_container_IDs``.
        "labels": {
            "session_identity_uuid": uuid,
            "project_uuid": project_uuid,
        },
    }

    return container_specs
def post(self):
    """Queues a new job."""
    # TODO: possibly use marshal() on the post_data. Note that we have
    # moved over to using flask_restx
    # https://flask-restx.readthedocs.io/en/stable/api.html#flask_restx.marshal
    # to make sure the default values etc. are filled in.
    try:
        post_data = request.get_json()

        scheduled_start = post_data.get("scheduled_start", None)
        cron_schedule = post_data.get("cron_schedule", None)

        # To be scheduled ASAP and to be run once.
        if cron_schedule is None and scheduled_start is None:
            next_scheduled_time = None

        # To be scheduled according to argument, to be run once.
        elif cron_schedule is None:
            # Expected to be UTC.
            next_scheduled_time = datetime.fromisoformat(scheduled_start)

        # To follow a cron schedule. To be run an indefinite amount
        # of times.
        elif cron_schedule is not None and scheduled_start is None:
            if not croniter.is_valid(cron_schedule):
                raise ValueError(f"Invalid cron schedule: {cron_schedule}")

            # Compute the next time the job should be scheduled,
            # starting from now.
            next_scheduled_time = croniter(
                cron_schedule, datetime.now(timezone.utc)
            ).get_next(datetime)

        else:
            raise ValueError(
                "Can't define both cron_schedule and scheduled_start."
            )

        job = {
            "uuid": post_data["uuid"],
            "name": post_data["name"],
            "project_uuid": post_data["project_uuid"],
            "pipeline_uuid": post_data["pipeline_uuid"],
            "pipeline_name": post_data["pipeline_name"],
            "schedule": cron_schedule,
            "parameters": post_data["parameters"],
            "env_variables": get_proj_pip_env_variables(
                post_data["project_uuid"], post_data["pipeline_uuid"]
            ),
            "pipeline_definition": post_data["pipeline_definition"],
            "pipeline_run_spec": post_data["pipeline_run_spec"],
            "total_scheduled_executions": 0,
            "next_scheduled_time": next_scheduled_time,
            "status": "DRAFT",
            "strategy_json": post_data.get("strategy_json", {}),
            "created_time": datetime.now(timezone.utc),
        }
        db.session.add(models.Job(**job))
        db.session.commit()
    except Exception as e:
        db.session.rollback()
        current_app.logger.error(e)
        return {"message": str(e)}, 500

    return marshal(job, schema.job), 201
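# Standalone illustration (not part of the endpoint) of how the cron branch
# above derives ``next_scheduled_time`` with croniter; the schedule string is
# just an example value.
from datetime import datetime, timezone

from croniter import croniter

cron_schedule = "*/15 * * * *"  # every 15 minutes
assert croniter.is_valid(cron_schedule)
# get_next(datetime) returns the first tick strictly after the start time,
# e.g. starting at 12:07 UTC the next run would be 12:15 UTC.
next_scheduled_time = croniter(
    cron_schedule, datetime.now(timezone.utc)
).get_next(datetime)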
def _get_user_service_deployment_service_manifest(
    session_uuid: str,
    session_config: SessionConfig,
    service_config: Dict[str, Any],
    session_type: SessionType,
) -> Tuple[Dict[str, Any], Dict[str, Any], Optional[Dict[str, Any]]]:
    """Get deployment and service manifest for a user service.

    Args:
        session_uuid:
        session_config: See `Args` section in class :class:`Session`
            __init__ method.
        service_config: See `Args` section in class :class:`Session`
            __init__ method.
        session_type: Type of session: interactive or noninteractive.

    Returns:
        Tuple of k8s deployment, service and ingress manifests to
        deploy this user service in the session. The ingress is None
        if service.exposed is False.

    """
    project_uuid = session_config["project_uuid"]
    pipeline_uuid = session_config["pipeline_uuid"]
    pipeline_path = session_config["pipeline_path"]
    project_dir = session_config["project_dir"]
    userdir_pvc = session_config["userdir_pvc"]
    img_mappings = session_config["env_uuid_to_image"]
    session_type = session_type.value

    # Template section
    is_pbp_enabled = service_config.get("preserve_base_path", False)
    ingress_url = "service-" + service_config["name"] + "-" + session_uuid
    if is_pbp_enabled:
        ingress_url = "pbp-" + ingress_url

    # Replace $BASE_PATH_PREFIX with service_base_url. NOTE: this
    # substitution happens after service_config["name"] is read, so
    # that JSON entry does not support $BASE_PATH_PREFIX substitution.
    # This allows the user to specify $BASE_PATH_PREFIX as the value of
    # an env variable, so that the base path can be passed dynamically
    # to the service.
    service_str = json.dumps(service_config)
    service_str = service_str.replace("$BASE_PATH_PREFIX", ingress_url)
    service_config = json.loads(service_str)
    # End template section

    # Get user configured environment variables.
    try:
        if session_type == "noninteractive":
            # Get job environment variable overrides.
            user_env_variables = session_config["user_env_variables"]
        else:
            user_env_variables = utils.get_proj_pip_env_variables(
                project_uuid, pipeline_uuid
            )
    except Exception as e:
        logger.error(
            "Failed to fetch user_env_variables: %s [%s]" % (e, type(e))
        )
        traceback.print_exc()
        user_env_variables = {}

    environment = service_config.get("env_variables", {})
    # Inherited env vars supersede the ones defined in the service
    # config.
    for inherited_key in service_config.get("env_variables_inherit", []):
        if inherited_key in user_env_variables:
            environment[inherited_key] = user_env_variables[inherited_key]

    # These are all required for the Orchest SDK to work.
    environment["ORCHEST_PROJECT_UUID"] = project_uuid
    environment["ORCHEST_PIPELINE_UUID"] = pipeline_uuid
    # So that the SDK can access the pipeline file.
    environment["ORCHEST_PIPELINE_PATH"] = _config.PIPELINE_FILE
    environment["ORCHEST_SESSION_UUID"] = session_uuid
    environment["ORCHEST_SESSION_TYPE"] = session_type
    env = []
    for k, v in environment.items():
        env.append({"name": k, "value": v})

    volume_mounts = []
    volumes = []
    sbinds = service_config.get("binds", {})
    volumes_dict, volume_mounts_dict = _get_common_volumes_and_volume_mounts(
        userdir_pvc,
        project_dir,
        pipeline_path,
        container_project_dir=sbinds.get("/project-dir", _config.PROJECT_DIR),
        container_data_dir=sbinds.get("/data", _config.DATA_DIR),
    )
    # Can be later extended into adding a Mount for every "custom" key,
    # e.g. key != data and key != project_directory.
    if "/data" in sbinds:
        volume_mounts.append(volume_mounts_dict["data"])
    if "/project-dir" in sbinds:
        volume_mounts.append(volume_mounts_dict["project-dir"])
    if "/data" in sbinds or "/project-dir" in sbinds:
        volumes.append(volumes_dict["userdir-pvc"])

    # To support orchest environments as services.
    image = service_config["image"]
    prefix = _config.ENVIRONMENT_AS_SERVICE_PREFIX
    if image.startswith(prefix):
        # Need to reference the ip because the local docker engine will
        # run the container, and if the image is missing it will prompt
        # a pull which will fail because the FQDN can't be resolved by
        # the local engine on the node. K8S_TODO: fix this.
        registry_ip = k8s_core_api.read_namespaced_service(
            _config.REGISTRY, _config.ORCHEST_NAMESPACE
        ).spec.cluster_ip

        image = image.replace(prefix, "")
        image = img_mappings[image]
        image = registry_ip + "/" + image

    metadata = {
        "name": service_config["name"] + "-" + session_uuid,
        "labels": {
            "app": service_config["name"],
            "project_uuid": project_uuid,
            "session_uuid": session_uuid,
        },
    }

    deployment_manifest = {
        "apiVersion": "apps/v1",
        "kind": "Deployment",
        "metadata": metadata,
        "spec": {
            "replicas": 1,
            "selector": {"matchLabels": metadata["labels"]},
            "template": {
                "metadata": metadata,
                "spec": {
                    "terminationGracePeriodSeconds": 5,
                    "securityContext": {
                        "runAsUser": 0,
                        "runAsGroup": int(os.environ.get("ORCHEST_HOST_GID")),
                        "fsGroup": int(os.environ.get("ORCHEST_HOST_GID")),
                    },
                    "resources": {
                        "requests": {
                            "cpu": _config.USER_CONTAINERS_CPU_SHARES
                        }
                    },
                    "volumes": volumes,
                    "containers": [
                        {
                            "name": metadata["name"],
                            "image": image,
                            "imagePullPolicy": "IfNotPresent",
                            "env": env,
                            "volumeMounts": volume_mounts,
                            "ports": [
                                {"containerPort": port}
                                for port in service_config["ports"]
                            ],
                        }
                    ],
                },
            },
        },
    }

    # K8S doesn't like empty commands.
    if service_config.get("command", ""):
        deployment_manifest["spec"]["template"]["spec"]["containers"][0][
            "command"
        ] = [service_config["command"]]

    if "args" in service_config:
        deployment_manifest["spec"]["template"]["spec"]["containers"][0][
            "args"
        ] = shlex.split(service_config["args"])

    service_manifest = {
        "apiVersion": "v1",
        "kind": "Service",
        "metadata": metadata,
        "spec": {
            "type": "ClusterIP",
            "selector": metadata["labels"],
            "ports": [{"port": port} for port in service_config["ports"]],
        },
    }

    if service_config["exposed"]:
        ingress_paths = []
        for port in service_config.get("ports", []):
            ingress_paths.append(
                {
                    "backend": {
                        "service": {
                            "name": metadata["name"],
                            "port": {"number": port},
                        }
                    },
                    "path": f"/({ingress_url}_{port}.*)"
                    if is_pbp_enabled
                    else f"/{ingress_url}_{port}(/|$)(.*)",
                    "pathType": "Prefix",
                }
            )

        ingress_metadata = copy.deepcopy(metadata)
        # Decide the rewrite target based on pbp.
        ingress_metadata["annotations"] = {
            "nginx.ingress.kubernetes.io/rewrite-target": "/$1"
            if is_pbp_enabled
            else "/$2",
        }

        if service_config.get("requires_authentication", True):
            # Needs to be the FQDN since the ingress nginx pod lives in
            # a different namespace.
            auth_url = (
                f"http://auth-server.{_config.ORCHEST_NAMESPACE}.svc.cluster.local/auth"
            )
            ingress_metadata["annotations"][
                "nginx.ingress.kubernetes.io/auth-url"
            ] = auth_url
            ingress_metadata["annotations"][
                "nginx.ingress.kubernetes.io/auth-signin"
            ] = "/login"

        ingress_rule = {}
        if _config.ORCHEST_FQDN is not None:
            ingress_rule["host"] = _config.ORCHEST_FQDN
        ingress_rule["http"] = {"paths": ingress_paths}

        ingress_manifest = {
            "apiVersion": "networking.k8s.io/v1",
            "kind": "Ingress",
            "metadata": ingress_metadata,
            "spec": {
                "ingressClassName": "nginx",
                "rules": [ingress_rule],
            },
        }
    else:
        ingress_manifest = None

    return deployment_manifest, service_manifest, ingress_manifest
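# The manifests returned above are plain dicts, so applying them could look
# roughly like the sketch below using the official kubernetes Python client.
# The helper name and the namespace value are illustrative assumptions; the
# actual session code may create these resources differently.
from kubernetes import client, config


def _deploy_user_service_sketch(deployment, service, ingress, namespace="orchest"):
    config.load_incluster_config()
    client.AppsV1Api().create_namespaced_deployment(namespace, deployment)
    client.CoreV1Api().create_namespaced_service(namespace, service)
    # The ingress manifest is None when the service is not exposed.
    if ingress is not None:
        client.NetworkingV1Api().create_namespaced_ingress(namespace, ingress)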
def _get_jupyter_enterprise_gateway_deployment_service_manifest(
    session_uuid: str,
    session_config: SessionConfig,
    session_type: SessionType,
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    project_uuid = session_config["project_uuid"]
    pipeline_uuid = session_config["pipeline_uuid"]
    pipeline_path = session_config["pipeline_path"]
    project_dir = session_config["project_dir"]
    userdir_pvc = session_config["userdir_pvc"]
    session_type = session_type.value

    metadata = {
        "name": f"jupyter-eg-{session_uuid}",
        "labels": {
            "app": "jupyter-eg",
            "project_uuid": project_uuid,
            "session_uuid": session_uuid,
        },
    }

    # Get user environment variables to pass to Jupyter kernels.
    try:
        user_defined_env_vars = utils.get_proj_pip_env_variables(
            project_uuid, pipeline_uuid
        )
        # NOTE: Don't allow users to change the `PATH` as it could
        # break user code execution. The `PATH` var is removed when
        # starting kernels through the jupyter-EG as well.
        user_defined_env_vars.pop("PATH", None)
    except Exception:
        user_defined_env_vars = {}

    process_env_whitelist = [
        "ORCHEST_PIPELINE_UUID",
        "ORCHEST_PIPELINE_PATH",
        "ORCHEST_PROJECT_UUID",
        "ORCHEST_USERDIR_PVC",
        "ORCHEST_PROJECT_DIR",
        "ORCHEST_PIPELINE_FILE",
        "ORCHEST_HOST_GID",
        "ORCHEST_SESSION_UUID",
        "ORCHEST_SESSION_TYPE",
        "ORCHEST_GPU_ENABLED_INSTANCE",
        "ORCHEST_REGISTRY",
    ]
    process_env_whitelist.extend(list(user_defined_env_vars.keys()))
    process_env_whitelist = ",".join(process_env_whitelist)

    # Need to reference the ip because the local docker engine will run
    # the container, and if the image is missing it will prompt a pull
    # which will fail because the FQDN can't be resolved by the local
    # engine on the node. K8S_TODO: fix this.
    registry_ip = k8s_core_api.read_namespaced_service(
        _config.REGISTRY, _config.ORCHEST_NAMESPACE
    ).spec.cluster_ip

    environment = {
        "EG_MIRROR_WORKING_DIRS": "True",
        "EG_LIST_KERNELS": "True",
        "EG_KERNEL_WHITELIST": "[]",
        "EG_PROHIBITED_UIDS": "[]",
        "EG_UNAUTHORIZED_USERS": '["dummy"]',
        "EG_UID_BLACKLIST": '["-1"]',
        "EG_ALLOW_ORIGIN": "*",
        "EG_BASE_URL": f"/jupyter-server-{session_uuid}",
        # This is because images might need to be pulled on the node
        # and we aren't using a daemon or similar to pull images on the
        # node. See kernel-image-puller (KIP) for such an example.
        "EG_KERNEL_LAUNCH_TIMEOUT": "600",
        "EG_ENV_PROCESS_WHITELIST": process_env_whitelist,
        # Note: the docs say to use a string, but the script in charge
        # of launching the kernel will expect an integer and fail!
        "EG_LOG_LEVEL": "10",
        # "All kernels reside in the EG namespace if true, otherwise
        # KERNEL_NAMESPACE must be provided or one will be created for
        # each kernel."
        "EG_NAMESPACE": _config.ORCHEST_NAMESPACE,
        "EG_SHARED_NAMESPACE": "True",
        "ORCHEST_PIPELINE_UUID": pipeline_uuid,
        "ORCHEST_PIPELINE_PATH": _config.PIPELINE_FILE,
        "ORCHEST_PROJECT_UUID": project_uuid,
        "ORCHEST_USERDIR_PVC": userdir_pvc,
        "ORCHEST_PROJECT_DIR": project_dir,
        "ORCHEST_PIPELINE_FILE": pipeline_path,
        "ORCHEST_HOST_GID": os.environ.get("ORCHEST_HOST_GID"),
        "ORCHEST_SESSION_UUID": session_uuid,
        "ORCHEST_SESSION_TYPE": session_type,
        "ORCHEST_GPU_ENABLED_INSTANCE": str(CONFIG_CLASS.GPU_ENABLED_INSTANCE),
        "ORCHEST_REGISTRY": registry_ip,
    }
    environment = [{"name": k, "value": v} for k, v in environment.items()]
    user_defined_env_vars = [
        {"name": key, "value": value}
        for key, value in user_defined_env_vars.items()
    ]
    environment.extend(user_defined_env_vars)

    volumes_dict, volume_mounts_dict = _get_jupyter_volumes_and_volume_mounts(
        project_uuid, userdir_pvc, project_dir, pipeline_path
    )

    deployment_manifest = {
        "apiVersion": "apps/v1",
        "kind": "Deployment",
        "metadata": metadata,
        "spec": {
            "replicas": 1,
            "selector": {"matchLabels": metadata["labels"]},
            "template": {
                "metadata": metadata,
                "spec": {
                    "securityContext": {
                        "runAsUser": 0,
                        "runAsGroup": int(os.environ.get("ORCHEST_HOST_GID")),
                        "fsGroup": int(os.environ.get("ORCHEST_HOST_GID")),
                    },
                    "serviceAccount": f"jupyter-eg-sa-{session_uuid}",
                    "serviceAccountName": f"jupyter-eg-sa-{session_uuid}",
                    "terminationGracePeriodSeconds": 5,
                    "resources": {
                        "requests": {
                            "cpu": _config.USER_CONTAINERS_CPU_SHARES
                        }
                    },
                    "volumes": [
                        volumes_dict["userdir-pvc"],
                    ],
                    "containers": [
                        {
                            "name": metadata["name"],
                            "image": (
                                "orchest/jupyter-enterprise-gateway:"
                                + CONFIG_CLASS.ORCHEST_VERSION
                            ),
                            "imagePullPolicy": "IfNotPresent",
                            "env": environment,
                            "volumeMounts": [
                                volume_mounts_dict["kernelspec"],
                            ],
                            "ports": [{"containerPort": 8888}],
                        }
                    ],
                },
            },
        },
    }

    service_manifest = {
        "apiVersion": "v1",
        "kind": "Service",
        "metadata": metadata,
        "spec": {
            "type": "ClusterIP",
            "selector": metadata["labels"],
            "ports": [{"port": 8888}],
        },
    }

    return deployment_manifest, service_manifest