class QuboleConfig(SparkEngineConfig):
    """Qubole cloud engine for Apache Spark.

    Fixes two copy-paste leftovers from the Databricks config: the class
    docstring and the polling-interval help text referred to "databricks".
    """

    _conf__task_family = "qubole"
    cluster_type = SparkClusters.qubole

    cloud = parameter(default="AWS", description="cloud")
    api_url = parameter(default="https://us.qubole.com/api").help(
        "API URL without version. like:'https://<ENV>.qubole.com/api'"
    )[str]
    api_token = parameter.help("API key of qubole account")[str]
    cluster_label = parameter().help(
        "the label of the cluster to run the command on"
    )[str]
    status_polling_interval_seconds = parameter(default=10).help(
        "seconds to sleep between polling qubole for job status."
    )[int]
    show_spark_log = parameter(default=True).help(
        "if True full spark log will be printed."
    )[bool]
    qds_sdk_logging_level = parameter(default=logging.WARNING).help(
        "qubole sdk log level."
    )

    def get_spark_ctrl(self, task_run):
        # Imported lazily so the plugin is only required when Qubole is used.
        from dbnd_qubole.qubole import QuboleCtrl

        return QuboleCtrl(task_run=task_run)
class ApacheBeamPythonTask(_BeamTask):
    """Beam task whose pipeline is a Python application (a *.py file)."""

    # Path of the python application to submit; None until configured.
    py_file = parameter.help("The application that submitted as *.py file").none[str]
    # Extra options forwarded to the python command; empty list by default.
    py_options = parameter(empty_default=True)[List[str]]

    def _task_submit(self):
        # Delegate the actual submission to the Beam job controller's python runner.
        return self._get_job_ctrl().run_cmd_python(
            py_file=self.py_file, py_options=self.py_options
        )
class KubernetesEngineConfig(ContainerEngineConfig):
    """Engine config for running dbnd task pods on a Kubernetes cluster.

    Builds Airflow `Pod` requests from dbnd task runs, manages the kube client
    connection (in-cluster or via kubeconfig) and pod lifecycle helpers.
    """

    _conf__task_family = "kubernetes"

    cluster_context = parameter.none().help("Kubernetes cluster context")[str]
    config_file = parameter.none().help("Custom Kubernetes config file")[str]
    in_cluster = parameter(default=False)[bool]

    image_pull_policy = parameter.value(
        "IfNotPresent", description="Kubernetes image_pull_policy flag"
    )
    image_pull_secrets = parameter.none().help("Secret to use for image pull")[str]
    keep_finished_pods = parameter(default=False).help(
        "Don't delete pods on completion"
    )[bool]
    keep_failed_pods = parameter(default=False).help("Don't delete failed pods")[bool]

    namespace = parameter(default="default")[str]

    secrets = parameter(empty_default=True).help(
        "User secrets to be added to every created pod"
    )[List]
    system_secrets = parameter(empty_default=True).help(
        "System secrets (used by Databand Framework)"
    )[List]
    env_vars = parameter(empty_default=True)[Dict]

    node_selectors = parameter(empty_default=True)[Dict]
    annotations = parameter(empty_default=True)[Dict]
    pods_creation_batch_size = parameter.value(10)[int]
    service_account_name = parameter.none()[str]
    gcp_service_account_keys = parameter.none()[
        str
    ]  # it's actually dict, but KubeConf expects str
    affinity = parameter(empty_default=True)[Dict]
    tolerations = parameter(empty_default=True)[List]
    hostnetwork = parameter.value(False)
    configmaps = parameter(empty_default=True)[List[str]]

    volumes = parameter.none()[List[str]]
    volume_mounts = parameter.none()[List[str]]
    security_context = parameter.none()[List[str]]

    labels = parameter.none()[Dict]

    request_memory = parameter.none()[str]
    request_cpu = parameter.none()[str]
    limit_memory = parameter.none()[str]
    limit_cpu = parameter.none()[str]

    requests = parameter.none()[Dict]
    limits = parameter.none()[Dict]

    pod_exit_code_to_retry_count = parameter(empty_default=True).help(
        "Mapping between pod exit code to amount of pod retry attempts"
    )[Dict]
    pod_retry_delay = parameter.help(
        "The delay between each pod retry attempt in time delta format. 1m, 5s, 1h, etc."
    )[datetime.timedelta]
    retry_on_image_pull_error_count = parameter.help(
        "Describes the amount of retry attempts when a pod fails with "
        "'ErrImagePull'"
    ).default(0)[int]

    startup_timeout_seconds = parameter.value(120)
    show_pod_log = parameter(default=False).help(
        "When using this engine as the task_engine, run tasks sequentially and stream their logs"
    )[bool]
    # typo fix: was "Equalent"
    debug = parameter(default=False).help(
        "Equivalent to show_pod_log=True + show all debug information"
    )[bool]
    prefix_remote_log = parameter(default=True).help(
        "Adds [driver] or [<task_name>] prefix to logs streamed from Kubernetes to the local log"
    )
    check_unschedulable_condition = parameter(default=True).help(
        "Try to detect non-transient issues that prevent the pod from being scheduled and fail the run if needed"
    )
    check_cluster_resource_capacity = parameter(default=True).help(
        "When a pod can't be scheduled due to cpu or memory constraints, check if the constraints are possible to satisfy in the cluster"
    )

    startup_timeout = parameter(default="10m").help(
        "Time to wait for pod getting into Running state"
    )[datetime.timedelta]

    dashboard_url = parameter(default=None).help(
        "skeleton url to display as kubernetes dashboard"
    )[str]
    pod_log_url = parameter(default=None).help(
        "skeleton url to display logs of pods"
    )[str]

    pod_yaml = parameter(default="${DBND_LIB}/conf/kubernetes-pod.yaml").help(
        "Base yaml to use to run databand task/driver"
    )[str]

    trap_exit_file_flag = parameter(default=None).help("trap exit file")[str]
    auto_remove = parameter(
        default=False,
        description="Auto-removal of the pod when container has finished.",
    )[bool]
    detach_run = parameter(
        default=False, description="Submit run only, do not wait for it completion."
    )[bool]
    submit_termination_grace_period = parameter(
        description="timedelta to let the submitted pod enter a final state"
    )[datetime.timedelta]

    def _initialize(self):
        super(KubernetesEngineConfig, self)._initialize()

        if self.debug:
            logger.warning(
                "Running in debug mode, setting all k8s loggers to debug, waiting for every pod completion!"
            )
            import airflow.contrib.kubernetes

            set_module_logging_to_debug([dbnd_docker, airflow.contrib.kubernetes])
            # debug streams pod logs, which requires waiting for the pod
            self.detach_run = False
        if self.show_pod_log:
            logger.warning(
                "Showing pod logs at runtime, waiting for every pod completion!"
            )
            self.detach_run = False

        if self.auto_remove and not self.detach_run:
            # auto_remove only makes sense when we don't wait for the pod ourselves
            logger.warning(
                "Can't auto remove pod if not running from detach_run=True mode, "
                "switching to auto_remove=False"
            )
            self.auto_remove = False

    def get_docker_ctrl(self, task_run):
        from dbnd_docker.kubernetes.kubernetes_task_run_ctrl import (
            KubernetesTaskRunCtrl,
        )

        return KubernetesTaskRunCtrl(task_run=task_run)

    def submit_to_engine_task(self, env, task_name, args, interactive=True):
        """Wrap `args` in a DockerRunTask to be executed on this engine."""
        docker_engine = self
        if not interactive:
            # non-interactive submissions don't wait for completion and clean up after themselves
            docker_engine = docker_engine.clone(auto_remove=True, detach_run=True)
        return DockerRunTask(
            task_name=task_name,
            command=subprocess.list2cmdline(args),
            image=self.full_image,
            docker_engine=docker_engine,
            task_is_system=True,
        )

    def cleanup_after_run(self):
        """Best-effort deletion of our own pod when auto-removal was requested."""
        # this run was submitted by task_run_async - we need to cleanup ourself
        if not environ_enabled(ENV_DBND_AUTO_REMOVE_POD):
            return
        if ENV_DBND_POD_NAME in environ and ENV_DBND_POD_NAMESPACE in environ:
            try:
                # message fix: was "Auto deleteing pod as accordingly to"
                logger.warning(
                    "Auto deleting pod according to '%s' env variable"
                    % ENV_DBND_AUTO_REMOVE_POD
                )
                kube_dbnd = self.build_kube_dbnd()
                kube_dbnd.delete_pod(
                    name=environ[ENV_DBND_POD_NAME],
                    namespace=environ[ENV_DBND_POD_NAMESPACE],
                )
            except Exception as e:
                # deletion is best-effort; never fail the run on cleanup
                logger.warning("Tried to delete this pod but failed: %s" % e)
        else:
            logger.warning(
                "Auto deleting pod as set, but pod name and pod namespace is not defined"
            )

    def get_dashboard_link(self, pod):
        """Render `dashboard_url` skeleton for the given pod, or None."""
        if not self.dashboard_url:
            return None
        try:
            return self.dashboard_url.format(namespace=pod.namespace, pod=pod.name)
        except Exception:
            logger.exception(
                "Failed to generate dashboard url from %s" % self.dashboard_url
            )
            return None

    def get_pod_log_link(self, pod):
        """Render `pod_log_url` skeleton for the given pod, or None."""
        if not self.pod_log_url:
            return None
        try:
            return self.pod_log_url.format(
                namespace=pod.namespace,
                pod=pod.name,
                timestamp=datetime.datetime.now().isoformat(),
            )
        except Exception:
            logger.exception("Internal error on generating pod log url")
            return None

    def get_kube_client(self, in_cluster=None):
        """Build a CoreV1Api client, in-cluster or from kubeconfig."""
        from kubernetes import config, client

        if in_cluster is None:
            in_cluster = self.in_cluster
        if in_cluster:
            config.load_incluster_config()
        else:
            config.load_kube_config(
                config_file=self.config_file, context=self.cluster_context
            )

        if PY2:
            # For connect_get_namespaced_pod_exec
            from kubernetes.client import Configuration

            configuration = Configuration()
            configuration.assert_hostname = False
            Configuration.set_default(configuration)
        return client.CoreV1Api()

    def build_kube_dbnd(self, in_cluster=None):
        """Create a DbndKubernetesClient; raise a friendly error on bad config."""
        from dbnd_docker.kubernetes.kube_dbnd_client import DbndKubernetesClient
        from kubernetes.config import ConfigException

        try:
            kube_client = self.get_kube_client(in_cluster=in_cluster)
        except ConfigException as e:
            raise friendly_error.executor_k8s.failed_to_connect_to_cluster(
                self.in_cluster, e
            )

        kube_dbnd = DbndKubernetesClient(kube_client=kube_client, engine_config=self)
        return kube_dbnd

    def get_pod_name(self, task_run, try_number):
        # DNS-1123 safe name; suffix with the retry attempt when known
        pod_name = task_run.job_id__dns1123
        if try_number is not None:
            pod_name = "%s-%s" % (pod_name, try_number)
        return pod_name

    def build_pod(
        self,
        task_run,
        cmds,
        args=None,
        labels=None,
        try_number=None,
        include_system_secrets=False,
    ):
        # type: (TaskRun, List[str], Optional[List[str]], Optional[Dict[str,str]], Optional[int], bool) -> Pod
        """Assemble an Airflow Pod spec for this task run (labels, env, secrets, resources)."""
        pod_name = self.get_pod_name(task_run=task_run, try_number=try_number)

        image = self.full_image
        labels = combine_mappings(labels, self.labels)
        labels["dbnd_run_uid"] = clean_job_name_dns1123(str(task_run.run.run_uid))
        labels["dbnd_task_run_uid"] = clean_job_name_dns1123(str(task_run.task_run_uid))
        labels[
            "dbnd"
        ] = "task_run"  # for easier pod deletion (kubectl delete pod -l dbnd=task_run -n <my_namespace>)

        annotations = self.annotations.copy()
        if self.gcp_service_account_keys:
            annotations[
                "iam.cloud.google.com/service-account"
            ] = self.gcp_service_account_keys
        annotations["dbnd_tracker"] = task_run.task_tracker_url

        from dbnd_docker.kubernetes.dbnd_extended_resources import DbndExtendedResources

        resources = DbndExtendedResources(
            requests=self.requests,
            limits=self.limits,
            request_memory=self.request_memory,
            request_cpu=self.request_cpu,
            limit_memory=self.limit_memory,
            limit_cpu=self.limit_cpu,
        )
        env_vars = {
            ENV_DBND_POD_NAME: pod_name,
            ENV_DBND_POD_NAMESPACE: self.namespace,
            ENV_DBND_USER: task_run.task_run_env.user,
            ENV_DBND__ENV_IMAGE: image,
            ENV_DBND_ENV: task_run.run.env.task_name,
            ENV_DBND__ENV_MACHINE: "%s at %s" % (pod_name, self.namespace),
        }
        if self.auto_remove:
            env_vars[ENV_DBND_AUTO_REMOVE_POD] = "True"
        env_vars[self._params.get_param_env_key("in_cluster")] = "True"
        env_vars["AIRFLOW__KUBERNETES__IN_CLUSTER"] = "True"
        env_vars[
            "DBND__RUN_INFO__SOURCE_VERSION"
        ] = task_run.run.context.task_run_env.user_code_version
        # we want that all next runs will be able to use the image that we have in our configuration
        env_vars.update(
            self._params.to_env_map("container_repository", "container_tag")
        )

        env_vars.update(self.env_vars)
        env_vars.update(task_run.run.get_context_spawn_env())

        secrets = self.get_secrets(include_system_secrets=include_system_secrets)

        from airflow.contrib.kubernetes.pod import Pod

        if self.trap_exit_file_flag:
            args = [
                textwrap.dedent(
                    """
                    trap "touch {trap_file}" EXIT
                    {command}
                    """.format(
                        trap_file=self.trap_exit_file_flag,
                        command=subprocess.list2cmdline(cmds),
                    )
                )
            ]
            # we update cmd now
            cmds = ["/bin/bash", "-c"]

        if not self.container_tag:
            raise DatabandConfigError(
                "Your container tag is None, please check your configuration",
                help_msg="Container tag should be assigned",
            )

        pod = Pod(
            namespace=self.namespace,
            name=pod_name,
            envs=env_vars,
            image=image,
            cmds=cmds,
            args=args,
            labels=labels,
            image_pull_policy=self.image_pull_policy,
            image_pull_secrets=self.image_pull_secrets,
            secrets=secrets,
            service_account_name=self.service_account_name,
            volumes=self.volumes,
            volume_mounts=self.volume_mounts,
            annotations=annotations,
            node_selectors=self.node_selectors,
            affinity=self.affinity,
            tolerations=self.tolerations,
            security_context=self.security_context,
            configmaps=self.configmaps,
            hostnetwork=self.hostnetwork,
            resources=resources,
        )
        if self.pod_yaml:
            pod.pod_yaml = target(self.pod_yaml).read()

        return pod

    def get_secrets(self, include_system_secrets=True):
        """Defines any necessary secrets for the pod executor"""
        from airflow.contrib.kubernetes.secret import Secret

        result = []
        if include_system_secrets:
            secrets = self.system_secrets + self.secrets
        else:
            secrets = self.secrets
        for secret_data in secrets:
            result.append(
                Secret(
                    deploy_type=secret_data.get("type"),
                    deploy_target=secret_data.get("target"),
                    secret=secret_data.get("secret"),
                    key=secret_data.get("key"),
                )
            )
        return result

    def build_kube_pod_req(self, pod):
        """Convert a Pod spec into a raw kubernetes API request dict."""
        from dbnd_airflow.airflow_extensions.request_factory import (
            DbndPodRequestFactory,
        )

        self.apply_env_vars_to_pod(pod)

        kube_req_factory = DbndPodRequestFactory()
        if hasattr(pod, "pod_yaml"):
            kube_req_factory._yaml = pod.pod_yaml
        req = kube_req_factory.create(pod)
        return req

    def apply_env_vars_to_pod(self, pod):
        pod.envs["AIRFLOW__KUBERNETES__DAGS_IN_IMAGE"] = "True"
class ContainerEngineConfig(EngineConfig):
    """Base engine config for engines that run dbnd tasks inside a docker container."""

    # Container-based engines always submit the run instead of executing in-process.
    require_submit = True
    dbnd_executable = ["dbnd"]  # we should have 'dbnd' command installed in container

    container_repository = parameter(validator=NonEmptyString()).help(
        "Docker container registry"
    )[str]
    container_tag = parameter.none().help("Docker container tag")[VersionStr]
    container_tag_gpu = parameter.none().help("Docker container tag for GPU tasks")[
        VersionStr
    ]

    docker_build_tag = parameter.help("Auto build docker container tag").value(
        "dbnd_build"
    )
    docker_build = parameter(default=True).help(
        "Automatically build docker image. "
        "If container_repository is unset it will be taken (along with the tag) from the docker build settings"
    )[bool]
    docker_build_push = parameter(default=True).help(
        "If docker_build is enabled, controls whether the image is automatically pushed or not"
    )

    def get_docker_ctrl(self, task_run):
        # Hook for subclasses: return the docker controller for this task run.
        pass

    @property
    def full_image(self):
        # Full image reference, e.g. "<repository>:<tag>".
        return "{}:{}".format(self.container_repository, self.container_tag)

    def prepare_for_run(self, run):
        # type: (DatabandRun) -> None
        super(ContainerEngineConfig, self).prepare_for_run(run)

        from dbnd_docker.submit_ctrl import prepare_docker_for_executor

        # when we run at submitter - we need to update driver_engine - this one will be used to send job
        # when we run at driver - we update task config, it will be used by task
        # inside pod submission the fallback is always on task_engine
        prepare_docker_for_executor(run, self)

    def submit_to_engine_task(self, env, task_name, args, interactive=True):
        # Wrap the given command line in a DockerRunTask running on this engine's image.
        from dbnd_docker.docker.docker_task import DockerRunTask

        submit_task = DockerRunTask(
            task_name=task_name,
            command=subprocess.list2cmdline(args),
            image=self.full_image,
            docker_engine=self,
            task_is_system=True,
        )
        return submit_task

    def _should_wrap_with_submit_task(self, task_run):
        """
        We don't want to resubmit if it's dockerized run and we running with the same engine
        """
        from dbnd_docker.docker.docker_task import DockerRunTask

        if isinstance(task_run.task, DockerRunTask):
            # already a docker task on this very engine - no extra submit wrapper needed
            if task_run.task.docker_engine.task_name == self.task_name:
                return False
        return super(ContainerEngineConfig, self)._should_wrap_with_submit_task(
            task_run
        )