def _add_tasks_info(self, b):
    run = self.run
    reused = sum(
        tr.is_reused and not tr.is_skipped_as_not_required for tr in run.task_runs
    )
    optimizations = []
    if reused:
        optimizations.append(
            "There are {completed} reused tasks.".format(completed=reused)
        )
    task_skipped_as_not_required = sum(
        tr.is_reused and tr.is_skipped_as_not_required for tr in run.task_runs
    )
    if task_skipped_as_not_required:
        optimizations.append(
            "{skipped} tasks are not required by any uncompleted task "
            "that is essential for your root task.".format(
                skipped=task_skipped_as_not_required
            )
        )

    show_more = is_verbose()
    task_runs = run.task_runs
    if not show_more:
        # show only non-system task runs
        task_runs = run.get_task_runs(without_system=True)
        if not task_runs:
            # we have only system task runs
            task_runs = run.task_runs

    states = Counter(tr.task_run_state for tr in task_runs if tr.task_run_state)
    b.column("TOTAL TASKS", len(task_runs))
    tasks = [(k.value, v) for k, v in states.items()]
    b.column_properties("STATES", tasks)
    if optimizations:
        b.column("RUN OPTIMIZATION", " ".join(optimizations))
    return b

def get_project_git():
    global _project_git_version

    if is_defined(_project_git_version):
        return _project_git_version

    _project_git_version = get_git_commit(project_path(), verbose=is_verbose())
    return _project_git_version

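# A minimal sketch of the sentinel-based memoization get_project_git() appears
# to rely on. NOTHING, is_defined, and the initial value below are assumptions
# for illustration, not necessarily the actual dbnd internals: a unique sentinel
# marks "not computed yet" so that None remains a valid cached result.

NOTHING = object()  # sentinel: distinguishes "never computed" from None
_project_git_version = NOTHING


def is_defined(value):
    # True once the cached value has been assigned at least once
    return value is not NOTHING
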
def banner(
    self,
    msg,
    color=None,
    verbose=False,
    print_task_band=False,
    task_run=None,
    exc_info=None,
):
    try:
        b = TextBanner(msg, color)
        if verbose or is_verbose():
            verbosity = FormatterVerbosity.HIGH
        else:
            verbosity = FormatterVerbosity.NORMAL

        builder = _TaskBannerBuilder(
            task=self.task,
            banner=b,
            verbosity=verbosity,
            print_task_band=print_task_band,
        )
        return builder.build_banner(
            task_run=task_run, exc_info=exc_info
        ).get_banner_str()
    except Exception as ex:
        log_exception(
            "Failed to calculate banner for '%s'" % self.task_id,
            ex,
            non_critical=True,
        )
        return msg + (" (task_id=%s)" % self.task_id)

def dbnd_log_debug(msg, *args, **kwargs):
    try:
        if is_verbose():
            logger.info(msg, *args, **kwargs)
        else:
            logger.debug(msg, *args, **kwargs)
    except Exception:
        print("Failed to print dbnd debug message")

def dbnd_log_info_error(msg, *args, **kwargs):
    """We show the full exception only in verbose mode."""
    try:
        if is_verbose():
            logger.exception(msg, *args, **kwargs)
        else:
            logger.info(msg, *args, **kwargs)
    except Exception:
        print("Failed to print dbnd error message")

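# Hedged usage sketch for the two logging helpers above; failing_operation()
# is hypothetical, the helpers are the ones defined here.

try:
    failing_operation()
except Exception:
    # inside an except block: verbose mode logs the full traceback
    # (logger.exception), otherwise only a one-line info message
    dbnd_log_info_error("Failed to run the operation")

# logged at INFO level in verbose mode, DEBUG otherwise
dbnd_log_debug("processed %s rows", 42)
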
def _handle_tracking_error(msg):
    error_msg = (
        "Failed during dbnd %s, ignoring, and continuing without tracking" % msg
    )
    if is_verbose():
        logger.warning(error_msg, exc_info=True)
    else:
        logger.info(error_msg)

def _handle_dynamic_error(msg, func_call):
    if is_verbose():
        logger.warning(
            "Failed during dbnd %s for %s, ignoring, and continuing without tracking/orchestration"
            % (msg, func_call.task_cls),
            exc_info=True,
        )
    else:
        logger.info(
            "Failed during dbnd %s for %s, ignoring, and continuing without tracking"
            % (msg, func_call.task_cls)
        )

def _handle_tracking_error(msg, func_call=None):
    log_exception_to_server()
    # location already carries its own " for ..." prefix, so the format
    # string must not repeat "for"
    location = " for %s" % func_call.callable if func_call else ""
    msg = "Failed during dbnd %s%s, ignoring, and continuing without tracking" % (
        msg,
        location,
    )
    if is_verbose():
        logger.warning(msg, exc_info=True)
    else:
        logger.info(msg)

def _run(
    self,
    kube_client: client.CoreV1Api,
    resource_version,
    worker_uuid,
    kube_config: Configuration,
):
    from kubernetes import watch

    watcher = watch.Watch()
    request_timeout = self.kube_dbnd.engine_config.watcher_request_timeout_seconds
    kwargs = {
        "label_selector": "airflow-worker={}".format(worker_uuid),
        # (connect, read) timeout pair for the underlying urllib3 request
        "_request_timeout": (request_timeout, request_timeout),
        "timeout_seconds": self.kube_dbnd.engine_config.watcher_client_timeout_seconds,
    }
    if resource_version:
        kwargs["resource_version"] = resource_version
    if kube_config.kube_client_request_args:
        for key, value in kube_config.kube_client_request_args.items():
            kwargs[key] = value

    for event in watcher.stream(
        kube_client.list_namespaced_pod, self.namespace, **kwargs
    ):
        try:
            # DBND PATCH: we want to process the message ourselves
            task = event["object"]
            self.log.debug(
                "%s had an event of type %s", task.metadata.name, event["type"]
            )
            if event["type"] == "ERROR":
                return self.process_error(event)
            self._extended_process_state(event)
            self.resource_version = task.metadata.resource_version
        except Exception as e:
            msg = "Event: Exception raised on specific event: %s, Exception: %s" % (
                event,
                e,
            )
            if is_verbose():
                self.log.exception(msg)
            else:
                self.log.warning(msg)
    return self.resource_version

@contextmanager  # assumed decorator: the yield inside try/finally implies a context manager
def _create_temp_working_dir(tmp_build_dir=None):
    clean_build_dir = False
    try:
        if not tmp_build_dir:
            tmp_build_dir = mkdtemp(prefix="dbnd-build-")
            clean_build_dir = True
        yield tmp_build_dir
    finally:
        if clean_build_dir:
            if is_verbose():
                # do not clean the build dir in verbose mode
                logger.info("Keeping build dir because verbose mode is on")
            else:
                logger.info("Deleting tmp directory: %s", tmp_build_dir)
                shutil.rmtree(tmp_build_dir, ignore_errors=True)

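# Hedged usage sketch for the context manager above (assuming the
# @contextmanager decorator, as noted): the directory is created lazily and
# removed on exit, unless verbose mode is on or an explicit tmp_build_dir
# was passed in (in which case the caller owns cleanup).

with _create_temp_working_dir() as build_dir:
    logger.info("Building artifacts under %s", build_dir)
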
def banner(
    self,
    msg,
    color=None,
    verbose=False,
    print_task_band=False,
    task_run=None,
    exc_info=None,
):
    task_id = self.task.task_id
    try:
        # saving the banner for testability
        self._banner = TextBanner(msg, color)

        if verbose or is_verbose():
            verbosity = FormatterVerbosity.HIGH
        else:
            verbosity = FormatterVerbosity.NORMAL

        builder = _TaskBannerBuilder(
            task=self.task,
            banner=self._banner,
            verbosity=verbosity,
            print_task_band=print_task_band,
        )

        # different banners for tracking, config, and orchestration tasks
        if TaskEssence.TRACKING.is_instance(self.task):
            builder.build_tracking_banner(task_run=task_run, exc_info=exc_info)
        elif TaskEssence.CONFIG.is_instance(self.task):
            builder.build_config_banner()
        else:
            builder.build_orchestration_banner(task_run=task_run, exc_info=exc_info)

        return self._banner.get_banner_str()
    except Exception as ex:
        log_exception(
            "Failed to calculate banner for '%s'" % task_id, ex, non_critical=True
        )
        return msg + (" (task_id=%s)" % task_id)

def log_exception(msg, ex, logger_=None, verbose=None, non_critical=False):
    logger_ = logger_ or logger
    from dbnd._core.errors.base import DatabandError

    if verbose is None:
        verbose = is_verbose()

    if verbose:
        # just show the full exception
        logger_.exception(msg)
        return

    if non_critical:
        logger_.info(msg + ": %s" % str(ex))
        return

    if isinstance(ex, DatabandError):
        logger_.error(msg + ": %s" % str(ex))
    else:
        # should we? let's show the exception for now so we can debug
        logger_.exception(msg)

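# Hedged usage sketch for log_exception() above, illustrating its branches:
# verbose -> full traceback; non_critical -> one-line info; DatabandError ->
# one-line error; anything else -> full traceback. parse_config() is
# hypothetical.

try:
    parse_config()
except Exception as ex:
    log_exception("Failed to parse config", ex, non_critical=True)
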
def __init__(self):
    super(ConsoleStore, self).__init__()
    self.max_log_value_len = 50
    self.verbose = is_verbose()
    self.ascii_graph = Pyasciigraph()

def log_pod_events_on_sigterm(stack_frame):
    print_stack_trace(stack_frame)
    print_driver_events()
    print_cpu_memory_usage()
    if is_verbose():
        print_dmesg()

def build_pod(
    self,
    task_run: TaskRun,
    cmds: List[str],
    args: Optional[List[str]] = None,
    labels: Optional[Dict[str, str]] = None,
    try_number: Optional[int] = None,
    include_system_secrets: bool = False,
) -> k8s.V1Pod:
    if not self.container_tag:
        raise DatabandConfigError(
            "Your container tag is None, please check your configuration",
            help_msg="Container tag should be assigned",
        )

    pod_name = self.get_pod_name(task_run=task_run, try_number=try_number)
    image = self.full_image

    labels = combine_mappings(labels, self.labels)
    labels["pod_name"] = pod_name
    labels["dbnd_run_uid"] = task_run.run.run_uid
    labels["dbnd_task_run_uid"] = task_run.task_run_uid
    labels["dbnd_task_run_attempt_uid"] = task_run.task_run_attempt_uid
    labels["dbnd_task_family"] = task_run.task.task_definition.full_task_family_short
    labels["dbnd_task_name"] = task_run.task.task_name
    labels["dbnd_task_af_id"] = task_run.task_af_id

    # for easier pod deletion (kubectl delete pod -l dbnd=task_run -n <my_namespace>)
    if task_run.task.task_is_system:
        labels["dbnd"] = "dbnd_system_task_run"
    else:
        labels["dbnd"] = "task_run"

    # we need to be sure that the values meet the DNS label names RFC
    # https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#dns-label-names
    labels = {
        label_name: clean_label_name_dns1123(str(label_value))
        for label_name, label_value in six.iteritems(labels)
    }
    if is_verbose():
        logger.info("Build pod with kubernetes labels {}".format(labels))

    annotations = self.annotations.copy()
    if self.gcp_service_account_keys:
        annotations[
            "iam.cloud.google.com/service-account"
        ] = self.gcp_service_account_keys
    annotations["dbnd_tracker"] = task_run.task_tracker_url

    from dbnd_docker.kubernetes.vendorized_airflow.dbnd_extended_resources import (
        DbndExtendedResources,
    )

    resources = DbndExtendedResources(
        requests=self.requests,
        limits=self.limits,
        request_memory=self.request_memory,
        request_cpu=self.request_cpu,
        limit_memory=self.limit_memory,
        limit_cpu=self.limit_cpu,
    )

    env_vars = {
        ENV_DBND_POD_NAME: pod_name,
        ENV_DBND_POD_NAMESPACE: self.namespace,
        ENV_DBND_USER: task_run.task_run_env.user,
        ENV_DBND__ENV_IMAGE: image,
        ENV_DBND_ENV: task_run.run.env.task_name,
        ENV_DBND__ENV_MACHINE: "%s at %s" % (pod_name, self.namespace),
    }
    if AIRFLOW_VERSION_2:
        env_vars[
            "AIRFLOW__CORE__TASK_RUNNER"
        ] = "dbnd_airflow.compat.dbnd_task_runner.DbndStandardTaskRunner"
    if self.auto_remove:
        env_vars[ENV_DBND_AUTO_REMOVE_POD] = "True"
    env_vars[self._params.get_param_env_key(self, "in_cluster")] = "True"
    env_vars["AIRFLOW__KUBERNETES__IN_CLUSTER"] = "True"
    env_vars[
        "DBND__RUN_INFO__SOURCE_VERSION"
    ] = task_run.run.context.task_run_env.user_code_version
    env_vars["AIRFLOW__KUBERNETES__DAGS_IN_IMAGE"] = "True"
    if not get_dbnd_project_config().is_tracking_mode():
        env_vars[ENV_DBND__TRACKING] = "False"

    # we want all subsequent runs to be able to use the image from our configuration
    env_vars.update(
        self._params.to_env_map(self, "container_repository", "container_tag")
    )
    env_vars.update(self.env_vars)
    env_vars.update(task_run.run.get_context_spawn_env())

    secrets = self.get_secrets(include_system_secrets=include_system_secrets)

    if self.trap_exit_file_flag:
        args = [
            textwrap.dedent(
                """
                trap "touch {trap_file}" EXIT
                {command}
                """.format(
                    trap_file=self.trap_exit_file_flag,
                    command=subprocess.list2cmdline(cmds),
                )
            )
        ]
        # we update the cmd now
        cmds = ["/bin/bash", "-c"]

    if self.debug_with_command:
        logger.warning(
            "%s replacing pod %s command with '%s', original command=`%s`",
            task_run,
            pod_name,
            self.debug_with_command,
            subprocess.list2cmdline(cmds),
        )
        cmds = shlex.split(self.debug_with_command)

    base_pod = self._build_base_pod()

    pod = self._to_real_pod(
        cmds=cmds,
        args=args,
        namespace=self.namespace,
        name=pod_name,
        envs=env_vars,
        image=image,
        labels=labels,
        secrets=secrets,
        resources=resources,
        annotations=annotations,
    )

    final_pod = reconcile_pods(base_pod, pod)
    return final_pod

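# Hedged illustration of the trap_exit_file_flag rewrite in build_pod() above:
# for cmds=["python", "my_task.py"] and trap file "/tmp/pod_exit" (both
# hypothetical), the container effectively runs
#
#   /bin/bash -c 'trap "touch /tmp/pod_exit" EXIT
#                 python my_task.py'
#
# so a sidecar can detect that the main process has exited by watching for
# the trap file.
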
def __init__(self):
    super(ConsoleStore, self).__init__()
    self.verbose = is_verbose()

def __init__(self, *args, **kwargs):
    super(ConsoleStore, self).__init__(*args, **kwargs)
    self.max_log_value_len = 50
    self.verbose = is_verbose()
    self.ascii_graph = Pyasciigraph()
    self._is_in_airflow_tracking_mode = in_airflow_tracking_mode()

def run_airflow_dag(self, dag, session=None):
    # type: (DAG, Session) -> None
    af_dag = dag
    databand_run = self.run
    databand_context = databand_run.context
    execution_date = databand_run.execution_date
    s = databand_context.settings  # type: DatabandSettings
    s_run = s.run  # type: RunConfig

    run_id = s_run.id
    if not run_id:
        # we need this name, otherwise Airflow will try to manage our local jobs
        # at the scheduler (zombie cleanup and so on)
        run_id = "backfill_{0}_{1}".format(
            databand_run.name, databand_run.execution_date.isoformat()
        )

    if self.airflow_config.disable_db_ping_on_connect:
        from airflow import settings as airflow_settings

        try:
            remove_listener_by_name(
                airflow_settings.engine, "engine_connect", "ping_connection"
            )
        except Exception as ex:
            logger.warning("Failed to optimize DB access: %s" % ex)

    if isinstance(self.airflow_task_executor, InProcessExecutor):
        heartrate = 0
    else:
        # we are in parallel mode
        heartrate = airflow_conf.getfloat("scheduler", "JOB_HEARTBEAT_SEC")

    # amount of time in seconds to wait, when the limit on maximum active dag
    # runs (max_active_runs) has been reached, before trying to execute a dag
    # run again
    delay_on_limit = 1.0

    self._pickle_dag_and_save_pickle_id_for_versioned(af_dag, session=session)
    af_dag.sync_to_db(session=session)

    # create the relevant TaskInstances, so SingleDagRunJob can run them
    create_dagrun_from_dbnd_run(
        databand_run=databand_run,
        dag=af_dag,
        run_id=run_id,
        execution_date=execution_date,
        session=session,
        state=State.RUNNING,
        external_trigger=False,
    )

    self.airflow_task_executor.fail_fast = s_run.fail_fast
    # we don't want to be stopped by zombie jobs/tasks
    airflow_conf.set("core", "dag_concurrency", str(10000))
    airflow_conf.set("core", "max_active_runs_per_dag", str(10000))

    job = SingleDagRunJob(
        dag=af_dag,
        execution_date=databand_run.execution_date,
        mark_success=s_run.mark_success,
        executor=self.airflow_task_executor,
        donot_pickle=(
            s_run.donot_pickle or airflow_conf.getboolean("core", "donot_pickle")
        ),
        ignore_first_depends_on_past=s_run.ignore_first_depends_on_past,
        ignore_task_deps=s_run.ignore_dependencies,
        fail_fast=s_run.fail_fast,
        pool=s_run.pool,
        delay_on_limit_secs=delay_on_limit,
        verbose=s.system.verbose,
        heartrate=heartrate,
        airflow_config=self.airflow_config,
    )

    # we need SingleDagRunJob to be available from "internal" functions
    # because of the ti_state_manager use
    from dbnd._core.current import is_verbose

    with SingleDagRunJob.new_context(
        _context=job, allow_override=True, verbose=is_verbose()
    ):
        job.run()

def _extended_process_state(self, event):
    """Check more types of pod events than the stock Airflow watcher does."""
    pod_data = event["object"]
    pod_id = pod_data.metadata.name
    phase = pod_data.status.phase
    resource_version = pod_data.metadata.resource_version
    labels = pod_data.metadata.labels
    task_id = labels.get("task_id")

    event_msg = "Event from %s(%s)" % (pod_id, task_id)
    try:
        try_num = int(labels.get("try_number", "1"))
        if try_num > 1:
            event_msg += " (try %s)" % try_num
    except ValueError:
        pass

    _fail_event = get_tuple_for_watcher_queue(
        pod_id, self.namespace, State.FAILED, labels, resource_version
    )

    # print only if the user defined a debug phase
    debug_phase = self.kube_dbnd.engine_config.debug_phase
    if is_verbose() or (debug_phase and phase == debug_phase):
        self.log.info(
            "Event verbose: %s %s %s: %s",
            pod_id,
            event_msg,
            event.get("type"),
            event.get("raw_object"),
        )

    if event.get("type") == "DELETED" and phase not in {"Succeeded", "Failed"}:
        # from Airflow 2.0 -> k8s may delete pods (preemption?)
        self.log.info(
            "%s: pod has been deleted: phase=%s deletion_timestamp=%s",
            event_msg,
            phase,
            pod_data.metadata.deletion_timestamp,
        )
        self.watcher_queue.put(_fail_event)
    elif pod_data.metadata.deletion_timestamp:
        self.log.info(
            "%s: pod is being deleted: phase=%s deletion_timestamp=%s",
            event_msg,
            phase,
            pod_data.metadata.deletion_timestamp,
        )
        self.watcher_queue.put(_fail_event)
    elif phase == "Pending":
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(
            pod_id, namespace=pod_data.metadata.namespace
        )
        try:
            # for now we only fail; the scheduler code will use the same
            # logic to try to rerun
            pod_ctrl.check_deploy_errors(pod_data)
            self.log.info("%s: pod is Pending", event_msg)
        except Exception as ex:
            self.log.info("Event: %s Pending: failing with %s", pod_id, str(ex))
            self.watcher_queue.put(_fail_event)
    elif phase == "Running":
        pod_ctrl = self.kube_dbnd.get_pod_ctrl(
            pod_id, namespace=pod_data.metadata.namespace
        )
        try:
            # for now we only fail; the scheduler code will use the same
            # logic to try to rerun
            pod_ctrl.check_running_errors(pod_data)
            self.log.info("%s: pod is Running", event_msg)
            self.watcher_queue.put(
                get_tuple_for_watcher_queue(
                    pod_id, self.namespace, State.RUNNING, labels, resource_version
                )
            )
        except Exception as ex:
            self.log.info("Event: %s Running: failing with %s", pod_id, str(ex))
            self.watcher_queue.put(_fail_event)
    elif phase == "Failed":
        self.log.info("%s: pod has Failed", event_msg)
        self.watcher_queue.put(_fail_event)
    elif phase == "Succeeded":
        self.log.info("%s: pod has Succeeded", event_msg)
        self.watcher_queue.put(
            get_tuple_for_watcher_queue(
                pod_id, self.namespace, None, labels, resource_version
            )
        )
    else:
        self.log.warning(
            "Event: Invalid state: %s on pod: %s with labels: %s with "
            "resource_version: %s",
            phase,
            pod_id,
            labels,
            resource_version,
        )