def __init__(self):
    # IF FALSE - we will not modify decorated @task code
    self._disabled = environ_enabled(ENV_DBND__DISABLED, False)
    self.unit_test_mode = environ_enabled(ENV_DBND__UNITTEST_MODE)
    self.max_calls_per_run = environ_int(
        DBND_MAX_CALLS_PER_RUN, DEFAULT_MAX_CALLS_PER_RUN
    )

    self.shell_cmd_complete_mode = ENV_SHELL_COMPLETION in os.environ
    self.quiet_mode = (
        os.environ.pop(ENV_DBND_QUIET, None) is not None
        or self.shell_cmd_complete_mode
    )

    # an external process (e.g. the airflow scheduler) can create a "wrapper run"
    # with only partial information; when we have a subprocess, only the nested
    # run holds the actual details, so we "resubmit" such runs
    self.resubmit_run = (
        DBND_RESUBMIT_RUN in os.environ
        and os.environ.pop(DBND_RESUBMIT_RUN) == "true"
    )

    self.is_no_modules = environ_enabled(ENV_DBND__NO_MODULES)
    self.disable_pluggy_entrypoint_loading = environ_enabled(
        ENV_DBND__DISABLE_PLUGGY_ENTRYPOINT_LOADING
    )
    self.is_sigquit_handler_on = (
        environ_enabled(ENV_DBND__SHOW_STACK_ON_SIGQUIT) and not self.unit_test_mode
    )

    self._verbose = environ_enabled(ENV_DBND__VERBOSE)
    self._dbnd_tracking = environ_enabled(ENV_DBND__TRACKING, default=None)

    self._airflow_context = False
    self._inline_tracking = None

    self.disable_inline = False
    self.airflow_auto_tracking = environ_enabled(
        ENV_DBND__AUTO_TRACKING, default=True
    )
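# For reference, a minimal sketch of the environ_enabled/environ_int helpers the
# config above relies on. This is an assumption about their semantics (truthy
# strings such as "true"/"1"/"on"), not the actual dbnd implementation:
import os

def environ_enabled_sketch(variable_name, default=False):
    # Treat common truthy strings as "enabled"; fall back to default when unset.
    env_value = os.environ.get(variable_name, None)
    if env_value is None:
        return default
    return env_value.strip().lower() in ("true", "1", "on", "yes")

def environ_int_sketch(variable_name, default=None):
    # Parse an integer env var; fall back to default on missing or bad values.
    env_value = os.environ.get(variable_name, None)
    if env_value is None:
        return default
    try:
        return int(env_value)
    except ValueError:
        return default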
def __init__(self, task_run):
    super(TaskRunLogManager, self).__init__(task_run)
    self.local_log_file = self.task_run.local_task_run_root.partition(
        name="%s.log" % task_run.attempt_number
    )  # type: FileTarget

    if environ_enabled("DBND__LOG_SPARK"):
        self.local_spark_log_file = self.task_run.local_task_run_root.partition(
            name="%s-spark.log" % task_run.attempt_number
        )  # type: FileTarget
    else:
        self.local_spark_log_file = None

    self.remote_log_file = None
    if not isinstance(self.task.task_env, LocalEnvConfig):
        self.remote_log_file = self.task_run.attempt_folder.partition(
            name=str(task_run.attempt_number),
            config=TargetConfig().as_file().txt,
            extension=".log",
        )  # type: FileTarget

    # file handler for the task log;
    # if set -> we are in the context of capturing
    self._log_task_run_into_file_active = False
def get_dags_from_databand(custom_operator_class: Optional[type] = None):
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None

    from dbnd._core.errors.base import DatabandApiError, DatabandConnectionException

    try:
        # make sure configuration is loaded
        config.load_system_configs()
        if not config.get("core", "databand_url"):
            return {}

        default_retries = config.getint("scheduler", "default_retries")

        dags = DbndSchedulerDBDagsProvider(
            default_retries=default_retries,
            custom_operator_class=custom_operator_class,
        ).get_dags()

        if not in_quiet_mode():
            logger.info("providing %s dags from scheduled jobs" % len(dags))
        return {dag.dag_id: dag for dag in dags}
    except (DatabandConnectionException, DatabandApiError) as e:
        logger.error(str(e))
        raise e
    except Exception as e:
        logging.exception("Failed to get dags from databand server")
        raise e
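# A hedged usage sketch: a common way to expose provider DAGs to Airflow is to
# merge the returned mapping into a DAG file's module namespace. The file name
# "dbnd_dags.py" and the globals() pattern below are illustrative assumptions,
# not dbnd's documented API:
#
#   # dbnd_dags.py (placed in the Airflow dags folder)
#   dbnd_dags = get_dags_from_databand()
#   if dbnd_dags:
#       globals().update(dbnd_dags)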
def detach_spark_logger(spark_log_file):
    if environ_enabled("DBND__LOG_SPARK"):
        try:
            log4j, spark_logger = try_get_spark_logger()
            if log4j is None:
                return

            # the appender was registered under the log file path as its name
            spark_logger.removeAppender(spark_log_file.path)
        except Exception as task_ex:
            logger.warning(
                "Failed to detach spark logger for log %s: %s",
                spark_log_file,
                task_ex,
            )
def safe_tabulate(tabular_data, headers, **kwargs):
    terminal_columns, _ = get_terminal_size()
    # the fancy_grid format draws table corners with utf-8 characters,
    # which cp1252 (windows console) fails to encode
    fancy_grid = not windows_compatible_mode and not environ_enabled(
        ENV_DBND__NO_TABLES
    )
    tablefmt = "fancy_grid" if fancy_grid else "grid"
    table = tabulate(tabular_data, headers=headers, tablefmt=tablefmt, **kwargs)

    # fall back to the plain format if any rendered line overflows the terminal
    if table and max(map(len, table.split("\n"))) >= terminal_columns:
        table = tabulate(tabular_data, headers=headers, tablefmt="plain", **kwargs)
    return table
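# A small usage example (the sample rows and headers are illustrative):
#
#   rows = [["ingest", "success", 12.4], ["train", "failed", 98.1]]
#   print(safe_tabulate(rows, headers=["task", "state", "duration (s)"]))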
def read_log_body(self):
    try:
        spark_log_file = (
            self.local_spark_log_file.path
            if environ_enabled("DBND__LOG_SPARK")
            else None
        )
        return read_dbnd_log_preview(self.local_log_file.path, spark_log_file)
    except Exception as ex:
        logger.warning(
            "Failed to read log (%s) for %s: %s",
            self.local_log_file.path,
            self.task,
            ex,
        )
        return None
def get_dags():
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None

    from dbnd._core.errors.base import DatabandConnectionException, DatabandApiError

    try:
        # make sure configuration is loaded
        config.load_system_configs()

        dags = DbndSchedulerDBDagsProvider().get_dags()

        if not in_quiet_mode():
            logger.info("providing %s dags from scheduled jobs" % len(dags))
        return dags
    except (DatabandConnectionException, DatabandApiError) as e:
        logger.error(str(e))
    except Exception as e:
        raise e
def attach_spark_logger(spark_log_file):
    if environ_enabled("DBND__LOG_SPARK"):
        try:
            log4j, spark_logger = try_get_spark_logger()
            if log4j is None:
                return

            pattern = "[%d] {%c,%C{1}} %p - %m%n"

            file_appender = log4j.FileAppender()
            file_appender.setFile(spark_log_file.path)
            # register the appender under the log file path so it can be
            # looked up (and removed) by name in detach_spark_logger
            file_appender.setName(spark_log_file.path)
            file_appender.setLayout(log4j.PatternLayout(pattern))
            file_appender.setThreshold(log4j.Priority.toPriority("INFO"))
            file_appender.activateOptions()
            spark_logger.addAppender(file_appender)
        except Exception as task_ex:
            logger.warning(
                "Failed to attach spark logger for log %s: %s",
                spark_log_file,
                task_ex,
            )
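# A hedged sketch of how the attach/detach pair wraps a task body; try/finally
# guarantees the appender is removed even when the body raises. The names
# `spark_log_target` (standing in for a FileTarget, as used above) and
# `task_body` are illustrative assumptions:
def capture_spark_log_sketch(spark_log_target, task_body):
    attach_spark_logger(spark_log_target)
    try:
        return task_body()
    finally:
        detach_spark_logger(spark_log_target)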
def cleanup_after_run(self):
    # this run was submitted by task_run_async - we need to clean up after ourselves
    if not environ_enabled(ENV_DBND_AUTO_REMOVE_POD):
        return

    if ENV_DBND_POD_NAME in environ and ENV_DBND_POD_NAMESPACE in environ:
        try:
            logger.warning(
                "Auto deleting pod according to '%s' env variable"
                % ENV_DBND_AUTO_REMOVE_POD
            )
            kube_dbnd = self.build_kube_dbnd()
            kube_dbnd.delete_pod(
                name=environ[ENV_DBND_POD_NAME],
                namespace=environ[ENV_DBND_POD_NAMESPACE],
            )
        except Exception as e:
            logger.warning("Tried to delete this pod but failed: %s" % e)
    else:
        logger.warning(
            "Auto delete pod is set, but pod name and pod namespace are not defined"
        )
def get_dags_from_file():
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None

    try:
        # make sure configuration is loaded
        config.load_system_configs()

        config_file = config.get("scheduler", "config_file")
        if not config_file:
            logger.info("No dags file has been defined at scheduler.config_file")
            return {}
        default_retries = config.getint("scheduler", "default_retries")
        active_by_default = config.getboolean("scheduler", "active_by_default")

        dags = DbndAirflowDagsProviderFromFile(
            config_file=config_file,
            active_by_default=active_by_default,
            default_retries=default_retries,
        ).get_dags()
        return {dag.dag_id: dag for dag in dags}
    except Exception as e:
        logging.exception("Failed to get dags from the file")
        raise e
def should_fix_pyspark_imports():
    return environ_enabled(ENV_DBND_FIX_PYSPARK_IMPORTS)
def spark_tracking_enabled():
    return environ_enabled("DBND__ENABLE__SPARK_CONTEXT_ENV")
ENV_DBND__ENV_IMAGE = "DBND__ENV_IMAGE"
ENV_DBND__CORE__PLUGINS = "DBND__CORE__PLUGINS"
ENV_SHELL_COMPLETION = "_DBND_COMPLETE"

ENV_DBND_FIX_PYSPARK_IMPORTS = "DBND__FIX_PYSPARK_IMPORTS"

ENV_DBND__DISABLE_PLUGGY_ENTRYPOINT_LOADING = "DBND__DISABLE_PLUGGY_ENTRYPOINT_LOADING"
ENV_DBND__AUTO_TRACKING = "DBND__AUTO_TRACKING"

DEFAULT_MAX_CALLS_PER_RUN = 100

ENV_DBND_TRACKING_ATTEMPT_UID = "DBND__TRACKING_ATTEMPT_UID"
ENV_DBND_SCRIPT_NAME = "DBND__SCRIPT_NAME"

_DBND_DEBUG_INIT = environ_enabled(ENV_DBND__DEBUG_INIT)

_databand_package = relative_path(__file__, "..", "..")


def is_databand_enabled():
    return not get_dbnd_project_config().disabled


def disable_databand():
    get_dbnd_project_config().disabled = True


def set_dbnd_unit_test_mode():
    set_on(ENV_DBND__UNITTEST_MODE)  # bypass to subprocess
    get_dbnd_project_config().unit_test_mode = True
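# A hedged usage sketch: callers can guard optional tracking work behind the
# toggles above. `report_metrics` is a hypothetical user function, not dbnd API:
#
#   def run_pipeline():
#       if is_databand_enabled():
#           report_metrics()  # only when tracking has not been disabled
#
#   # in tests, make behavior deterministic for this process and subprocesses:
#   set_dbnd_unit_test_mode()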
def disable_airflow_subdag_tracking():
    return environ_enabled(
        environ_config.ENV_DBND__DISABLE_AIRFLOW_SUBDAG_TRACKING, False
    )