Example #1
    def __init__(self):
        # IF FALSE  - we will not modify decorated @task code
        self._disabled = environ_enabled(ENV_DBND__DISABLED, False)
        self.unit_test_mode = environ_enabled(ENV_DBND__UNITTEST_MODE)

        self.max_calls_per_run = environ_int(DBND_MAX_CALLS_PER_RUN,
                                             DEFAULT_MAX_CALLS_PER_RUN)

        self.shell_cmd_complete_mode = ENV_SHELL_COMPLETION in os.environ
        self.quiet_mode = (os.environ.pop(ENV_DBND_QUIET, None) is not None
                           or self.shell_cmd_complete_mode)

        self.is_no_modules = environ_enabled(ENV_DBND__NO_MODULES)
        self.disable_pluggy_entrypoint_loading = environ_enabled(
            ENV_DBND__DISABLE_PLUGGY_ENTRYPOINT_LOADING)
        self.is_sigquit_handler_on = environ_enabled(
            ENV_DBND__SHOW_STACK_ON_SIGQUIT)

        self._verbose = environ_enabled(ENV_DBND__VERBOSE)

        self._dbnd_tracking = environ_enabled(ENV_DBND__TRACKING)

        self._airflow_context = False
        self._inline_tracking = None

        self.disable_inline = False
        self.airflow_auto_tracking = environ_enabled(ENV_DBND__AUTO_TRACKING,
                                                     default=True)
Example #2
    def __init__(self):
        # IF FALSE  - we will not modify decorated @task code
        self._disabled = environ_enabled(ENV_DBND__DISABLED, False)
        self.unit_test_mode = environ_enabled(ENV_DBND__UNITTEST_MODE)

        self.max_calls_per_run = environ_int(DBND_MAX_CALLS_PER_RUN,
                                             DEFAULT_MAX_CALLS_PER_RUN)

        self.shell_cmd_complete_mode = ENV_SHELL_COMPLETION in os.environ
        self.quiet_mode = (os.environ.pop(ENV_DBND_QUIET, None) is not None
                           or self.shell_cmd_complete_mode)
        # an external process (e.g. the Airflow scheduler) can create a "wrapper run"
        # with only partial information; when we have a subprocess, only the nested
        # run has the actual details, so we "resubmit" such runs
        self.resubmit_run = (DBND_RESUBMIT_RUN in os.environ
                             and os.environ.pop(DBND_RESUBMIT_RUN) == "true")

        self.is_no_modules = environ_enabled(ENV_DBND__NO_MODULES)
        self.disable_pluggy_entrypoint_loading = environ_enabled(
            ENV_DBND__DISABLE_PLUGGY_ENTRYPOINT_LOADING)
        self.is_sigquit_handler_on = (
            environ_enabled(ENV_DBND__SHOW_STACK_ON_SIGQUIT)
            and not self.unit_test_mode)

        self._verbose = environ_enabled(ENV_DBND__VERBOSE)

        self._dbnd_tracking = environ_enabled(ENV_DBND__TRACKING, default=None)

        self._airflow_context = False
        self._inline_tracking = None

        self.disable_inline = False
        self.airflow_auto_tracking = environ_enabled(ENV_DBND__AUTO_TRACKING,
                                                     default=True)
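A hedged sketch of how these flags are typically driven from the environment. DbndProjectConfig is an assumed name for the class whose __init__ is shown above, and the "DBND__..." strings are the values the ENV_* constants are assumed to hold:

import os

os.environ["DBND__VERBOSE"] = "true"     # assumed value of ENV_DBND__VERBOSE
os.environ["DBND__NO_MODULES"] = "true"  # assumed value of ENV_DBND__NO_MODULES
project_config = DbndProjectConfig()     # assumed class name
assert project_config._verbose
assert project_config.is_no_modules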
Example #3
    def __init__(self, task_run):
        super(TaskRunLogManager, self).__init__(task_run)

        self.local_log_file = self.task_run.local_task_run_root.partition(
            name="%s.log" % task_run.attempt_number
        )  # type: FileTarget

        if environ_enabled("DBND__LOG_SPARK"):
            self.local_spark_log_file = self.task_run.local_task_run_root.partition(
                name="%s-spark.log" % task_run.attempt_number
            )  # type: FileTarget
        else:
            self.local_spark_log_file = None

        self.remote_log_file = None
        if not isinstance(self.task.task_env, LocalEnvConfig):
            self.remote_log_file = self.task_run.attempt_folder.partition(
                name=str(task_run.attempt_number),
                config=TargetConfig().as_file().txt,
                extension=".log",
            )  # type: FileTarget

        # file handler for task log
        # if set -> we are in the context of capturing
        self._log_task_run_into_file_active = False
Example #4
def get_dags_from_databand(custom_operator_class: Optional[type] = None):
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None
    from dbnd._core.errors.base import DatabandApiError, DatabandConnectionException

    try:
        # make sure system configs are loaded
        config.load_system_configs()
        if not config.get("core", "databand_url"):
            return {}

        default_retries = config.getint("scheduler", "default_retries")

        dags = DbndSchedulerDBDagsProvider(
            default_retries=default_retries,
            custom_operator_class=custom_operator_class).get_dags()

        if not in_quiet_mode():
            logger.info("providing %s dags from scheduled jobs" % len(dags))
        return {dag.dag_id: dag for dag in dags}
    except (DatabandConnectionException, DatabandApiError) as e:
        logger.error(str(e))
        raise e
    except Exception as e:
        logging.exception("Failed to get dags form databand server")
        raise e
Example #5
def detach_spark_logger(spark_log_file):
    if environ_enabled("DBND__LOG_SPARK"):
        try:
            log4j, spark_logger = try_get_spark_logger()
            if log4j is None:
                return

            spark_logger.removeAppender(spark_log_file.path)
        except Exception as task_ex:
            logger.warning("Failed to detach spark logger for log %s: %s",
                           spark_log_file, task_ex)
Example #6
def safe_tabulate(tabular_data, headers, **kwargs):
    terminal_columns, _ = get_terminal_size()
    # fancy_grid format has utf-8 characters (in corners of table)
    # cp1252 fails to encode that
    fancy_grid = not windows_compatible_mode and not environ_enabled(
        ENV_DBND__NO_TABLES
    )
    tablefmt = "fancy_grid" if fancy_grid else "grid"
    table = tabulate(tabular_data, headers=headers, tablefmt=tablefmt, **kwargs)
    if table and max(map(len, table.split("\n"))) >= terminal_columns:
        table = tabulate(tabular_data, headers=headers, tablefmt="plain", **kwargs)
    return table
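A minimal usage sketch for safe_tabulate; the rows and headers below are illustration data only, not part of the dbnd API:

rows = [["prepare_data", "success", 12.3], ["train_model", "failed", 4.1]]
print(safe_tabulate(rows, headers=["task", "state", "duration_s"]))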
Example #7
    def read_log_body(self):
        try:
            spark_log_file = (
                self.local_spark_log_file.path
                if environ_enabled("DBND__LOG_SPARK")
                else None
            )
            return read_dbnd_log_preview(self.local_log_file.path, spark_log_file)

        except Exception as ex:
            logger.warning(
                "Failed to read log (%s) for %s: %s",
                self.local_log_file.path,
                self.task,
                ex,
            )
            return None
Example #8
def get_dags():
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None
    from dbnd._core.errors.base import DatabandConnectionException, DatabandApiError

    try:
        # make sure system configs are loaded
        config.load_system_configs()
        dags = DbndSchedulerDBDagsProvider().get_dags()

        if not in_quiet_mode():
            logger.info("providing %s dags from scheduled jobs" % len(dags))
        return dags
    except (DatabandConnectionException, DatabandApiError) as e:
        logger.error(str(e))
    except Exception as e:
        raise e
Example #9
def attach_spark_logger(spark_log_file):
    if environ_enabled("DBND__LOG_SPARK"):
        try:
            log4j, spark_logger = try_get_spark_logger()
            if log4j is None:
                return

            pattern = "[%d] {%c,%C{1}} %p - %m%n"
            file_appender = log4j.FileAppender()

            file_appender.setFile(spark_log_file.path)
            file_appender.setName(spark_log_file.path)
            file_appender.setLayout(log4j.PatternLayout(pattern))
            file_appender.setThreshold(log4j.Priority.toPriority("INFO"))
            file_appender.activateOptions()
            spark_logger.addAppender(file_appender)
        except Exception as task_ex:
            logger.warning("Failed to attach spark logger for log %s: %s",
                           spark_log_file, task_ex)
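A hedged usage sketch of how the attach/detach helpers above might be paired around a Spark step; spark_log_file stands in for the FileTarget created in Example #3, and run_spark_job is a hypothetical placeholder for the task body:

attach_spark_logger(spark_log_file)
try:
    run_spark_job()  # hypothetical task body that produces Spark log output
finally:
    # always remove the appender, even if the job fails
    detach_spark_logger(spark_log_file)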
Example #10
    def cleanup_after_run(self):
        # this run was submitted by task_run_async - we need to clean up after ourselves
        if not environ_enabled(ENV_DBND_AUTO_REMOVE_POD):
            return
        if ENV_DBND_POD_NAME in environ and ENV_DBND_POD_NAMESPACE in environ:
            try:
                logger.warning(
                    "Auto deleting pod according to '%s' env variable" %
                    ENV_DBND_AUTO_REMOVE_POD)
                kube_dbnd = self.build_kube_dbnd()
                kube_dbnd.delete_pod(
                    name=environ[ENV_DBND_POD_NAME],
                    namespace=environ[ENV_DBND_POD_NAMESPACE],
                )
            except Exception as e:
                logger.warning("Tried to delete this pod but failed: %s" % e)
        else:
            logger.warning(
                "Auto delete pod is set, but pod name and pod namespace are not defined"
            )
Example #11
def get_dags_from_file():
    if environ_enabled(ENV_DBND_DISABLE_SCHEDULED_DAGS_LOAD):
        return None

    try:
        # make sure system configs are loaded
        config.load_system_configs()

        config_file = config.get("scheduler", "config_file")
        if not config_file:
            logger.info("No dags file has been defined at scheduler.config_file")
            return {}
        default_retries = config.getint("scheduler", "default_retries")
        active_by_default = config.getboolean("scheduler", "active_by_default")

        dags = DbndAirflowDagsProviderFromFile(
            config_file=config_file,
            active_by_default=active_by_default,
            default_retries=default_retries,
        ).get_dags()
        return {dag.dag_id: dag for dag in dags}
    except Exception as e:
        logging.exception("Failed to get dags from the file")
        raise e
Example #12
def should_fix_pyspark_imports():
    return environ_enabled(ENV_DBND_FIX_PYSPARK_IMPORTS)
Example #13
def spark_tracking_enabled():
    return environ_enabled("DBND__ENABLE__SPARK_CONTEXT_ENV")
Example #14
ENV_DBND__ENV_IMAGE = "DBND__ENV_IMAGE"
ENV_DBND__CORE__PLUGINS = "DBND__CORE__PLUGINS"

ENV_SHELL_COMPLETION = "_DBND_COMPLETE"

ENV_DBND_FIX_PYSPARK_IMPORTS = "DBND__FIX_PYSPARK_IMPORTS"
ENV_DBND__DISABLE_PLUGGY_ENTRYPOINT_LOADING = "DBND__DISABLE_PLUGGY_ENTRYPOINT_LOADING"

ENV_DBND__AUTO_TRACKING = "DBND__AUTO_TRACKING"

DEFAULT_MAX_CALLS_PER_RUN = 100

ENV_DBND_TRACKING_ATTEMPT_UID = "DBND__TRACKING_ATTEMPT_UID"
ENV_DBND_SCRIPT_NAME = "DBND__SCRIPT_NAME"

_DBND_DEBUG_INIT = environ_enabled(ENV_DBND__DEBUG_INIT)
_databand_package = relative_path(__file__, "..", "..")


def is_databand_enabled():
    return not get_dbnd_project_config().disabled


def disable_databand():
    get_dbnd_project_config().disabled = True


def set_dbnd_unit_test_mode():
    set_on(ENV_DBND__UNITTEST_MODE)  # bypass to subprocess
    get_dbnd_project_config().unit_test_mode = True
Example #15
def disable_airflow_subdag_tracking():
    return environ_enabled(
        environ_config.ENV_DBND__DISABLE_AIRFLOW_SUBDAG_TRACKING, False
    )
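For reference, a minimal sketch of the behavior these examples assume from environ_enabled and set_on; this is an illustration under those assumptions, not the dbnd implementation:

import os


def environ_enabled(variable_name, default=False):
    # assumed behavior: unset -> default; otherwise treat "true"/"1"/"yes" as enabled
    value = os.environ.get(variable_name)
    if value is None:
        return default
    return value.strip().lower() in ("true", "1", "yes")


def set_on(variable_name):
    # assumed behavior: set the flag so subprocesses inherit it via the environment
    os.environ[variable_name] = "true"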