Example #1
class _BaseSparkTask(Task):
    _conf__task_type_name = TaskType.spark

    # will get it from env
    spark_config = parameter(
        config_path=from_task_env("spark_config"))[SparkConfig]
    spark_engine = parameter(config_path=from_task_env("spark_engine"))[
        SparkEngineConfig]  # type: Union[EmrConfig]

    python_script = None
    main_class = None

    def application_args(self):
        """
        'Arguments for the application being submitted'
        :return: list
        """
        return []

    def _get_spark_ctrl(self):
        # type: ()-> SparkCtrl
        return self.spark_engine.get_spark_ctrl(self.current_task_run)

    def _task_banner(self, banner, verbosity):
        b = banner

        b.new_section()
        try:
            spark_command_line = subprocess.list2cmdline(
                list_of_strings(self.application_args()))
            b.column("SPARK CMD LINE", spark_command_line)
        except Exception:
            logger.exception("Failed to get spark command line from %s" % self)
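A minimal sketch (not taken from this project) of how a concrete task built on _BaseSparkTask typically overrides application_args(), mirroring the PySparkTask/SparkTask examples further down. The class and parameter names are hypothetical, and the same Task/parameter imports as in the snippet above are assumed.

class WordCountSparkTask(_BaseSparkTask):
    text_path = parameter(description="input text file for the Spark job")[str]
    counters_path = parameter(description="where the job writes its word counts")[str]

    def application_args(self):
        # forwarded as-is to the submitted Spark application
        return [self.text_path, self.counters_path]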
Example #2
class EngineConfig(config.Config):
    """Databand's engine configuration (where tasks are executed)"""

    require_submit = parameter(
        description="Should the task engine be forced to submit tasks").value(
            False)

    dbnd_local_root = parameter(
        default=None,
        description="Local dbnd home directory at the engine environment"
    )[DirTarget]

    dbnd_executable = parameter(
        default=[sys.executable, "-m", "dbnd"],
        description="'dbnd' executable path at engine environment",
    )[typing.List[str]]

    def cleanup_after_run(self):
        pass

    def submit_to_engine_task(self, env, task_name, args, interactive=True):
        raise local_engine_not_accept_remote_jobs(self.env, self)

    def prepare_for_run(self, run):
        # type: (DatabandRun) -> None
        return

    def _should_wrap_with_submit_task(self, task_run):
        return self.require_submit
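A hypothetical sketch of a derived engine configuration, using the same parameter declaration patterns as EngineConfig above: it opts in to task submission and points dbnd_executable at a fixed virtualenv. RemoteEngineConfig, its task family, and the paths are illustrative only; the same imports (parameter, typing) as in the snippet are assumed.

class RemoteEngineConfig(EngineConfig):
    _conf__task_family = "remote_engine"

    require_submit = parameter(
        description="Always wrap tasks with a submit step on this engine"
    ).value(True)

    dbnd_executable = parameter(
        default=["/opt/venv/bin/python", "-m", "dbnd"],
        description="'dbnd' executable path inside the remote environment",
    )[typing.List[str]]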
Example #3
class OutputConfig(Config):
    """(Advanced) Databand's core task's output behaviour"""

    _conf__task_family = "output"

    path_task = parameter(description="default path for every Task")[str]
    path_prod_immutable_task = parameter(
        description=
        "format of the path to be used by Production Immutable tasks")[str]

    hdf_format = (parameter.choices([
        "table", "fixed"
    ]).help("Default format to save DataFrame to hdf").value("fixed"))

    deploy_id = parameter(
        default=VersionAlias.context_uid,
        description="deploy prefix to use for remote deployments",
    )[VersionStr]

    def get_value_target_config(self, value_type):
        # type: (Type) -> TargetConfig

        type_handler = get_value_type_of_type(value_type)
        for possible_option in [str(type_handler), type_handler.config_name]:
            config_value = config.get_config_value(
                section=self._conf__task_family, key=possible_option)
            if config_value:
                return parse_target_config(config_value.value)
        return file.pickle
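A hedged usage sketch for get_value_target_config(): resolving the on-disk format for a DataFrame-typed value. Here output_config stands for an OutputConfig instance obtained from dbnd settings (an assumption, not shown in the snippet), and pd is pandas.

import pandas as pd

target_config = output_config.get_value_target_config(pd.DataFrame)
# If no key under the [output] section matches the value type (neither the value-type
# string nor its config_name), the method falls back to pickled files (file.pickle).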
Example #4
class RunInfoConfig(config.Config):
    """(Advanced) Databand's run information gatherer"""

    _conf__task_family = "run_info"
    source_version = parameter(
        default="git", description="gather version control via git/None"
    )[VersionStr]
    user_data = parameter(default=None, description="UserData")[str]
    user = parameter(default=None, description="override user name with the value")[str]

    def build_task_run_info(self):
        task_run_env_uid = get_uuid()
        import dbnd

        logging.debug("Created new task run env with uid '%s'", task_run_env_uid)

        machine = environ.get(ENV_DBND__ENV_MACHINE, "")
        if environ.get(ENV_DBND__ENV_IMAGE, None):
            machine += " image=%s" % environ.get(ENV_DBND__ENV_IMAGE)
        return TaskRunEnvInfo(
            uid=task_run_env_uid,
            databand_version=dbnd.__version__,
            user_code_version=self.source_version,
            user_code_committed=True,
            cmd_line=subprocess.list2cmdline(sys.argv),
            user=self.user or dbnd_getuser(),
            machine=machine,
            project_root=project_path(),
            user_data=safe_string(self.user_data, max_value_len=500),
            heartbeat=utcnow(),
        )
Example #5
class GitConfig(config.Config):
    """Databand's git validator"""

    _conf__task_family = "git"

    enforce_clean = parameter(
        description="Enforce project's git to be clean. Can be overridden by allow_dirty or --git-allow-dirty"
    ).value(False)

    allow_dirty = parameter(
        description="Permit git to be dirty when enforce_clean or --git-enforce-clean is on"
    ).value(False)

    def validate_git_policy(self):
        if not self.enforce_clean:
            return

        if is_git_dirty():
            if self.allow_dirty:
                logger.warning("Runing with not commited files")
                return

            raise DatabandBuildError(
                help_msg="Git workspace must be clean."
                "\nYou see this message because enforce_clean in git section is enabled."
                "\nTo temporarily disable this message use --git-allow-dirty."
            )
Example #6
class SchedulerConfig(config.Config):
    """(Advanced) Databand's scheduler"""

    _conf__task_family = "scheduler"

    config_file = parameter(
        default=None,
        description="path to a file defining scheduled jobs to execute")[str]

    # by default the scheduler drop-in will decide whether to sync or not based on whether it's running inside the scheduler or not (and not the webserver)
    # the next two params can be used to force it one way or the other
    never_file_sync = parameter(
        default=False,
        description="disable syncing the scheduler config_file to the db"
    )[bool]

    always_file_sync = parameter(
        default=False,
        description="force syncing the scheduler config_file to the db")[bool]

    no_ui_cli_edit = parameter(
        default=False,
        description=
        "disables creating, editing and deleting scheduled jobs from the cli and ui. Scheduled job definitions will"
        "only be taken from the scheduler config file",
    )

    refresh_interval = parameter(
        default=1,
        description=
        "interval to refresh the scheduled job list (from both the db and/or config file)",
    )[int]

    active_by_default = parameter(
        default=True,
        description="whether new scheduled jobs will be active by default"
    )[bool]

    default_retries = parameter(
        description=
        "number of times to retry a failed run, unless set to a different value on the scheduled job"
    )[int]

    dbnd_user = parameter(
        description="user used to connect to the dbnd web server")[str]

    dbnd_password = parameter(
        description="password used to connect to the dbnd web server")[str]

    shell_cmd = parameter(
        description=
        "If shell_cmd is True, the specified command will be executed through the shell. "
        "This can be useful if you are using Python primarily "
        "for the enhanced control flow it offers "
        "over most system shells and still want convenient access to other shell features "
        "such as shell pipes, filename wildcards, environment variable expansion, "
        "and expansion of ~ to a user's home directory.")[bool]
Example #7
class DescribeConfig(Config):
    """(Advanced) Databand's --describe behaviour"""

    _conf__task_family = "describe"

    dry = parameter(
        default=False,
        description="Describe without pushing to databand-web")[bool]

    no_checks = parameter(
        default=False,
        description="Describe without doing copleteness and other checks"
    )[bool]
    no_tree = parameter(
        default=False, description="Describe without showing tasks tree")[bool]
Example #8
class PySparkTask(_BaseSparkTask):
    _conf__task_type_name = TaskType.pyspark

    python_script = parameter(
        description="The application that submitted as a job *.py file")[str]

    def _task_submit(self):
        return self._get_spark_ctrl().run_pyspark(
            pyspark_script=self.python_script)
Example #9
class DescribeConfig(Config):
    """(Advanced) Databand's --describe behaviour"""

    _conf__task_family = "describe"

    dry = parameter(
        default=False, description="Describe without pushing to databand-web"
    )[bool]

    no_checks = parameter(
        default=False, description="Describe without doing copleteness and other checks"
    )[bool]
    no_tree = parameter(
        default=False, description="Describe without showing tasks tree"
    )[bool]

    console_value_preview_size = parameter(
        description="Maximum length of string previewed in TaskVisualiser"
    )[int]
Example #10
class GitConfig(config.Config):
    """Databand's git validator"""

    _conf__task_family = "git"

    enforce_clean = parameter(
        description="Enforce project's git to be clean. Can be overridden by allow_dirty or --git-allow-dirty"
    ).value(False)

    allow_dirty = parameter(
        description="Permit git to be dirty when enforce_clean or --git-enforce-clean is on"
    ).value(False)

    def _raise_enforce_clean_error(self, msg):
        help_text = (
            "\nYou see this message because enforce_clean in git section is enabled."
            "\nTo temporarily disable this message use --git-allow-dirty."
        )
        raise DatabandBuildError(help_msg=msg + help_text)

    def validate_git_policy(self):
        if not self.enforce_clean:
            return

        is_dirty = is_git_dirty()

        if is_dirty is False:
            return

        if is_dirty:
            if self.allow_dirty:
                logger.warning("Runing with not commited files.")
            else:
                self._raise_enforce_clean_error("Git repo must be clean.")
        else:  # is_dirty is None
            if self.allow_dirty:
                logger.warning("Failed to get git status.")
            else:
                self._raise_enforce_clean_error("Failed to get git status.")
Example #11
def ingest_partner_data(
    data=parameter(log_histograms=True)[pd.DataFrame],
    name="customer",
    dedup_columns=None,
    columns_to_impute=None,
    pii_columns=None,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    pii_columns = pii_columns or ["name", "address", "phone"]
    dedup_columns = dedup_columns or ["phone"]
    columns_to_impute = columns_to_impute or ["10"]

    clean = clean_pii(data, pii_columns)

    enriched = enrich_missing_fields(clean, columns_to_impute)
    deduped = dedup_records(enriched, columns=dedup_columns)
    report = create_report(deduped)
    return report, deduped
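The same parameter(log_histograms=True)[pd.DataFrame] declaration style appears again in enrich_missing_fields below. As a minimal sketch of the pattern in isolation (the function name, metric name, and the @task decorator are assumptions, not taken from this project):

import pandas as pd
from dbnd import task, parameter, log_metric


@task
def count_customers(data=parameter(log_histograms=True)[pd.DataFrame]) -> int:
    # dbnd logs histograms for the DataFrame input; we add a simple row-count metric
    n = len(data.drop_duplicates())
    log_metric("unique customers", n)
    return n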
Example #12
class SparkTask(_BaseSparkTask):
    main_class = parameter(default=None,
                           description="The entry point for your application",
                           system=True)[str]

    def _task_submit(self):
        if not self.spark_config.main_jar:
            raise DatabandConfigError("main_jar is not configured for %s" %
                                      self)
        return self._get_spark_ctrl().run_spark(main_class=self.main_class)

    def run(self):
        # we don't actually have a run function except for inline
        #
        # we don't want to read params automatically
        # most likely our run function just creates a command line to run on remote compute
        raise DatabandBuildError(
            "You should not call or override run functions")
Example #13
def enrich_missing_fields(
    raw_data=parameter(log_histograms=True)[pd.DataFrame],
    columns_to_impute=None,
    columns_min_max_scaler=None,
    fill_with=0,
) -> pd.DataFrame:
    columns_to_impute = columns_to_impute or ["10"]
    columns_min_max_scaler = columns_min_max_scaler or []

    counter = int(raw_data[columns_to_impute].copy().isna().sum())
    noise = randint(-counter, counter)
    log_metric("Replaced NaNs",
               int(raw_data[columns_to_impute].copy().isna().sum()) + noise)
    raw_data[columns_to_impute] = raw_data[columns_to_impute].fillna(fill_with)

    for column_name in columns_min_max_scaler:
        scaler = preprocessing.MinMaxScaler()
        raw_data[column_name + "_norm"] = scaler.fit_transform(
            raw_data[[column_name]].values.astype(float))
    return raw_data
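A hedged usage sketch for the function above, called as plain Python with a tiny DataFrame (the data is illustrative; note that the snippet's default imputed column is literally named "10"):

import pandas as pd

df = pd.DataFrame({"10": [1.0, None, 3.0]})
enriched = enrich_missing_fields(df)
# NaNs in column "10" are replaced with 0, and the "Replaced NaNs" metric is logged
# (with random noise added, as in the code above).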
Example #14
class _AirflowRuntimeTask(Task):
    task_is_system = True
    _conf__track_source_code = False

    dag_id = parameter[str]
    execution_date = parameter[datetime.datetime]

    is_dag = parameter(system=True).value(False)

    def _initialize(self):
        super(_AirflowRuntimeTask, self)._initialize()
        self.ctrl.force_task_run_uid = TaskRunUidGen_TaskAfId(self.dag_id)
        self.task_meta.task_functional_call = ""

        self.task_meta.task_command_line = generate_airflow_cmd(
            dag_id=self.dag_id,
            task_id=self.task_id,
            execution_date=self.execution_date,
            is_root_task=self.is_dag,
        )
Example #15
class RunConfig(config.Config):
    """Databand's per run settings (e.g. execution date)"""

    _conf__task_family = "run"

    # if None, the run name is generated at DatabandRun
    name = parameter.value(default=None, description="Specify run name")[str]

    description = parameter.value(default=None, description="Specify run description")[
        Optional[str]
    ]

    parallel = parameter(default=None)[bool]
    task_executor_type = parameter(
        default=None,
        description="Alternate executor type: "
        " local/airflow_inprocess/airflow_multiprocess_local/airflow_kubernetes,"
        "  see docs for more options",
    )[str]

    submit_driver = parameter(
        description="override env.submit_driver for specific environment"
    ).none[bool]
    submit_tasks = parameter(
        description="override env.submit_tasks for specific environment"
    ).none[bool]

    enable_airflow_kubernetes = parameter(
        default=True,
        description="Enable use of kubernetes executor for kubebernetes engine submission",
    )[bool]

    execution_date = parameter(default=None, description="Override execution date")[
        datetime
    ]

    # Execution specific
    id = parameter(default=None, description="The list of task ids to run")[List[str]]
    selected_tasks_regex = parameter(
        default=None, description="Run only specified tasks (regular expresion)"
    )[List[str]]

    ignore_dependencies = parameter(
        description="Ignore task dependencies when running the tasks"
    ).value(False)
    ignore_first_depends_on_past = parameter(
        description="Ignore depends_on_past for the first set of tasks only"
    ).value(False)

    pool = parameter(default=None, description="Resource pool to use")[str]

    donot_pickle = parameter(
        description="Do not attempt to pickle the DAG object to send over "
        "to the workers, just tell the workers to run their version "
        "of the code."
    ).value(False)

    mark_success = parameter(
        description="Mark jobs as succeeded without running them"
    ).value(False)
    skip_completed = parameter(
        description="Skip tasks that have already been completed"
    ).value(True)
    fail_fast = parameter(
        description="Skip all remaining tasks if a task has failed"
    ).value(True)
    enable_prod = parameter(description="Enable production tasks").value(False)
    is_archived = parameter(description="Save this run in the archive").value(False)

    heartbeat_interval_s = parameter(
        description="How often a run should send a heartbeat to the server. Set -1 to disable"
    )[int]
    heartbeat_timeout_s = parameter(
        description="How old can a run's last heartbeat be before we consider it failed. Set -1 to disable"
    )[int]
    heartbeat_sender_log_to_file = parameter(
        description="create a separate log file for the heartbeat sender and don't log the run process stdout"
    )[bool]
    open_web_tracker_in_browser = parameter(
        description="If True, open web tracker in browser during task run."
    ).value(False)

    enable_concurent_sqlite = parameter(
        description="Enable concurrent execution with sqlite db (use only for debug!)"
    ).value(False)

    interactive = parameter(
        default=False,
        description="When submitting driver to remote execution keep tracking of submitted process and wait for completion",
    )[bool]

    skip_completed_on_run = parameter(default=True).help(
        "Should dbnd task check that task is completed and mark it as resued on task execution"
    )[bool]

    validate_task_inputs = parameter(default=True).help(
        "Should dbnd task check that all input files exist"
    )[bool]

    validate_task_outputs = parameter(default=True).help(
        "Should dbnd task check that all outputs exist after task has been executed"
    )[bool]

    validate_task_outputs_on_build = parameter(default=False).help(
        "Should dbnd task check that there are no incomplete outputs before task executes"
    )[bool]

    tracking_with_cache = parameter(default=False).help(
        "Should dbnd cache results during tracking"
    )[bool]

    pipeline_band_only_check = parameter(default=False).help(
        "When checking if pipeline is completed, check only if the band file exist (skip the tasks)"
    )[bool]

    task_complete_parallelism_level = parameter(default=1).help(
        "Number of threads to use when checking if tasks are already complete"
    )[int]

    dry = parameter(default=False).help(
        "Do not execute tasks, stop before sending them to the execution, and print their status"
    )[bool]
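A hedged sketch of how these flags are consumed: inside a task, RunConfig is reachable as self.settings.run, the same access path used by Task._complete (validate_task_outputs_on_build) and Task._should_run (enable_prod) in the Task examples further down. MyTask and its body are illustrative only, and the module-level logger from the snippets is assumed.

class MyTask(Task):
    def run(self):
        # read run-level settings through self.settings.run
        logger.info(
            "run name: %s, heartbeat every %ss",
            self.settings.run.name,
            self.settings.run.heartbeat_interval_s,
        )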
Example #16
class _BaseSparkTask(Task):
    _conf__task_type_name = TaskType.spark

    # will get it from env
    spark_config = parameter(
        config_path=from_task_env("spark_config"))[SparkConfig]
    spark_engine = parameter(config_path=from_task_env("spark_engine"))[
        SparkEngineConfig]  # type: Union[EmrConfig]

    python_script = None
    main_class = None

    spark_conf_extension = parameter(
        default={},
        description="This is an extension for SparkConfig.conf dict, "
        "every config added to this dict will be merged to spark_config.conf",
        significant=False,
    )[dict]

    spark_resources = parameter.c(default=None, system=True)[Dict[str,
                                                                  FileTarget]]

    def band(self):
        result = super(_BaseSparkTask, self).band()

        if self.spark_config.include_user_project:
            fat_wheel_task = fat_wheel_building_task(
                task_version=try_get_databand_context().current_context_uid,
                task_target_date="today",
            )
            self.spark_resources = {"user_project": fat_wheel_task}

        if self.spark_engine.disable_task_band:
            logger.debug("Task band is disabled due to disable_task_band flag")
            self.task_band = None

        return result

    def get_py_files(self):
        py_files = self.spark_config.py_files.copy()
        if self.spark_resources and "user_project" in self.spark_resources:
            project_files = self.spark_resources["user_project"].load(str)
            py_files.append(project_files)
        return py_files

    def application_args(self):
        """
        'Arguments for the application being submitted'
        :return: list
        """
        return []

    def _get_spark_ctrl(self):
        # type: ()-> SparkCtrl
        return self.spark_engine.get_spark_ctrl(self.current_task_run)

    def _task_banner(self, banner, verbosity):
        b = banner

        b.new_section()
        try:
            spark_command_line = subprocess.list2cmdline(
                list_of_strings(self.application_args()))
            b.column("SPARK CMD LINE", spark_command_line)
        except Exception:
            logger.exception("Failed to get spark command line from %s" % self)

    def get_root(self):
        return self.spark_engine.root or super(_BaseSparkTask, self).get_root()

    def _initialize(self):
        super(_BaseSparkTask, self)._initialize()

        if self.spark_conf_extension:
            # adds the last layer for SparkConfig.conf
            self.spark_config.conf.update(self.spark_conf_extension)
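A hedged usage sketch combining this class with the PySparkTask example above: an extra Spark conf entry is passed through spark_conf_extension, which _initialize() merges into spark_config.conf as the last layer. The script path and conf key/value are illustrative, and an active dbnd context is assumed.

task = PySparkTask(
    python_script="/path/to/word_count.py",
    spark_conf_extension={"spark.executor.memory": "4g"},
)
task.dbnd_run()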
Example #17
class LoggingConfig(config.Config):
    """Databand's logger configuration"""

    _conf__task_family = "log"
    disabled = parameter(description="Should logging be disabled").value(False)
    debug_log_config = parameter(
        description="Debug our logging configuration system").value(False)

    capture_stdout_stderr = parameter(
        description="Should logger retransmit all output wrtten to stdout/stderr"
    ).value(True)
    capture_task_run_log = parameter.help(
        "Capture task output into log").value(True)

    override_airflow_logging_on_task_run = parameter(
        description="Replace airflow logger with databand logger").value(True)
    support_jupyter = parameter(
        description="Support logging output to Jupiter UI").value(False)

    level = parameter(
        description="Logging level. DEBUG/INFO/WARN/ERROR").value("INFO")
    formatter = parameter(
        description="Log formatting string (logging library convention)")[str]
    formatter_colorlog = parameter(
        description="Log formatting string (logging library convention)")[str]
    formatter_simple = parameter(
        description="Log formatting string (logging library convention)")[str]

    console_formatter_name = parameter(
        description="The name of the formatter logging to console output")[str]
    file_formatter_name = parameter(
        description="The name of the formatter logging to file output")[str]

    # sentry config
    sentry_url = parameter(
        default=None,
        description=
        "URL for setting up sentry logger. Notice - make sure the url is exposed to dbnd run environment",
    )[str]
    sentry_env = parameter(default="dev",
                           description="Environment for sentry logger")[str]
    sentry_release = parameter(default="",
                               description="Release for sentry logger")[str]
    sentry_debug = parameter(default=False,
                             description="Unable debug flag for sentry")[bool]

    file_log = parameter(default=None,
                         description="Log to file (off by default)")[str]

    stream_stdout = parameter(
        description="Should databand'a logger stream stdout instead of stderr"
    ).value(False)

    custom_dict_config = parameter(
        default=None,
        description="Advanced: Customized logging configuration")[Callable]

    at_warn = parameter.help("name of loggers to put in WARNING mode").c[
        List[str]]
    at_debug = parameter.help("name of loggers to put in DEBUG mode").c[
        List[str]]

    exception_no_color = parameter(
        default=False,
        description="Do not use colors in exception handling")[bool]
    exception_simple = parameter(
        default=False, description="Simple mode of exception handling")[bool]

    send_body_to_server = parameter(
        default=True,
        description="Enable or disable sending log file to server.")[bool]

    preview_head_bytes = parameter(
        default=0,  # Disabled
        description=
        "Max head size of the log file, bytes to be sent to server.\n"
        "Default: 0KB.",
    )[int]

    preview_tail_bytes = parameter(
        default=0,  # Disabled
        description=
        "Max tail size of the log file, bytes to be sent to server.\n"
        "Default: 0KB.",
    )[int]

    remote_logging_disabled = parameter.help(
        "for tasks using a cloud environment, don't copy the task log to cloud storage"
    ).value(False)

    targets_log_level = parameter(
        default="DEBUG",
        description=
        "Should log the time it takes for marshalling and unmarshalling targets",
    )[str]

    disable_colors = parameter(default=False,
                               description="Disabling any colored logs.")

    sqlalchemy_print = parameter(
        description="enable sqlalchemy logger").value(False)
    sqlalchemy_trace = parameter(
        description="trace sqlalchemy queries").value(False)

    def _initialize(self):
        super(LoggingConfig, self)._initialize()
        self.task_log_file_formatter = None

    def format_exception_as_str(self, exc_info, isolate=True):
        if self.exception_simple:
            return format_exception_as_str(exc_info)

        try:
            from dbnd._vendor.tbvaccine import TBVaccine

            tbvaccine = TBVaccine(
                no_colors=self.exception_no_color,
                show_vars=False,
                skip_non_user_on_isolate=True,
                isolate=isolate,
            )
            return tbvaccine.format_tb(*exc_info)
        except Exception as ex:
            logger.info("Failed to format exception: %s", ex)
            return format_exception_as_str(exc_info)

    def get_dbnd_logging_config(self, filename=None):
        if self.custom_dict_config:
            if not in_quiet_mode():
                logger.info("Using user provided logging config")

            self.log_debug("Using log.custom_dict_config")
            return self.settings.log.custom_dict_config()

        return self.get_dbnd_logging_config_base(filename=filename)

    def get_dbnd_logging_config_base(self, filename=None):
        # type: (LoggingConfig, Optional[str]) -> Optional[dict]
        self.log_debug("Using log.get_dbnd_logging_config_base")
        log_settings = self
        log_level = log_settings.level
        # we want to have "real" output, so nothing can catch our handler
        # in contrast to what airflow is doing
        console_stream = (sys.__stdout__
                          if log_settings.stream_stdout else sys.__stderr__)

        if "ipykernel" in sys.modules and self.support_jupyter:
            # we can not use __stdout__ or __stderr__ as it will not be printed into the jupyter web UI
            # at the same time, using sys.stdout when airflow is active is very dangerous
            # as it can create a dangerous loop from airflow redirection into the root logger

            self.log_debug("ipykernel: checking on console_stream again")
            console_stream = sys.stdout if log_settings.stream_stdout else sys.stderr

        # dummy path, we will not write to this file
        task_file_handler_file = databand_system_path("logs", "task.log")

        self.log_debug("task_file_handler_file: %s", task_file_handler_file)
        setup_log_file(task_file_handler_file)

        config = {
            "version": 1,
            "disable_existing_loggers": False,
            "filters": {
                "task_context_filter": {
                    "()": "dbnd._core.log.logging_utils.TaskContextFilter"
                }
            },
            "formatters": {
                "formatter": {
                    "format": log_settings.formatter
                },
                "formatter_simple": {
                    "format": log_settings.formatter_simple
                },
                "formatter_colorlog": {
                    "()": "dbnd._vendor.colorlog.ColoredFormatter",
                    "format": log_settings.formatter_colorlog,
                    "reset": True,
                },
            },
            "handlers": {
                "console": {
                    "class": "logging.StreamHandler",
                    "stream": console_stream,
                    "formatter": log_settings.console_formatter_name,
                    "filters": ["task_context_filter"],
                }
            },
            "root": {
                "handlers": ["console"],
                "level": log_level
            },
        }
        if filename:
            setup_log_file(filename)
            config["handlers"]["file"] = {
                "class": "logging.FileHandler",
                "formatter": log_settings.file_formatter_name,
                "filename": filename,
                "encoding": "utf-8",
            }
            config["root"]["handlers"].append("file")

        loggers = config.setdefault("loggers", {})
        for logger_warn in log_settings.at_warn:
            loggers[logger_warn] = {
                "level": logging.WARNING,
                "propagate": True
            }

        for logger_debug in log_settings.at_debug:
            loggers[logger_debug] = {"level": logging.DEBUG, "propagate": True}

        if log_settings.sqlalchemy_print:
            loggers["sqlalchemy.engine"] = {
                "level": logging.INFO,
                "propagate": True
            }

        self.log_debug("Log config: %s", config)
        return config

    def configure_dbnd_logging(self):
        if self.disabled:
            self.log_debug("Log is disabled, skipping configure_dbnd_logging")
            return

        # start by trying to initiate Sentry setup - has side effect of changing the logging config
        self.log_debug("Initialize Sentry setup")
        try_init_sentry()

        if self.disable_colors:
            self.log_debug("Colors are disabled")
            self.disable_color_logs()

        dict_config = self.get_dbnd_logging_config(filename=self.file_log)

        airflow_task_log_handler = None
        if self.override_airflow_logging_on_task_run:
            airflow_task_log_handler = self.dbnd_override_airflow_logging_on_task_run(
            )
        try:
            self.log_debug("configure_logging_dictConfig: %s", dict_config)
            configure_logging_dictConfig(dict_config=dict_config)
        except Exception as e:
            # we print it this way, as it could be that now "logging" is down!
            print(
                "Failed to load reload logging configuration with dbnd settings! Exception: %s"
                % (e, ),
                file=sys.__stderr__,
            )
            raise
        if airflow_task_log_handler:
            self.log_debug(
                "logging.root.handlers.append(airflow_task_log_handler)")
            logging.root.handlers.append(airflow_task_log_handler)
        self.log_debug("Databand logging is up!")

    def dbnd_override_airflow_logging_on_task_run(self):
        # EXISTING STATE:
        # the root logger uses the Console handler -> prints to the current sys.stdout
        # on `airflow run` without interactive -> `redirect_stderr` is applied and redirects sys.stdout
        # into the `airflow.task` logger, which saves everything into a file.
        # EVERY output of the root logger goes through the CONSOLE handler into AIRFLOW.TASK without being printed to the screen

        self.log_debug("dbnd_override_airflow_logging_on_task_run")
        if not sys.stderr or not _safe_is_typeof(sys.stderr,
                                                 "StreamLogWriter"):

            self.log_debug(
                "Airflow logging is already replaced by dbnd stream log writer! sys.stderr=%s",
                sys.stderr,
            )
            return

        # NEW STATE
        # we will move the airflow.task file handler to the root level
        # we will set propagate
        # we will stop the redirect of airflow logging

        # this will disable stdout ,stderr redirection
        sys.stderr = sys.__stderr__
        sys.stdout = sys.__stdout__

        airflow_root_console_handler = find_handler(logging.root, "console")

        self.log_debug("airflow_root_console_handler:%s",
                       airflow_root_console_handler)
        if _safe_is_typeof(airflow_root_console_handler, "RedirectStdHandler"):
            # we are removing this console logger
            # this is the logger that is capable of creating a self loop
            # as it writes to the "latest" sys.stdout,
            # if you have stdout redirection into any logger that propagates into root
            # you get a very busy message loop that is really hard to debug

            self.log_debug("airflow_root_console_handler has been removed")
            logging.root.handlers.remove(airflow_root_console_handler)

        airflow_task_logger = logging.getLogger("airflow.task")

        self.log_debug("airflow_task_logger: %s", airflow_task_logger)
        airflow_task_log_handler = find_handler(airflow_task_logger, "task")
        if airflow_task_log_handler:
            self.log_debug("airflow_task_log_handler: %s",
                           airflow_task_log_handler)
            logging.root.handlers.append(airflow_task_log_handler)
            airflow_task_logger.propagate = True
            airflow_task_logger.handlers = []
        self.log_debug(
            "dbnd_override_airflow_logging_on_task_run logging.root: %s",
            logging.root)
        return airflow_task_log_handler

    def get_task_log_file_handler(self, log_file):
        if not self.task_log_file_formatter:
            config = self.get_dbnd_logging_config()
            configurator = DictConfigurator(config)
            file_formatter_config = configurator.config.get("formatters").get(
                self.file_formatter_name)
            self.task_log_file_formatter = configurator.configure_formatter(
                file_formatter_config)

        # "formatter": log_settings.file_formatter,
        log_file = str(log_file)
        setup_log_file(log_file)
        handler = logging.FileHandler(filename=log_file, encoding="utf-8")
        handler.setFormatter(self.task_log_file_formatter)
        handler.setLevel(self.level)
        return handler

    def disable_color_logs(self):
        """Removes colors from any console related config"""
        logger.debug("disabling color logs")

        os.environ[
            "ANSI_COLORS_DISABLED"] = "True"  # disabling termcolor.colored
        self.exception_no_color = True
        if self.console_formatter_name == "formatter_colorlog":
            self.console_formatter_name = "formatter_simple"

    def log_debug(self, msg, *args):
        if not self.debug_log_config:
            if not self.disabled:
                # we don't want to print ANYTHING if we are disabled
                logger.debug(msg, *args)
            return

        try:
            # we print to stderr as well in case logging is broken
            print("DEBUG_LOG_CONFIG:" + msg % args, file=sys.__stderr__)
            logger.info("DEBUG_LOG_CONFIG:" + msg, *args)
        except Exception:
            pass
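A hedged usage sketch for format_exception_as_str(): formatting the active exception, with the simple formatter used when exception_simple is enabled or when the vendored TBVaccine formatter fails. Here log_config stands for an initialized LoggingConfig instance (an assumption, e.g. obtained from dbnd settings).

import sys

try:
    1 / 0
except ZeroDivisionError:
    print(log_config.format_exception_as_str(sys.exc_info(), isolate=False))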
Example #18
class Task(_TaskWithParams, _TaskCtrlMixin, _TaskParamContainer):
    """
    This is the base class of all dbnd Tasks, the base unit of work in databand.

    A dbnd Task describes a unit of work.

    A ``run`` method must be present in a subclass

    Each ``parameter`` of the Task should be declared as members::

        class MyTask(dbnd.Task):
            count = dbnd.parameter[int]
            second_param = dbnd.parameter[str]
    """

    _conf_confirm_on_kill_msg = None  # get user confirmation on task kill if not empty
    _conf__require_run_dump_file = False

    _task_band_result = output(default=None, system=True)
    _meta_output = output(
        system=True,
        output_name="meta",
        output_ext="",
        target_config=folder,
        significant=False,
        description="Location of all internal outputs (e.g. metrics)",
    )
    task_band = output.json(output_name="band", system=True)

    task_enabled = system_passthrough_param(default=True)[bool]
    task_enabled_in_prod = system_passthrough_param(default=True)[bool]
    validate_no_extra_params = ParamValidation.error

    # for permanent bump of task version use Task.task_class_version
    task_version = parameter(
        default="1",
        description="task version, directly affects task signature ",
        scope=ParameterScope.children,
    )[VersionStr]

    task_class_version = parameter.value(
        default=DEFAULT_CLASS_VERSION,
        system=True,
        description="task code version, "
        "use while you want persistent change in your task version",
    )

    task_env = parameter.value(
        default="local",
        description="task environment name",
        scope=ParameterScope.children,
    )[EnvConfig]

    task_target_date = parameter(
        default="today",
        description="task data target date",
        scope=ParameterScope.children,
    )[datetime.date]

    task_airflow_op_kwargs = parameter.system(
        default=None, description="airflow operator kwargs"
    )[Dict[str, object]]

    task_config = parameter.system(empty_default=True)[Dict]
    task_is_system = parameter.system(default=False)[bool]

    task_in_memory_outputs = system_passthrough_param(
        default=False, description="Store all task outputs in memory"
    )[bool]

    task_output_path_format = system_passthrough_param(
        default=None, description="Format string used to generate task output paths"
    )[str]

    task_is_dynamic = system_passthrough_param(
        default=False,
        scope=ParameterScope.children,
        description="task was executed from within another task",
    )[bool]

    # for example: if task.run doesn't have access to databand, we can't run runtime tasks
    task_supports_dynamic_tasks = parameter.system(
        default=True, description="indicates if task can run dynamic databand tasks"
    )[bool]

    task_retries = parameter.system(
        default=0,
        description="Total number of attempts to run the task. So task_retries=3 -> task can fail 3 times before we give up",
    )[int]

    task_retry_delay = parameter.system(
        default="15s",
        description="timedelta to wait before retrying a task. Example: 5s",
    )[datetime.timedelta]

    task_essence = TaskEssence.ORCHESTRATION

    def __init__(self, **kwargs):
        super(Task, self).__init__(**kwargs)

        # used to communicate return value of "user function"
        self._dbnd_call_state = None  # type: Optional[TaskCallState]
        self.ctrl = TaskCtrl(self)

    def band(self):
        """
        Please, do not override this function only in Pipeline/External tasks!

        We do all wiring work in Meta classes only.
        Our implementation should never be coupled to code!
        """
        return

    def run(self):
        """
        The task run method, to be overridden in a subclass.

        See :ref:`Task.run`
        """
        pass  # default impl

    @property
    def task_outputs(self):
        """
        The output that this Task produces.

        The output of the Task determines if the Task needs to be run--the task
        is considered finished iff the outputs all exist.
        """
        return self.ctrl.relations.task_outputs_user

    @property
    def task_dag(self):
        # type: (...)->_TaskDagNode
        return self.ctrl.task_dag

    @property
    def descendants(self):
        return self.ctrl.descendants

    def _complete(self):
        """
        If the task has any outputs, return ``True`` if all outputs exist. Otherwise, return ``False``.

        However, you may freely override this method with custom logic.
        """
        # we check only user side task outputs
        # all system task outputs are not important (whether they exist or not)
        # users don't see them
        outputs = [
            o for o in flatten(self.task_outputs) if not o.config.overwrite_target
        ]
        if len(outputs) == 0:
            if not self.task_band:
                warnings.warn(
                    "Task %r without outputs has no custom complete() and no task band!"
                    % self,
                    stacklevel=2,
                )
                return False
            else:
                return self.task_band.exists()

        incomplete_outputs = [str(o) for o in outputs if not o.exists()]

        num_of_incomplete_outputs = len(incomplete_outputs)

        if 0 < num_of_incomplete_outputs < len(outputs):
            complete_outputs = [str(o) for o in outputs if o.exists()]
            exc = incomplete_output_found_for_task(
                self.task_name, complete_outputs, incomplete_outputs
            )

            if self.settings.run.validate_task_outputs_on_build:
                raise exc
            else:
                logger.warning(str(exc))

        return num_of_incomplete_outputs == 0

    @property
    def current_task_run(self):
        # type: ()->TaskRun
        return get_databand_run().get_task_run(self.task_id)

    def _output(self):
        """
        The default output that this Task produces.

        Use outputs! Override only if you are writing "base" class.
        """
        return NOTHING

    def _requires(self):
        """
        Override in "template" tasks which themselves are supposed to be subclassed.

        Must return an iterable which, among others, contains the _requires() of
        the superclass.
        """

    def _task_submit(self):
        """Task submission logic, by default we just call -> ``_task_run()`` -> ``run()``."""
        return self._task_run()

    def _task_run(self):
        # bring all relevant files
        self.current_task_run.sync_local.sync_pre_execute()
        param_values = self.task_params.get_param_values()

        with auto_load_save_params(
            task=self, auto_read=self._conf_auto_read_params, param_values=param_values
        ):
            result = self.run()

        self.current_task_run.sync_local.sync_post_execute()
        # publish all relevant files
        return result

    @property
    def tracker(self):
        return self.current_task_run.tracker

    @property
    def metrics(self):
        # backward compatible code
        return self.tracker

    def get_template_vars(self):
        # TODO: move to cached version, (after relations are built)
        base = {
            "task": self,
            "task_family": self.task_family,
            "task_name": self.task_name,
            "task_signature": self.task_signature,
            "task_id": self.task_id,
        }
        base.update(self._params.get_params_serialized(ParameterFilters.INPUTS))
        if self.task_target_date is None:
            base["task_target_date"] = "input"
        return base

    def on_kill(self):
        """
        Override this method to cleanup subprocesses when a task instance gets killed.

        Any use of the threading, subprocess or multiprocessing
        module within an operator needs to be cleaned up or it will leave
        ghost processes behind.
        """

    def _get_task_output_path_format(self, output_mode):
        """
        Defines the format string used to generate all task outputs.

        For example:
           {root}/{env_label}/{task_target_date}/{task_name}/{task_name}{task_class_version}_{task_signature}/{output_name}{output_ext}
        """
        if self.task_output_path_format:
            # explicit input - first priority
            return self.task_output_path_format
        if self._conf__base_output_path_fmt:
            # from class definition
            return self._conf__base_output_path_fmt

        # default behaviour
        if self.task_env.production and output_mode == OutputMode.prod_immutable:
            return self.settings.output.path_prod_immutable_task
        return self.settings.output.path_task

    def get_target(self, name, config=None, output_ext=None, output_mode=None):
        name = name or "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
        config = config or TargetConfig()
        path_pattern = self._get_task_output_path_format(output_mode)

        path = calculate_path(
            task=self,
            name=name,
            output_ext=output_ext,
            is_dir=config.folder,
            path_pattern=path_pattern,
        )

        return target(path, config=config)

    def get_root(self):
        return self.task_env.root

    def _initialize(self):
        super(Task, self)._initialize()
        self.ctrl._initialize_task()

    def _should_run(self):
        if not self.task_enabled:
            return False

        if self.task_env.production:
            return self.task_enabled_in_prod or self.settings.run.enable_prod

        return True

    def _save_param(self, parameter, original_value, current_value):
        # type: (ParameterDefinition, Any, Any) -> None
        # it's output! we are going to save it.
        # task run doesn't always exist
        task_run = try_get_current_task_run()
        access_status = DbndTargetOperationStatus.OK
        try:
            if isinstance(original_value, InMemoryTarget):
                parameter.value_type = get_value_type_of_obj(
                    current_value, parameter.value_type
                )

            parameter.dump_to_target(original_value, current_value)
            # it's a workaround: we don't want to change the parameter for outputs (dynamically)
            # however, we need the proper value type to "dump" the preview and other meta.
            # we will update it only for in-memory targets for now

        except Exception as ex:
            access_status = DbndTargetOperationStatus.NOK
            raise friendly_error.task_execution.failed_to_save_value_to_target(
                ex, self, parameter, original_value, current_value
            )
        finally:
            if task_run:
                try:
                    task_run.tracker.log_parameter_data(
                        parameter=parameter,
                        target=original_value,
                        value=current_value,
                        operation_type=DbndTargetOperationType.write,
                        operation_status=access_status,
                    )
                except Exception as ex:
                    logger.warning("Failed to log target to tracking store. %s", ex)

    @dbnd_handle_errors(exit_on_error=False)
    def dbnd_run(self):
        # type: (...)-> DatabandRun
        """Run task via Databand execution system."""
        # this code should be executed under context!
        from dbnd._core.current import get_databand_context

        ctx = get_databand_context()
        run = ctx.dbnd_run_task(self)
        return run
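A minimal sketch expanding the docstring above into a complete task: parameters are declared as class members, run() implements the work, and dbnd_run() launches it through the execution system. The class name and parameter values are illustrative, and the same imports as the snippets (Task, parameter, logger) plus an active dbnd context are assumed.

class MyCountTask(Task):
    count = parameter[int]
    second_param = parameter[str]

    def run(self):
        for i in range(self.count):
            logger.info("%s #%d", self.second_param, i)


MyCountTask(count=3, second_param="hello").dbnd_run()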
Example #19
File: env.py Project: cxz/dbnd
import logging

from dbnd._core.constants import ApacheBeamClusterType, CloudType, EnvLabel
from dbnd._core.errors import friendly_error
from dbnd._core.parameter.parameter_builder import parameter
from dbnd._core.parameter.parameter_definition import ParameterScope
from dbnd._core.task.config import Config
from targets import DirTarget

logger = logging.getLogger(__name__)

task_env_param = parameter(scope=ParameterScope.children)


class EnvConfig(Config):
    """Databand's environment configuration"""

    _conf__task_family = "env"
    cloud_type = parameter(description="cloud type: gcp/aws/")[str]

    env_label = parameter(
        default=EnvLabel.dev,
        description="environment type: dev/int/prod")[str]  # label

    production = parameter(
        description="indicates that environment is production").value(False)

    conn_id = parameter(default=None,
                        description="cloud connection settings")[str]

    # MAIN OUTPUT FOLDER
Example #20
File: task.py Project: lbtanh/dbnd
class Task(_BaseTask, _TaskParamContainer):
    """
    This is the base class of all dbnd Tasks, the base unit of work in databand.

    A dbnd Task describes a unit of work.

    The key methods of a Task, which must be implemented in a subclass are:

    * :py:meth:`run` - the computation done by this task.

    Each :py:class:`~dbnd.parameter` of the Task should be declared as members:

    .. code:: python

        class MyTask(dbnd.Task):
            count = dbnd.parameter[int]
            second_param = dbnd.parameter[str]

    In addition to any declared properties and methods, there are a few
    non-declared properties, which are created by the :py:class:`TaskMetaclass`
    metaclass:

    """
    """
        This value can be overriden to set the namespace that will be used.
        (See :ref:`Task.namespaces_famlies_and_ids`)
        If it's not specified and you try to read this value anyway, it will return
        garbage. Please use :py:meth:`get_task_namespace` to read the namespace.

        Note that setting this value with ``@property`` will not work, because this
        is a class level value.
    """

    _task_band_result = output(default=None, system=True)
    _meta_output = output(
        system=True,
        output_name="meta",
        output_ext="",
        target_config=folder,
        significant=False,
        description="Location of all internal outputs (e.g. metrics)",
    )
    task_band = output.json(output_name="band")

    task_enabled = parameter.system(scope=ParameterScope.children)[bool]
    task_enabled_in_prod = parameter.system(
        scope=ParameterScope.children)[bool]

    # for permanent bump of task version use Task.task_class_version
    task_version = parameter(
        description="task version, directly affects task signature ",
        scope=ParameterScope.children,
    )[VersionStr]

    task_class_version = parameter.value(
        default=DEFAULT_CLASS_VERSION,
        system=True,
        description="task code version, "
        "use while you want persistent change in your task version",
    )

    task_env = parameter.value(description="task environment name",
                               scope=ParameterScope.children)[EnvConfig]

    task_target_date = parameter(description="task data target date",
                                 scope=ParameterScope.children)[datetime.date]

    task_airflow_op_kwargs = parameter.system(
        default=None, description="airflow operator kwargs")[Dict[str, object]]

    task_config = parameter.system(empty_default=True)[Dict]
    task_is_system = parameter.system(default=False)[bool]

    task_in_memory_outputs = parameter.system(
        scope=ParameterScope.children,
        description="Store all task outputs in memory")[bool]
    task_is_dynamic = parameter.system(
        scope=ParameterScope.children,
        description="task was executed from within another task",
    )[bool]

    # for example: if task.run doesn't have access to databand, we can't run runtime tasks
    task_supports_dynamic_tasks = parameter.system(
        default=True,
        description="indicates if task can run dynamic databand tasks")[bool]

    task_retries = parameter.system(
        description=
        "Total number of attempts to run the task. So task_retries=3 -> task can fail 3 times before we give up"
    )[int]

    task_retry_delay = parameter.system(
        description="timedelta to wait before retrying a task. Example: 5s")[
            datetime.timedelta]

    _dbnd_call_state = None  # type: TaskCallState

    def __init__(self, **kwargs):
        super(Task, self).__init__(**kwargs)
        self.ctrl = TaskCtrl(self)

    def band(self):
        """
        Please, do not override this function only in Pipeline/External tasks! we do all wiring work in Meta classes only
        Our implementation should never be coupled to code!
        :return:
        """
        return

    def run(self):
        """
        The task run method, to be overridden in a subclass.

        See :ref:`Task.run`
        """
        pass  # default impl

    @property
    def task_outputs(self):
        """
        The output that this Task produces.

        The output of the Task determines if the Task needs to be run--the task
        is considered finished iff the outputs all exist.
        See :ref:`Task.task_outputs`
        """
        return self.ctrl.relations.task_outputs_user

    @property
    def task_dag(self):
        # type: (...)->_TaskDagNode
        return self.ctrl.task_dag

    def _complete(self):
        """
        If the task has any outputs, return ``True`` if all outputs exist.
        Otherwise, return ``False``.

        However, you may freely override this method with custom logic.
        """
        # we check only user side task outputs
        # all system task outputs are not important (whether they exist or not)
        # users don't see them
        outputs = flatten(self.task_outputs)
        if len(outputs) == 0:
            warnings.warn(
                "Task %r without outputs has no custom complete() method" %
                self,
                stacklevel=2,
            )
            return False

        return all((o.exists() for o in outputs))

    @property
    def current_task_run(self):
        # type: ()->TaskRun
        return get_databand_run().get_task_run(self.task_id)

    def _output(self):
        """
        The default output that this Task produces. Use outputs! Override only if you are writing "base" class
        """
        return NOTHING

    def _requires(self):
        """
        Override in "template" tasks which themselves are supposed to be
        subclassed

        Must return an iterable which among others contains the _requires() of
        the superclass.
        See :ref:`Task.requires`
        """
        pass

    def _task_submit(self):
        """
        Task submission logic, by default we just call -> _task_run() -> run()
        """
        return self._task_run()

    def _task_run(self):
        # bring all relevant files
        self.current_task_run.sync_local.sync_pre_execute()
        with self._auto_load_save_params(auto_read=self._conf_auto_read_params,
                                         save_on_change=True):
            result = self.run()

        self.current_task_run.sync_local.sync_post_execute()
        # publish all relevant files
        return result

    def set_upstream(self, task_or_task_list):
        self.task_dag.set_upstream(task_or_task_list)

    def set_downstream(self, task_or_task_list):
        self.task_dag.set_downstream(task_or_task_list)

    def __lshift__(self, other):
        return self.set_upstream(other)

    def __rshift__(self, other):
        return self.set_downstream(other)

    def set_global_upstream(self, task_or_task_list):
        self.task_dag.set_global_upstream(task_or_task_list)

    @property
    def metrics(self):
        # backward compatible code
        return self.current_task_run.tracker

    def log_dataframe(
        self,
        key,
        df,
        with_preview=True,
        with_schema=True,
        with_size=True,
        with_stats=False,
    ):
        meta_conf = ValueMetaConf(
            log_preview=with_preview,
            log_schema=with_schema,
            log_size=with_size,
            log_stats=with_stats,
        )
        self.metrics.log_dataframe(key, df, meta_conf=meta_conf)

    def log_metric(self, key, value, source=None):
        """
        Logs the passed-in parameter under the current run, creating a run if necessary.
        :param key: Parameter name (string)
        :param value: Parameter value (string)
        """
        return self.metrics.log_metric(key, value, source=source)

    def log_system_metric(self, key, value):
        """Shortcut for log_metric(..., source="system") """
        return self.log_metric(key, value, source="system")

    def log_artifact(self, name, artifact):
        """Log a local file or directory as an artifact of the currently active run."""
        return self.metrics.log_artifact(name, artifact)

    def get_template_vars(self):
        # TODO: move to cached version, (after relations are built)
        base = {
            "task": self,
            "task_family": self.task_meta.task_family,
            "task_name": self.task_meta.task_name,
            "task_signature": self.task_meta.task_signature,
            "task_id": self.task_meta.task_id,
        }
        base.update(self._params.get_params_serialized(input_only=True))
        if self.task_target_date is None:
            base["task_target_date"] = "input"
        return base

    def on_kill(self):
        """
        Override this method to cleanup subprocesses when a task instance
        gets killed. Any use of the threading, subprocess or multiprocessing
        module within an operator needs to be cleaned up or it will leave
        ghost processes behind.
        """
        pass

    def _get_task_output_path_format(self, output_mode):
        if self.task_env.production and output_mode == OutputMode.prod_immutable:
            return self.settings.output.path_prod_immutable_task
        return self._conf__base_output_path_fmt or self.settings.output.path_task

    def get_target(self, name, config=None, output_ext=None, output_mode=None):
        name = name or "tmp/dbnd-tmp-%09d" % random.randint(0, 999999999)
        config = config or TargetConfig()
        path_pattern = self._get_task_output_path_format(output_mode)

        path = calculate_path(
            task=self,
            name=name,
            output_ext=output_ext,
            is_dir=config.folder,
            path_pattern=path_pattern,
        )

        return target(path, config=config)
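
    # Usage sketch inside run(): create an ad-hoc output target under the task's
    # output path pattern (the name and the write() call are illustrative):
    #
    #   report = self.get_target("report.html")
    #   report.write(html_body)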

    def get_root(self):
        return self.task_env.root

    def _initialize(self):
        super(Task, self)._initialize()
        self.ctrl._initialize_task()

    def _should_run(self):
        if not self.task_enabled:
            return False

        if self.task_env.production:
            return self.task_enabled_in_prod or self.settings.run.enable_prod

        return True

    @dbnd_handle_errors(exit_on_error=False)
    def dbnd_run(self):
        # type: (...)-> DatabandRun
        """
        Run task via Databand execution system
        """
        # this code should be executed under context!
        from dbnd._core.current import get_databand_context

        ctx = get_databand_context()
        result = ctx.dbnd_run_task(self)
        return result
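
# A minimal end-to-end sketch, assuming the public dbnd API (PythonTask, parameter,
# output) and an initialized DatabandContext; the class and field names below are
# illustrative and not part of the example above:
#
#   from dbnd import PythonTask, parameter, output
#
#   class PrepareData(PythonTask):
#       raw = parameter[str]
#       prepared = output[str]
#
#       def run(self):
#           self.log_metric("rows", 1)
#           self.prepared.write(self.raw.upper())
#
#   run = PrepareData(raw="hello").dbnd_run()   # returns a DatabandRun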
Example #21
class LoggingConfig(config.Config):
    """Databand's logger configuration"""

    _conf__task_family = "log"
    disabled = parameter(description="Should logging be disabled").value(False)
    capture_stdout_stderr = parameter(
        description="Should logger retransmit all output wrtten to stdout/stderr"
    ).value(True)
    capture_task_run_log = parameter.help(
        "Capture task output into log").value(True)

    override_airflow_logging_on_task_run = parameter(
        description="Replace airflow logger with databand logger").value(True)
    support_jupyter = parameter(
        description="Support logging output to Jupiter UI").value(False)

    level = parameter(
        description="Logging level. DEBUG/INFO/WARN/ERROR").value("INFO")
    formatter = parameter(
        description="Log formatting string (logging library convention)")[str]
    formatter_colorlog = parameter(
        description="Log formatting string (logging library convention)")[str]
    formatter_simple = parameter(
        description="Log formatting string (logging library convention)")[str]

    console_formatter_name = parameter(
        description="The name of the formatter logging to console output")[str]
    file_formatter_name = parameter(
        description="The name of the formatter logging to file output")[str]

    sentry_url = parameter(default=None,
                           description="URL for setting up sentry logger")[str]
    sentry_env = parameter(default=None,
                           description="Envrionment for sentry logger")[str]

    file_log = parameter(default=None,
                         description="Log to file (off by default)")[str]

    stream_stdout = parameter(
        description="Should databand'a logger stream stdout instead of stderr"
    ).value(False)

    custom_dict_config = parameter(
        default=None,
        description="Advanced: Customized logging configuration")[Callable]

    at_warn = parameter.help("name of loggers to put in WARNING mode").c[
        List[str]]
    at_debug = parameter.help("name of loggers to put in DEBUG mode").c[
        List[str]]

    exception_no_color = parameter(
        default=False,
        description="Do not use colors in exception handling")[bool]
    exception_simple = parameter(
        default=False, description="Simple mode of exception handling")[bool]

    send_body_to_server = parameter(
        default=True,
        description="Enable or disable sending log file to server.")[bool]

    send_body_to_server_max_size = parameter(
        default=16 * 1024 * 1024,  # 16MB
        description="Max log file size in bytes to be sent to server.\n"
        "\t* use -1 for unlimited;"
        "\t* use negative (e.g. -1000) to get log's 'head' instead of 'tail'."
        "Default: 16MB.",
    )[int]

    remote_logging_disabled = parameter.help(
        "for tasks using a cloud environment, don't copy the task log to cloud storage"
    ).value(False)

    targets_log_level = parameter(
        default="DEBUG",
        description="Log level used when logging how long it takes to marshal "
        "and unmarshal targets",
    )[str]

    sqlalchemy_print = parameter(
        description="enable sqlalchemy logger").value(False)
    sqlalchemy_trace = parameter(
        description="trace sqlalchemy queries").value(False)
    api_profile = parameter(description="profile api calls").value(False)

    def _initialize(self):
        super(LoggingConfig, self)._initialize()
        self.task_log_file_formatter = None

    def format_exception_as_str(self, exc_info, isolate=True):
        if self.exception_simple:
            return format_exception_as_str(exc_info)

        try:
            tbvaccine = TBVaccine(
                no_colors=self.exception_no_color,
                show_vars=False,
                skip_non_user_on_isolate=True,
                isolate=isolate,
            )
            return tbvaccine.format_tb(*exc_info)
        except Exception as ex:
            logger.info("Failed to format exception: %s", ex)
            return format_exception_as_str(exc_info)

    def get_dbnd_logging_config(self, filename=None):
        if self.custom_dict_config:
            if not in_quiet_mode():
                logger.info("Using user provided logging config")
            return self.settings.log.custom_dict_config()

        return self.get_dbnd_logging_config_base(filename=filename)

    def get_dbnd_logging_config_base(self, filename=None):
        # type: (LoggingConfig, Optional[str]) -> Optional[dict]
        log_settings = self
        log_level = log_settings.level
        # we want to have "real" output, so nothing can catch our handler
        # in opposite to what airflow is doing
        console_stream = (sys.__stdout__
                          if log_settings.stream_stdout else sys.__stderr__)

        if "ipykernel" in sys.modules and self.support_jupyter:
            # we cannot use __stdout__ or __stderr__, as they are not rendered in the Jupyter web UI;
            # at the same time, using sys.stdout while airflow is active is risky,
            # because airflow's stdout redirection can create a feedback loop into the root logger
            console_stream = sys.stdout if log_settings.stream_stdout else sys.stderr

        # dummy path, we will not write to this file
        task_file_handler_file = databand_system_path("logs", "task.log")
        setup_log_file(task_file_handler_file)

        config = {
            "version": 1,
            "disable_existing_loggers": False,
            "filters": {
                "task_context_filter": {
                    "()": "dbnd._core.log.logging_utils.TaskContextFilter"
                }
            },
            "formatters": {
                "formatter": {
                    "format": log_settings.formatter
                },
                "formatter_simple": {
                    "format": log_settings.formatter_simple
                },
                "formatter_colorlog": {
                    "()": "dbnd._vendor.colorlog.ColoredFormatter",
                    "format": log_settings.formatter_colorlog,
                    "reset": True,
                },
            },
            "handlers": {
                "console": {
                    "class": "logging.StreamHandler",
                    "stream": console_stream,
                    "formatter": log_settings.console_formatter_name,
                    "filters": ["task_context_filter"],
                }
            },
            "root": {
                "handlers": ["console"],
                "level": log_level
            },
        }
        if filename:
            setup_log_file(filename)
            config["handlers"]["file"] = {
                "class": "logging.FileHandler",
                "formatter": log_settings.file_formatter_name,
                "filename": filename,
                "encoding": "utf-8",
            }
            config["root"]["handlers"].append("file")

        loggers = config.setdefault("loggers", {})
        for logger_warn in log_settings.at_warn:
            loggers[logger_warn] = {
                "level": logging.WARNING,
                "propagate": True
            }

        for logger_debug in log_settings.at_debug:
            loggers[logger_debug] = {"level": logging.DEBUG, "propagate": True}

        if log_settings.sqlalchemy_print:
            loggers["sqlalchemy.engine"] = {
                "level": logging.INFO,
                "propagate": True
            }

        if log_settings.sentry_url:
            config["handlers"]["sentry"] = get_sentry_logging_config(
                sentry_url=log_settings.sentry_url,
                sentry_env=log_settings.sentry_env)
            config["root"]["handlers"].append("sentry")

        return config
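
    # Sketch of how the returned dict is typically applied; logging.config.dictConfig
    # is the stdlib entry point that configure_logging_dictConfig below is expected
    # to wrap (the file path is illustrative):
    #
    #   cfg = self.get_dbnd_logging_config_base(filename="/tmp/dbnd.log")
    #   # cfg["root"]["handlers"] -> ["console", "file"]
    #   logging.config.dictConfig(cfg)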

    def configure_dbnd_logging(self):
        if self.disabled:
            return

        dict_config = self.get_dbnd_logging_config(filename=self.file_log)

        airflow_task_log_handler = None
        if self.override_airflow_logging_on_task_run:
            airflow_task_log_handler = self.dbnd_override_airflow_logging_on_task_run(
            )

        configure_logging_dictConfig(dict_config=dict_config)

        if airflow_task_log_handler:
            logging.root.handlers.append(airflow_task_log_handler)
        logger.debug("Databand logging is up!")

    def dbnd_override_airflow_logging_on_task_run(self):
        # EXISTING STATE:
        # the root logger uses a console handler -> prints to the current sys.stdout.
        # On a non-interactive `airflow run`, `redirect_stderr` is applied and redirects sys.stdout
        # into the `airflow.task` logger, which saves everything to a file.
        # As a result, EVERY root-logger output goes through the console handler into airflow.task
        # without being printed to the screen.

        if not sys.stderr or not _safe_is_typeof(sys.stderr,
                                                 "StreamLogWriter"):
            logger.debug(
                "Airflow logging is already replaced by dbnd stream log writer!"
            )
            return

        # NEW STATE:
        # move the airflow.task file handler to the root logger,
        # set airflow.task to propagate,
        # and stop airflow's redirection of stdout/stderr.

        # this disables the stdout/stderr redirection
        sys.stderr = sys.__stderr__
        sys.stdout = sys.__stdout__

        airflow_root_console_handler = find_handler(logging.root, "console")

        if _safe_is_typeof(airflow_root_console_handler, "RedirectStdHandler"):
            # remove this console handler: it is the one capable of creating a self-loop,
            # because it writes to the "latest" sys.stdout.
            # If stdout is redirected into any logger that propagates to root,
            # you get a very busy message loop that is hard to debug.
            logging.root.handlers.remove(airflow_root_console_handler)

        airflow_task_logger = logging.getLogger("airflow.task")
        airflow_task_log_handler = find_handler(airflow_task_logger, "task")
        if airflow_task_log_handler:
            logging.root.handlers.append(airflow_task_log_handler)
            airflow_task_logger.propagate = True
            airflow_task_logger.handlers = []

        return airflow_task_log_handler

    def get_task_log_file_handler(self, log_file):
        if not self.task_log_file_formatter:
            config = self.get_dbnd_logging_config()
            configurator = DictConfigurator(config)
            file_formatter_config = configurator.config.get("formatters").get(
                self.file_formatter_name)
            self.task_log_file_formatter = configurator.configure_formatter(
                file_formatter_config)

        # "formatter": log_settings.file_formatter,
        log_file = str(log_file)
        setup_log_file(log_file)
        handler = logging.FileHandler(filename=log_file, encoding="utf-8")
        handler.setFormatter(self.task_log_file_formatter)
        handler.setLevel(self.level)
        return handler
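
# A hedged configuration sketch: since _conf__task_family is "log", these parameters
# are normally set under a [log] section of a dbnd config file (the file location and
# the exact values are assumptions, shown for illustration only):
#
#   [log]
#   level = DEBUG
#   file_log = /var/log/dbnd/run.log
#   exception_simple = True
#   capture_task_run_log = True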
Example #22
File: env.py  Project: cxz/dbnd
class EnvConfig(Config):
    """Databand's environment configuration"""

    _conf__task_family = "env"
    cloud_type = parameter(description="Cloud type: gcp/aws/...")[str]

    env_label = parameter(
        default=EnvLabel.dev,
        description="environment type: dev/int/prod")[str]  # label

    production = parameter(
        description="indicates that environment is production").value(False)

    conn_id = parameter(default=None,
                        description="cloud connection settings")[str]

    # MAIN OUTPUT FOLDER
    root = parameter.folder[DirTarget]

    # DATABAND SYSTEM FOLDERS
    dbnd_root = parameter(description="DBND rooted home folder").output.folder(
        default=None)[DirTarget]
    dbnd_local_root = parameter(
        description="DBND home for the local engine environment"
    ).output.folder()[DirTarget]
    dbnd_data_sync_root = parameter(
        description="Rooted directory for target syncing against remote engine"
    ).output.folder()[DirTarget]

    # execution
    local_engine = parameter(default="local_machine_engine",
                             description="Engine for local execution")[str]
    remote_engine = parameter(
        description="Remote engine for driver/tasks execution").none[str]

    submit_driver = parameter(
        description="Submit driver to remote_engine").none[bool]
    submit_tasks = parameter(
        description="Submit tasks to remote engine one by one").none[bool]

    # properties that will affect "task-env" section
    spark_config = task_env_param.help("Spark Configuration").value("spark")

    spark_engine = task_env_param.help(
        "Cluster engine (local/emr(aws)/dataproc(gcp)/..").value("spark_local")

    hdfs = task_env_param.help("Hdfs cluster config").value("hdfs_knox")

    beam_config = task_env_param.help("Apache Beam configuration").value(
        "beam")
    beam_engine = task_env_param.help(
        "Apache Beam cluster engine (local/dataflow)").value(
            ApacheBeamClusterType.local)

    docker_engine = task_env_param.help(
        "Docker job engine (docker/aws_batch)").value("docker")

    def _initialize(self):
        super(EnvConfig, self)._initialize()
        try:
            self.dbnd_root = self.dbnd_root or self.root.folder("dbnd")

            if not self.dbnd_local_root:
                if not self.dbnd_root.is_local():
                    raise friendly_error.config.dbnd_root_local_not_defined(
                        self.name)
                self.dbnd_local_root = self.dbnd_root
        except Exception as e:
            raise friendly_error.task_build.failed_to_access_dbnd_home(
                self.dbnd_root, e)

        if not self.dbnd_data_sync_root:
            self.dbnd_data_sync_root = self.dbnd_root.folder("sync")

        if self.submit_driver is None:
            self.submit_driver = bool(self.remote_engine)

        if self.submit_tasks is None:
            self.submit_tasks = bool(self.remote_engine)
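
    # Worked example of the defaults derived above (values are illustrative):
    #   root                 = s3://my-bucket/data
    #   dbnd_root            -> s3://my-bucket/data/dbnd        (root / "dbnd")
    #   dbnd_local_root      -> must be set explicitly, because dbnd_root is remote
    #   dbnd_data_sync_root  -> s3://my-bucket/data/dbnd/sync   (dbnd_root / "sync")
    #   submit_driver/submit_tasks -> True only when remote_engine is configured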

    @property
    def name(self):
        return self.task_meta.task_name

    @property
    def cloud_type(self):
        return self.task_meta.task_family

    def prepare_env(self):
        pass
Example #23
class RunConfig(config.Config):
    """Databand's per run settings (e.g. execution date)"""

    _conf__task_family = "run"

    ######
    # if None, a value is generated by DatabandRun
    name = parameter.value(default=None, description="Specify run name")[str]

    description = parameter.value(default=None, description="Specify run description")[
        Optional[str]
    ]

    # Executor configuration
    parallel = parameter(default=None)[bool]
    task_executor_type = parameter(
        default=None,
        description="Alternate executor type: "
        " local/airflow_inprocess/airflow_multiprocess_local/airflow_kubernetes,"
        "  see docs for more options",
    )[str]

    enable_airflow_kubernetes = parameter(
        default=True,
        description="Enable use of kubernetes executor for kubebernetes engine submission",
    )[bool]

    ######
    # Local/Remote control
    interactive = parameter(
        default=False,
        description="When submitting driver to remote execution keep tracking of submitted process and wait for completion",
    )[bool]
    submit_driver = parameter(
        description="override env.submit_driver for specific environment"
    ).none[bool]
    submit_tasks = parameter(
        description="override env.submit_tasks for specific environment"
    ).none[bool]

    # What to do on run
    open_web_tracker_in_browser = parameter(
        description="If True, open web tracker in browser during task run."
    ).value(False)

    is_archived = parameter(description="Save this run in the archive").value(False)

    dry = parameter(default=False).help(
        "Do not execute tasks, stop before sending them to the execution, and print their status"
    )[bool]

    run_result_json_path = parameter(default=None).help(
        "The path to save the task band of the run"
    )[str]

    debug_pydevd_pycharm_port = parameter(default=None).help(
        "Enable debugging with `pydevd_pycharm` by setting this to the port value expecting the debugger to connect.\n"
        "This will start a new `settrace` connecting to `localhost` on the requested port, "
        "right before starting the driver task_run."
    )[int]
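
    # What setting this port implies, sketched with the pydevd-pycharm package
    # (the exact call site inside dbnd is an assumption):
    #
    #   import pydevd_pycharm
    #   pydevd_pycharm.settrace("localhost", port=<debug_pydevd_pycharm_port>,
    #                           stdoutToServer=True, stderrToServer=True)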

    ######
    # AIRFLOW EXECUTOR CONFIG
    execution_date = parameter(default=None, description="Override execution date")[
        datetime
    ]
    mark_success = parameter(
        description="Mark jobs as succeeded without running them"
    ).value(False)

    ######
    # Task Selectors (to schedule specific task from pipeline)
    id = parameter(default=None, description="The list of task ids to run")[List[str]]
    selected_tasks_regex = parameter(
        default=None, description="Run only specified tasks (regular expresion)"
    )[List[str]]

    ignore_dependencies = parameter(
        description="Ignore upstream task dependencies and run only the selected tasks"
    ).value(False)
    ignore_first_depends_on_past = parameter(
        description="Ignore depends_on_past for the first set of tasks only"
    ).value(False)

    ######
    # Scheduler configuration

    skip_completed = parameter(
        description="Mark jobs as succeeded without running them"
    ).value(True)
    fail_fast = parameter(
        description="Skip all remaining tasks if a task has failed"
    ).value(True)
    enable_prod = parameter(description="Enable production tasks").value(False)

    skip_completed_on_run = parameter(default=True).help(
        "Should dbnd task check that task is completed and mark it as re-used on task execution"
    )[bool]

    validate_task_inputs = parameter(default=True).help(
        "Should dbnd task check that all input files exist"
    )[bool]

    validate_task_outputs = parameter(default=True).help(
        "Should dbnd task check that all outputs exist after task has been executed"
    )[bool]

    validate_task_outputs_on_build = parameter(default=False).help(
        "Should dbnd task check that there are no incomplete outputs before task executes"
    )[bool]

    pipeline_band_only_check = parameter(default=False).help(
        "When checking if pipeline is completed, check only if the band file exist (skip the tasks)"
    )[bool]

    recheck_circle_dependencies = parameter(
        description="Re-check circular dependencies on every task creation;"
        " use it if you need to find a cycle in your graph"
    ).value(False)

    task_complete_parallelism_level = parameter(default=1).help(
        "Number of threads to use when checking if tasks are already complete"
    )[int]

    pool = parameter(default=None, description="Resource pool to use")[str]

    ######
    # Advanced Run settings (debug/workarounds)
    # run .pickle file
    always_save_pipeline = parameter(
        description="Boolean for always saving pipeline to pickle"
    ).value(False)
    disable_save_pipeline = parameter(
        description="Boolean for disabling pipeline pickling"
    ).value(False)
    donot_pickle = parameter(
        description="Do not attempt to pickle the DAG object to send over "
        "to the workers, just tell the workers to run their version "
        "of the code."
    ).value(False)
    pickle_handler = parameter(
        default=None,
        description="Defines a python pickle handler to be used to pickle the "
        "run's data",
    )[str]
    enable_concurent_sqlite = parameter(
        description="Enable concurrent execution with sqlite db (use only for debug!)"
    ).value(False)

    ######
    # HEARTBEAT (the process that reports driver status every `heartbeat_interval_s`)
    #
    heartbeat_interval_s = parameter(
        description="How often a run should send a heartbeat to the server. Set -1 to disable"
    )[int]
    heartbeat_timeout_s = parameter(
        description="How old can a run's last heartbeat be before we consider it failed. Set -1 to disable"
    )[int]
    heartbeat_sender_log_to_file = parameter(
        description="create a separate log file for the heartbeat sender and don't log the run process stdout"
    )[bool]

    hearbeat_disable_plugins = parameter(
        default=False, description="disable dbnd plugins at heartbeat sub-process"
    )[bool]
    ######
    # Task/Pipeline in task Execution
    task_run_at_execution_time_enabled = parameter(
        default=True, description="Allow tasks calls during another task execution"
    )[bool]
    task_run_at_execution_time_in_memory_outputs = parameter(
        default=False,
        description="Store outputs for inline task at execution time in memory (do not use FileSystem)",
    )[bool]
    target_cache_on_access = parameter(
        default=True, description="Cache targets values in memory during execution"
    )[bool]
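
# A hedged configuration sketch: with _conf__task_family = "run", these settings are
# normally driven from a [run] section of a dbnd config file (keys mirror the
# parameter names above; values are illustrative):
#
#   [run]
#   fail_fast = False
#   task_complete_parallelism_level = 4
#   heartbeat_interval_s = 5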