def run_execute(self): """ Runs the main driver! """ run = self.run run.tracker.init_run() run.set_run_state(RunState.RUNNING) # with captures_log_into_file_as_task_file(log_file=self.local_driver_log.path): try: if self.run_config.debug_pydevd_pycharm_port is not None: debug_port = self.run_config.debug_pydevd_pycharm_port # currently no need to export the host to configuration. start_pycharm_debugger(host="localhost", port=debug_port) self.start_time = utcnow() run.driver_task_run.runner.execute() self.finished_time = utcnow() # if we are in submitter we don't want to print banner that everything is good except DatabandRunError as ex: self._dbnd_run_error(ex) raise except (Exception, KeyboardInterrupt, SystemExit) as ex: raise self._dbnd_run_error(ex) finally: try: self.host_engine.cleanup_after_run() except Exception: logger.exception( "Failed to shutdown the current run, continuing") return self
def wait(self): """ Waits for pod completion :return: """ self._wait_for_pod_started() self.log.info("Pod is running, reading logs..") self.stream_pod_logs(follow=True) self.log.info("Successfully read pod logs") pod_phase = self.get_pod_phase() wait_start = utcnow() while pod_phase not in {PodPhase.SUCCEEDED, PodPhase.FAILED}: logger.debug( "Pod '%s' is not completed with state %s, waiting..", self.name, pod_phase, ) if (utcnow() - wait_start ) > self.kube_config.submit_termination_grace_period: raise DatabandRuntimeError( "Pod is not in a final state after {grace_period}: {state}" .format( grace_period=self.kube_config. submit_termination_grace_period, state=pod_phase, )) time.sleep(5) pod_phase = self.get_pod_phase() if pod_phase != PodPhase.SUCCEEDED: raise DatabandRuntimeError( "Pod returned a failure: {pod_phase}".format( pod_phase=pod_phase)) return self
def _log_metric(self, key, value, timestamp=None, source=None, is_json=False): # type: (str, Any, Optional[datetime], Optional[MetricSource], Optional[bool]) -> None if is_json: metric = Metric(key=key, timestamp=timestamp or utcnow(), value_json=value) else: metric = Metric(key=key, timestamp=timestamp or utcnow(), value=value) self.tracking_store.log_metric( task_run=self.task_run, metric=metric, source=source )
def parse_from_str(self, s): """ Parses a date string formatted like ``YYYY-MM-DD``. """ if s.lower() == DateAlias.today: return utcnow().date() elif s.lower() == DateAlias.yesterday: return utcnow().date() - datetime.timedelta(days=1) else: return datetime.datetime.strptime(s, self.date_format).date()
def unique_execution_date(): """ create unique exec date for every run ( so it runs! ) it should not success datetime.now() """ global _current_unique_execution_date_salt _current_unique_execution_date_salt += 1 if _current_unique_execution_date_salt == 1: return utcnow() # just return current utc # task can't run ahead of time! deadlock in scheduler so we go back 1 sec # and add unique salt - every time we have new execution date return (utcnow() - timedelta(seconds=1) + timedelta(microseconds=_current_unique_execution_date_salt))
def stop(self): if not self._active: return self._active = False try: databand_run = self._run root_tr = self._task_run root_tr.finished_time = utcnow() if root_tr.task_run_state not in TaskRunState.finished_states(): for tr in databand_run.task_runs: if tr.task_run_state == TaskRunState.FAILED: root_tr.set_task_run_state( TaskRunState.UPSTREAM_FAILED) break else: root_tr.set_task_run_state(TaskRunState.SUCCESS) driver_tr = databand_run.driver_task.current_task_run if driver_tr.task_run_state not in TaskRunState.finished_states(): driver_tr.set_task_run_state(TaskRunState.SUCCESS) if root_tr.task_run_state == TaskRunState.SUCCESS: databand_run.set_run_state(RunState.SUCCESS) else: databand_run.set_run_state(RunState.FAILED) logger.info(databand_run.describe.run_banner_for_finished()) self._close_all_context_managers() except Exception as ex: _handle_inline_run_error("dbnd-tracking-shutdown")
def __init__(self, name="global"): """ Initialize cmd line args """ super(DatabandContext, self).__init__() # double checking on bootstrap, as we can run from all kind of locations # usually we should be bootstraped already as we run from cli. dbnd_bootstrap() self.name = name self.current_context_uid = "%s_%s" % ( utcnow().strftime("%Y%m%d_%H%M%S"), str(uuid.uuid1())[:8], ) self.config = config # we are running from python notebook, let start to print to stdout if self.name == "interactive" or is_unit_test_mode(): self.config.set("log", "stream_stdout", "True", source="log") self.task_instance_cache = TaskInstanceCache() self.user_code_detector = UserCodeDetector.build_code_detector() # will set up in __enter__ # we can't initialize settings without having self defined as context # we assign real object only in _on_enter, however it's great for auto completion from dbnd._core.settings import DatabandSettings self.settings = None # type: DatabandSettings self._is_initialized = False self._tracking_store = None
def set_unfinished_tasks_state(self, run_uid, state): return self._m( self.channel.set_unfinished_tasks_state, run_uid=run_uid, state=state, timestamp=utcnow(), )
def log_target( self, task_run, target, target_meta, # type: ValueMeta operation_type, # type: DbndTargetOperationType operation_status, # type: DbndTargetOperationStatus param_name=None, # type: Optional[str] task_def_uid=None, # type: Optional[UUID] ): data_schema = (json_utils.dumps(target_meta.data_schema) if target_meta.data_schema is not None else None) target_info = LogTargetArgs( run_uid=task_run.run.run_uid, task_run_uid=task_run.task_run_uid, task_run_name=task_run.job_name, task_run_attempt_uid=task_run.task_run_attempt_uid, task_def_uid=task_def_uid, param_name=param_name, target_path=str(target), operation_type=operation_type, operation_status=operation_status, value_preview=target_meta.value_preview, data_dimensions=target_meta.data_dimensions, data_schema=data_schema, data_hash=target_meta.data_hash, ) res = self.log_targets(targets_info=[target_info]) if target_meta.histograms: self.log_histograms(task_run, param_name, target_meta, utcnow()) return res
def log_exception_to_server(exception=None): try: from dbnd._core.current import get_databand_context client = get_databand_context().databand_api_client if client is None or not client.is_configured(): return e_type, e_value, e_traceback = sys.exc_info() if exception: e_type, e_value, e_traceback = ( type(exception), exception, exception.__traceback__, ) trace = _format_exception(e_type, e_value, e_traceback) data = { "dbnd_version": dbnd.__version__, "source": "tracking-sdk", "stack_trace": trace, "timestamp": utcnow().isoformat(), } return client.api_request(endpoint="log_exception", method="POST", data=data) except Exception: # noqa logger.debug("Error sending monitoring exception message", exc_info=True)
def stop(self): if not self._active: return self._active = False try: databand_run = self._run root_tr = self._task_run root_tr.finished_time = utcnow() if root_tr.task_run_state not in TaskRunState.finished_states(): for tr in databand_run.task_runs: if tr.task_run_state == TaskRunState.FAILED: root_tr.set_task_run_state(TaskRunState.UPSTREAM_FAILED) break else: root_tr.set_task_run_state(TaskRunState.SUCCESS) if root_tr.task_run_state == TaskRunState.SUCCESS: databand_run.set_run_state(RunState.SUCCESS) else: databand_run.set_run_state(RunState.FAILED) # todo: hard to control the console output if we printing to the console not from the console tracker if not CoreConfig.current().silence_tracking_mode: logger.info(databand_run.describe.run_banner_for_finished()) self._close_all_context_managers() except Exception as ex: _handle_tracking_error("dbnd-tracking-shutdown")
def log_dataset( self, task_run, # type: TaskRun operation_path, # type: Union[Target, str] data_meta, # type: ValueMeta operation_type, # type: DbndDatasetOperationType operation_status, # type: DbndTargetOperationStatus operation_error, # type: str with_partition=None, # type: Optional[bool] ): dataset_info = LogDatasetArgs( run_uid=task_run.run.run_uid, task_run_uid=task_run.task_run_uid, task_run_name=task_run.job_name, task_run_attempt_uid=task_run.task_run_attempt_uid, operation_path=str(operation_path), operation_type=operation_type, operation_status=operation_status, operation_error=operation_error, value_preview=data_meta.value_preview, columns_stats=data_meta.columns_stats, data_dimensions=data_meta.data_dimensions, data_schema=data_meta.data_schema, query=data_meta.query, with_partition=with_partition, timestamp=utcnow(), dataset_uri=None, ) res = self.log_datasets(datasets_info=[dataset_info]) return res
def export_db( archive, include_db=True, include_logs=True, task_version=utcnow().strftime("%Y%m%d_%H%M%S"), ): # type: (Path, bool, bool, str)-> None from dbnd._core.current import get_databand_context logger.info("Compressing files to %s..." % archive) with tarfile.open(str(archive), "w:gz") as tar: if include_db: dbnd_context = get_databand_context() conn_string = dbnd_context.config.get("webserver", "sql_alchemy_conn") if conn_string.startswith("postgresql"): with tempfile.NamedTemporaryFile(prefix="dbdump.", suffix=".sql") as tf: dump_postgres(conn_string, tf.name) tar.add(tf.name, arcname="postgres-dbnd.sql") else: raise DatabandRuntimeError( "Can not export db! " "Currently, we support only sqlite and postgres db in automatic export" ) if include_logs: context = get_databand_context() local_env = context.settings.get_env_config(CloudType.local) logs_folder = local_env.dbnd_local_root.folder("logs").path if os.path.exists(logs_folder): logger.info("Adding run folder from '%s'", logs_folder) tar.add(logs_folder, "run") else: logger.warning("Logs dir '%s' is not found", logs_folder)
def wrapped(obj, *args, **kwargs): obj_logger = getattr(obj.__module__, "logger", None) or logging.getLogger(obj.__module__) try: obj_logger.debug("[%s] %s", obj, message or f.__name__) result = f(obj, *args, **kwargs) _report_error(obj, f, None) return result except Exception as e: obj_logger.exception("[%s] Error during %s", obj, message or f.__name__) err_message = traceback.format_exc() if self.configuration_service_provider: configuration_service = self.configuration_service_provider( ) else: configuration_service = _get_tracking_service(obj) _log_exception_to_server(err_message, configuration_service) if isinstance(e, DatabandError): err_message = logger_format_for_databand_error(e) err_message += "\nTimestamp: {}".format(utcnow()) _report_error(obj, f, err_message)
def process_target(target, name): # type: (Target, str) -> None target_path = str(target) dbnd_target = targets.get(target_path) if not dbnd_target: # we see this target for the first time target_task_run_uid = ( None) # let assume that Target is now owned by any task # let try to find it's owner, so we create target that relates to some Task # if `task` is pipeline, the target owner is going to be different task if target.task: target_task_run = run.get_task_run(target.task.task_id) if target_task_run: target_task_run_uid = target_task_run.task_run_uid dbnd_target = targets[target_path] = TargetInfo( path=target_path, created_date=utcnow(), task_run_uid=target_task_run_uid, parameter_name=name, ) logger.debug( "New Target: %s -> %s -> %s", target.task, target_task_run_uid, target_path, ) task_targets.append(dbnd_target)
def _run_to_run_info(self): # type: () -> RunInfo run = self.run task = run.driver_task_run.task context = run.context env = run.env return RunInfo( run_uid=run.run_uid, job_name=run.job_name, user=context.task_run_env.user, name=run.name, state=RunState.RUNNING, start_time=utcnow(), end_time=None, description=run.description, is_archived=run.is_archived, env_name=env.name, cloud_type=env.cloud_type, # deprecate and airflow dag_id=run.dag_id, execution_date=run.execution_date, cmd_name=context.name, driver_name=env.remote_engine or env.local_engine, # move to task target_date=task.task_target_date, version=task.task_version, # root and submitted by root_run=run.root_run_info, scheduled_run=run.scheduled_run_info, trigger="unknown", sends_heartbeat=run.sends_heartbeat, task_executor=run.task_executor_type, )
def stop(self, finalize_run=True): if not self._active: return self._active = False try: # Required for scripts tracking which do not set the state to SUCCESS if finalize_run: databand_run = self._run root_tr = self._task_run root_tr.finished_time = utcnow() if root_tr.task_run_state not in TaskRunState.finished_states( ): for tr in databand_run.task_runs: if tr.task_run_state == TaskRunState.FAILED: root_tr.set_task_run_state( TaskRunState.UPSTREAM_FAILED) break else: # We can reach here in case of raising exception tracking stand alone python script if sys.exc_info()[1]: root_tr.set_task_run_state(TaskRunState.FAILED) else: root_tr.set_task_run_state(TaskRunState.SUCCESS) if root_tr.task_run_state == TaskRunState.SUCCESS: databand_run.set_run_state(RunState.SUCCESS) else: databand_run.set_run_state(RunState.FAILED) self._close_all_context_managers() except Exception: _handle_tracking_error("dbnd-tracking-shutdown")
def test_dbnd_pass_through_default(pandas_data_frame_on_disk, mock_channel_tracker): df, df_file = pandas_data_frame_on_disk some_date = utcnow().isoformat() r = task_pass_through_default.dbnd_run( str(df_file), some_date, expect_pass_through=False ) assert r.root_task.result.read() == str(df) _check_tracking_calls( mock_channel_tracker, { "init_run": 1, "add_task_runs": 1, "update_task_run_attempts": 4, # DAG start, driver start, task start, task finished "log_metrics": 3, # 1 data metric call, 2 marshalling data calls "log_targets": 2, # read input "data" dataframe, write result "save_task_run_log": 2, # task, driver "set_run_state": 2, # running, success }, ) _assert_tracked_params( mock_channel_tracker, task_pass_through_default, data=str( df_file ), # param value is file path, the DF value will be logged as log_targets dt=some_date.replace("+00:00", "").replace(":", ""), expect_pass_through=False, )
def calculate_since_value(since_now, since, sync_history, history_only, api_client, airflow_server_info): if since_now: final_since_value = utcnow() elif since: final_since_value = pendulum.parse(since, tz=pytz.UTC) elif sync_history or history_only: final_since_value = pendulum.datetime.min else: # Default mode try: get_sync_times_from_api(api_client, airflow_server_info) final_since_value = airflow_server_info.synced_to if final_since_value: logger.info("Resuming sync from latest stop at: %s" % (final_since_value, )) else: logger.info( "Latest sync stop not found. Starting sync from the beginning" ) except Exception: logger.info( "Could not locate latest sync stop. Starting Airflow Monitor syncing from the beginning." ) final_since_value = pendulum.datetime.min return final_since_value
def stop(self): if self._stoped: return try: databand_run = self._run root_tr = self._task_run root_tr.finished_time = utcnow() if root_tr.task_run_state not in TaskRunState.finished_states(): for tr in databand_run.task_runs: if tr.task_run_state == TaskRunState.FAILED: root_tr.set_task_run_state( TaskRunState.UPSTREAM_FAILED) databand_run.set_run_state(RunState.FAILED) break else: root_tr.set_task_run_state(TaskRunState.SUCCESS) if root_tr.task_run_state == TaskRunState.SUCCESS: databand_run.set_run_state(RunState.SUCCESS) else: databand_run.set_run_state(RunState.FAILED) logger.info(databand_run.describe.run_banner_for_finished()) self._close_all_context_managers() except: _handle_inline_error("dbnd-tracking-shutdown") finally: self._stoped = True
def test_tracking_pass_through_default_tracking( pandas_data_frame_on_disk, mock_channel_tracker ): df, df_file = pandas_data_frame_on_disk # we'll pass string instead of defined expected DataFrame and it should work some_date = utcnow().isoformat() task_result = task_pass_through_default( str(df_file), some_date, expect_pass_through=True ) assert task_result == str(df_file) # this should happen on process exit in normal circumstances dbnd_tracking_stop() _check_tracking_calls( mock_channel_tracker, { "init_run": 1, "add_task_runs": 1, "log_metrics": 1, "log_targets": 1, "save_task_run_log": 2, "set_run_state": 1, "update_task_run_attempts": 4, # DAG start(with task start), task started running, task finished, dag stop }, ) _assert_tracked_params( mock_channel_tracker, task_pass_through_default, data=str(df_file), dt=some_date, expect_pass_through=True, )
def do_incomplete_data_fetching_iteration( airflow_config, airflow_instance_detail, api_client, tracking_store, incomplete_offset, ): """ Fetch incomplete data from Airflow web server, return number of items fetched """ # Max time to look for incomplete data, we do not update this but use pagination instead since = utcnow() - timedelta( days=airflow_config.oldest_incomplete_data_in_days) data = try_fetching_from_airflow( airflow_instance_detail, airflow_config, since, incomplete_offset, api_client, False, ) if not validate_airflow_monitor_data(data, airflow_instance_detail, airflow_config, api_client): return 0 try: log_received_tasks(airflow_instance_detail.url, data) send_metrics(airflow_instance_detail, data) export_data = _as_dotted_dict(**data) save_airflow_monitor_data( data, tracking_store, airflow_instance_detail.url, airflow_instance_detail.airflow_server_info.last_sync_time, ) logger.info( "Total {} task instances, {} dag runs, {} dags saved to databand web server" .format( len(export_data.task_instances), len(export_data.dag_runs), len(export_data.dags), )) total_fetched = max(len(data["task_instances"]), len(data["dag_runs"])) return total_fetched except Exception as e: logger.exception( "An error occurred while trying to sync data from airflow to databand: %s", e, ) dump_unsent_data(data) send_exception_info(airflow_instance_detail, api_client, airflow_config) return 0
def set_task_run_state(self, task_run, state, error=None, timestamp=None): super(ConsoleStore, self).set_task_run_state(task_run=task_run, state=state) task = task_run.task if self._is_in_airflow_tracking_mode: if state == TaskRunState.RUNNING: logger.info("Tracking %s task at %s", task.task_name, task_run.task_tracker_url) return # optimize, don't print success banner for fast running tasks start_time = task_run.start_time or utcnow() quick_task = task_run.finished_time and ( task_run.finished_time - start_time) < timedelta(seconds=5) show_simple_log = not self.verbose and (task_run.task.task_is_system or quick_task) level = logging.INFO color = "cyan" task_friendly_id = task_run.task_af_id task_id_str = "%s(%s)" % (task.task_id, task_friendly_id) if state in [TaskRunState.RUNNING, TaskRunState.QUEUED]: task_msg = "Running task %s" % task_id_str elif state == TaskRunState.SUCCESS: task_msg = "Task %s has been completed!" % task_id_str color = "green" elif state == TaskRunState.FAILED: task_msg = "Task %s has failed!" % task_id_str color = "red" level = logging.WARNING if task_run.task.task_name in SystemTaskName.driver_and_submitter: show_simple_log = True elif state == TaskRunState.CANCELLED: task_msg = "Task %s has been canceled!" % task_id_str color = "red" level = logging.WARNING show_simple_log = True else: task_msg = "Task %s moved to %s state" % (task_friendly_id, state) if show_simple_log: logger.log(level, task_msg) return try: logger.log( level, task.ctrl.visualiser.banner(task_msg, color, task_run=task_run)) except Exception as ex: logger.log(level, "%s , failed to create banner: %s" % (task_msg, ex))
def _log_param(self, run_id, param): # type: (str, mlflow.entities.Param) -> None # Temporarly log params as metrics dbnd_metric = Metric(key=param.key, value=param.value, timestamp=utcnow()) self.dbnd_store.log_metrics( task_run=self._get_current_task_run(), metrics=[dbnd_metric] ) logger.info("Param {}".format(param))
def _build_snowflake_resource_usage_query( database: str, query_id: str, session_id: int, history_window: float, query_history_result_limit: int, config: SnowflakeConfig, ) -> str: time_end = utcnow() - timedelta(minutes=config.query_history_end_time_range_end) time_start = time_end - timedelta( minutes=history_window or config.query_history_end_time_range_start ) if session_id: query_history = dedent( """\ select {metrics} from table({database}.information_schema.query_history_by_session( SESSION_ID => {session_id}, END_TIME_RANGE_START => '{time_start}'::timestamp_ltz, END_TIME_RANGE_END => '{time_end}'::timestamp_ltz, RESULT_LIMIT => {result_limit} )) where query_id='{query_id}' order by start_time desc limit 1;""" ).format( metrics=RESOURCE_METRICS, database=database, minutes=history_window, session_id=session_id, result_limit=query_history_result_limit, time_start=time_start, time_end=time_end, query_id=query_id, ) return query_history else: query_history = dedent( """\ select {metrics} from table({database}.information_schema.query_history( END_TIME_RANGE_START => '{time_start}'::timestamp_ltz, END_TIME_RANGE_END => '{time_end}'::timestamp_ltz, RESULT_LIMIT => {result_limit} )) where query_id='{query_id}' order by start_time desc limit 1;""" ).format( metrics=RESOURCE_METRICS, database=database, minutes=history_window, result_limit=query_history_result_limit, time_start=time_start, time_end=time_end, query_id=query_id, ) return query_history
def _should_stop(self): if ( self.monitor_config.number_of_iterations and self.iteration >= self.monitor_config.number_of_iterations ): return True if self.stop_at and utcnow() >= self.stop_at: return True
def log_data( self, key, # type: str data, # type: Union[pd.DataFrame, spark.DataFrame, PostgresTable, SnowflakeTable] meta_conf, # type: ValueMetaConf path=None, # type: Optional[Union[Target,str]] operation_type=DbndTargetOperationType. read, # type: DbndTargetOperationType operation_status=DbndTargetOperationStatus. OK, # type: DbndTargetOperationStatus raise_on_error=False, # type: bool ): # type: (...) -> None try: # Combine meta_conf with the config settings meta_conf = self.settings.tracking.get_value_meta_conf(meta_conf) value_meta = get_value_meta_from_value(key, data, meta_conf, raise_on_error) if not value_meta: logger.warning( "Couldn't log the wanted data {name}, reason - can't log objects of type {value_type} " .format(name=key, value_type=type(data))) return ts = utcnow() if path: self.tracking_store.log_target( task_run=self.task_run, target=path, target_meta=value_meta, operation_type=operation_type, operation_status=operation_status, param_name=key, ) metrics = value_meta.build_metrics_for_key(key, meta_conf) if metrics["user"]: self._log_metrics(metrics["user"]) if metrics["histograms"]: self.tracking_store.log_histograms(task_run=self.task_run, key=key, value_meta=value_meta, timestamp=ts) if not (metrics["user"] or metrics["histograms"] or path): logger.info("No metrics to log_data(key={}, data={})".format( key, data)) except Exception as ex: log_exception( "Error occurred during log_dataframe for %s" % (key, ), ex, non_critical=not raise_on_error, ) if raise_on_error: raise
def heartbeat(self): if self._running: # Refresh _running config incaese sleep_interval changed self._running.refresh_config() if (self.last_heartbeat and (utcnow() - self.last_heartbeat).total_seconds() < self._running.sleep_interval - 1): return try: self._iteration += 1 logger.info("Running sync for %s", self) self._running.sync_once() except Exception: self._running = None raise finally: self.last_heartbeat = utcnow()
def report_exception(self, exception: str): data = { "dbnd_version": dbnd.__version__, "source": f"{self.monitor_type}_monitor", "stack_trace": exception, "timestamp": utcnow().isoformat(), } return self._api_client.api_request(endpoint="log_exception", method="POST", data=data)
def log_metrics(metrics_dict, source="user", timestamp=None): # type: (Dict[str, Any], str, datetime) -> None tracker = _get_tracker() if tracker: from dbnd._core.tracking.schemas.metrics import Metric from dbnd._core.utils.timezone import utcnow metrics = [ Metric(key=key, value=value, source=source, timestamp=timestamp or utcnow()) for key, value in metrics_dict.items() ] tracker._log_metrics(metrics)