def calculate_since_value(
    since_now, since, sync_history, history_only, api_client, airflow_server_info
):
    if since_now:
        final_since_value = utcnow()
    elif since:
        final_since_value = pendulum.parse(since, tz=pytz.UTC)
    elif sync_history or history_only:
        final_since_value = pendulum.datetime.min
    else:
        # Default mode: resume from the last synced point reported by the API
        try:
            get_sync_times_from_api(api_client, airflow_server_info)
            final_since_value = airflow_server_info.synced_to
            if final_since_value:
                logger.info("Resuming sync from latest stop at: %s", final_since_value)
            else:
                logger.info(
                    "Latest sync stop not found. Starting sync from the beginning"
                )
        except Exception:
            logger.info(
                "Could not locate latest sync stop. "
                "Starting Airflow Monitor syncing from the beginning."
            )
            final_since_value = pendulum.datetime.min

    return final_since_value
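A usage sketch (not from the source, assuming calculate_since_value and its imports are in scope): since_now takes precedence and short-circuits before any API access, so the client arguments can be None on that path, while an explicit since string is parsed as UTC.

start = calculate_since_value(
    since_now=True, since=None, sync_history=False,
    history_only=False, api_client=None, airflow_server_info=None,
)  # returns utcnow(); the API is never queried on this path

start = calculate_since_value(
    since_now=False, since="2021-01-01T00:00:00", sync_history=False,
    history_only=False, api_client=None, airflow_server_info=None,
)  # returns pendulum.parse("2021-01-01T00:00:00", tz=pytz.UTC)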
def _set_tracking_config_overide(airflow_context=None):
    # Create a proper DatabandContext so we can create other objects.
    # There should be no orchestration tasks here; however, let's disable
    # any orchestration side effects just in case.
    config_for_tracking = {
        "run": {
            "skip_completed": False,
            "skip_completed_on_run": False,
            "validate_task_inputs": False,
            "validate_task_outputs": False,
        },  # we don't want to "check" anything, as the script runs with task_version="now"
        "task": {"task_in_memory_outputs": True},  # do not save any outputs
        "core": {"tracker_raise_on_error": False},  # do not fail on tracker errors
    }
    if airflow_context:
        import pytz

        task_target_date = pendulum.parse(
            airflow_context.execution_date, tz=pytz.UTC
        ).date()
        use_dbnd_log = override_airflow_log_system_for_tracking()
        if use_dbnd_log is not None:
            config_for_tracking["log"] = {"disabled": not use_dbnd_log}
        config_for_tracking["task"]["task_target_date"] = task_target_date

    return config.set_values(
        config_values=config_for_tracking,
        priority=ConfigValuePriority.OVERRIDE,
        source="dbnd_tracking_config",
    )
def set_tracking_config_overide(airflow_context=None, use_dbnd_log=None):
    # 1. create proper DatabandContext so we can create other objects
    track_with_cache = config.getboolean("run", "tracking_with_cache")
    config_for_tracking = {
        "run": {
            "skip_completed": track_with_cache,
            "skip_completed_on_run": track_with_cache,
            "validate_task_inputs": track_with_cache,
            "validate_task_outputs": track_with_cache,
        },  # we don't want to "check" as script is task_version="now"
        "task": {
            "task_in_memory_outputs": not track_with_cache
        },  # do not save any outputs
        "core": {"tracker_raise_on_error": False},  # do not fail on tracker errors
    }
    if airflow_context:
        import pytz

        task_target_date = pendulum.parse(
            airflow_context.execution_date, tz=pytz.UTC
        ).date()
        use_dbnd_log = override_airflow_log_system_for_tracking()
        config_for_tracking["task"]["task_target_date"] = task_target_date

    if use_dbnd_log is not None:
        config_for_tracking["log"] = {"disabled": not use_dbnd_log}

    return config.set_values(
        config_values=config_for_tracking, override=True, source="dbnd_tracking_config"
    )
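A minimal pure-Python sketch of the pattern above (no dbnd imports; the helper name is hypothetical): a single tracking_with_cache flag drives every cache-related run setting and inverts task_in_memory_outputs, so the two tracking modes cannot drift apart.

# Hypothetical helper mirroring the dict built in set_tracking_config_overide.
def tracking_config_for(track_with_cache):
    return {
        "run": {
            "skip_completed": track_with_cache,
            "skip_completed_on_run": track_with_cache,
            "validate_task_inputs": track_with_cache,
            "validate_task_outputs": track_with_cache,
        },
        # outputs stay in memory only when caching is off
        "task": {"task_in_memory_outputs": not track_with_cache},
        "core": {"tracker_raise_on_error": False},
    }

assert tracking_config_for(False)["task"]["task_in_memory_outputs"] is True
assert tracking_config_for(True)["run"]["skip_completed"] is True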
def get_job_run_uid(dag_id, execution_date):
    if isinstance(execution_date, six.string_types):
        execution_date = pendulum.parse(execution_date)
    if isinstance(execution_date, datetime.datetime):
        execution_date = (
            execution_date.replace(microsecond=0).astimezone(pytz.utc).isoformat()
        )
    return uuid.uuid5(NAMESPACE_DBND_RUN, "{}:{}".format(dag_id, execution_date))
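A self-contained sketch of the uuid5 scheme above. NAMESPACE here is a made-up placeholder (the real code uses NAMESPACE_DBND_RUN, defined elsewhere); any fixed namespace gives the same property: the same (dag_id, execution_date) pair always yields the same run UUID, so it can be re-derived independently on both the Airflow side and the tracker side.

import uuid

NAMESPACE = uuid.UUID("12345678-1234-5678-1234-567812345678")  # placeholder namespace

def job_run_uid(dag_id, execution_date_iso):
    # uuid5 is deterministic: same namespace + name -> same UUID
    return uuid.uuid5(NAMESPACE, "{}:{}".format(dag_id, execution_date_iso))

a = job_run_uid("my_dag", "2021-01-01T00:00:00+00:00")
b = job_run_uid("my_dag", "2021-01-01T00:00:00+00:00")
assert a == b  # stable across processes and machines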
def parse_from_str(self, s):
    """
    Parses a string to a :py:class:`~datetime.datetime`.
    """
    try:
        v = datetime.datetime.strptime(s, self.date_format)
        return v.replace(tzinfo=pytz.UTC)
    except ValueError:
        return pendulum.parse(s, tz=pytz.UTC)
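A runnable sketch of the same two-stage parse outside the class: a strict strptime pass first, with pendulum as the fallback for anything the configured format rejects. DATE_FORMAT is an assumed example value for self.date_format.

import datetime

import pendulum
import pytz

DATE_FORMAT = "%Y-%m-%d"  # assumed; the real value comes from the parameter definition

def parse_from_str(s):
    try:
        # strict path: the configured format, normalized to UTC
        return datetime.datetime.strptime(s, DATE_FORMAT).replace(tzinfo=pytz.UTC)
    except ValueError:
        # fallback: let pendulum handle ISO 8601 and other input shapes
        return pendulum.parse(s, tz=pytz.UTC)

print(parse_from_str("2021-01-01"))           # matches DATE_FORMAT -> strptime path
print(parse_from_str("2021-01-01T12:30:00"))  # falls back to pendulum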
def get_dag(self, dag_id, from_file_only=True, execution_date=None, session=None):
    """
    :param dag_id:
    :param execution_date: if provided, we'll try to find a specific version of the dag (using pickle)
    :param session:
    :return:
    """
    from flask import has_request_context, request, session as flask_session

    # All legacy Airflow code works with dag_id only; some calls also don't
    # pass an execution_date through.
    if has_request_context():
        execution_date = execution_date or request.args.get("execution_date")

        # trick to store the last execution date used, for the next flask call
        if execution_date:
            logger.debug(
                "Execution date saved to session: %s, %s", dag_id, execution_date
            )
            flask_session["ed_" + dag_id] = execution_date
        else:
            logger.debug("Execution date from previous session: %s", dag_id)
            execution_date = flask_session.get("ed_" + dag_id)

    if execution_date and execution_date != "undefined":
        # we are going to return the most "active" dag
        dttm = pendulum.parse(execution_date)
        dag = self._get_pickled_dag_from_dagrun(
            dag_id=dag_id, execution_date=dttm, session=session
        )
        if dag:
            return dag

    # we don't have a specific dag/execution date, so try the in-memory version
    dag = super(DbndAirflowDagBag, self).get_dag(dag_id, from_file_only=from_file_only)
    if dag:
        return dag

    # let's try to find the latest version in the DB
    latest_execution = (
        session.query(TaskInstance.execution_date)
        .filter(TaskInstance.task_id == dag_id)
        .order_by(TaskInstance.execution_date.desc())
        .first()
    )
    if latest_execution:
        return self._get_pickled_dag_from_dagrun(
            dag_id=dag_id,
            execution_date=latest_execution.execution_date,
            session=session,
        )

    return None
def get_job_run_uid(airflow_instance_uid, dag_id, execution_date):
    # TODO_CORE: change to source_instance_uid
    if isinstance(execution_date, six.string_types):
        execution_date = pendulum.parse(execution_date)
    if isinstance(execution_date, datetime.datetime):
        # Temporary fix for existing databases with uids without microseconds
        algo_threshold = config.get("webserver", "run_uid_execution_date_threshold")
        if algo_threshold and execution_date <= pendulum.parse(algo_threshold):
            execution_date = execution_date.replace(microsecond=0)
        execution_date = execution_date.astimezone(pytz.utc).isoformat()
    if airflow_instance_uid is None:
        return uuid.uuid5(NAMESPACE_DBND_RUN, "{}:{}".format(dag_id, execution_date))
    else:
        return uuid.uuid5(
            NAMESPACE_DBND_RUN,
            "{}:{}:{}".format(airflow_instance_uid, dag_id, execution_date),
        )
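A hedged sketch of the backward-compatibility rule above: execution dates at or before the configured threshold lose their microseconds, so UUIDs computed for rows created by the old (microsecond-free) algorithm still match. The threshold value here is invented for illustration.

import pendulum

ALGO_THRESHOLD = pendulum.parse("2021-06-01T00:00:00+00:00")  # illustrative value only

def normalize_execution_date(execution_date):
    # old rows were stored without microseconds; mirror that for old dates
    if execution_date <= ALGO_THRESHOLD:
        execution_date = execution_date.replace(microsecond=0)
    return execution_date.isoformat()

print(normalize_execution_date(pendulum.parse("2021-01-01T00:00:00.123456+00:00")))  # dropped
print(normalize_execution_date(pendulum.parse("2021-07-01T00:00:00.123456+00:00")))  # kept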
def parse(string):
    """
    Parse a time string and return an aware datetime.

    :param string: time string
    """
    return pendulum.parse(string, tz=TIMEZONE)
def __init__(self, af_context):
    # type: (AirflowTaskContext) -> None
    self.run_uid = get_job_run_uid(
        dag_id=af_context.root_dag_id, execution_date=af_context.execution_date
    )
    self.dag_id = af_context.dag_id
    # this is the real operator uid; we need to connect our "tracked" task to it,
    # so the moment the monitor is on -> we can sync
    af_runtime_op_task_id = af_context.task_id
    self.af_operator_sync__task_run_uid = get_task_run_uid(
        self.run_uid, af_context.dag_id, af_runtime_op_task_id
    )

    # 1. create a proper DatabandContext so we can create other objects
    set_tracking_config_overide(
        use_dbnd_log=override_airflow_log_system_for_tracking()
    )
    # create databand context
    with new_dbnd_context(name="airflow") as dc:  # type: DatabandContext
        # Now create an "operator" task for the current task_id.
        # We can't actually run it; we don't even know when it's going to finish.
        # The current execution is inside the operator: that's the only thing we know.

        # STATE AFTER INIT:
        # AirflowOperator__runtime -> DAG__runtime
        task_target_date = pendulum.parse(
            af_context.execution_date, tz=pytz.UTC
        ).date()

        # AIRFLOW OPERATOR RUNTIME
        af_runtime_op = AirflowOperatorRuntimeTask(
            task_family=task_name_for_runtime(af_runtime_op_task_id),
            dag_id=af_context.dag_id,
            execution_date=af_context.execution_date,
            task_target_date=task_target_date,
            task_version="%s:%s" % (af_runtime_op_task_id, af_context.execution_date),
        )

        # this is the real operator uid; we need to connect our "tracked" task to it,
        # so the moment the monitor is on -> we can sync
        af_db_op_task_run_uid = get_task_run_uid(
            self.run_uid, af_context.dag_id, af_runtime_op_task_id
        )
        af_runtime_op.task_meta.extra_parents_task_run_uids.add(af_db_op_task_run_uid)
        af_runtime_op.ctrl.force_task_run_uid = TaskRunUidGen_TaskAfId(
            af_context.dag_id
        )
        self.af_operator_runtime__task = af_runtime_op

        # AIRFLOW DAG RUNTIME
        self.af_dag_runtime__task = AirflowDagRuntimeTask(
            task_name=task_name_for_runtime(DAG_SPECIAL_TASK_ID),
            dag_id=af_context.root_dag_id,  # <- ROOT DAG!
            execution_date=af_context.execution_date,
            task_target_date=task_target_date,
        )
        _add_child(self.af_dag_runtime__task, self.af_operator_runtime__task)

        # This will create a databand run with driver and root tasks.
        # We need the "root" task to be the same between different Airflow task
        # invocations, since in dbnd we must have a single root task, so we
        # create a "dummy" task with the dag_id as its name.

        # create databand run
        # we will want to preserve
        with new_databand_run(
            context=dc,
            task_or_task_name=self.af_dag_runtime__task,
            run_uid=self.run_uid,
            existing_run=False,
            job_name=af_context.root_dag_id,
            send_heartbeat=False,  # we don't send heartbeat in tracking
            source=UpdateSource.airflow_tracking,
        ) as dr:
            self.dr = dr
            dr._init_without_run()
            self.airflow_operator__task_run = dr.get_task_run_by_id(
                af_runtime_op.task_id
            )
def do_fetching_iteration(
    airflow_config, airflow_instance_detail, api_client, tracking_store
):
    """
    Fetch from Airflow webserver, return number of items fetched
    """
    try:
        log_fetching_parameters(
            airflow_instance_detail.url, airflow_instance_detail.since, airflow_config
        )
        data = airflow_instance_detail.data_fetcher.get_data(
            airflow_instance_detail.since,
            airflow_config.include_logs,
            airflow_config.include_task_args,
            airflow_config.include_xcom,
            airflow_config.dag_ids,
            airflow_config.fetch_quantity,
            None,
        )
    except JSONDecodeError:
        logger.exception("Could not decode the received data, error in json format.")
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
    except (ConnectionError, OSError, IOError) as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0

    if data is None:
        logger.warning("Didn't receive any data")
        return 0

    if "error" in data:
        logger.error("Error in Airflow Export Plugin: \n%s", data["error"])
        _save_error_message(
            airflow_instance_detail, data["error"], api_client, airflow_config
        )
        return 0

    try:
        _log_received_tasks(airflow_instance_detail.url, data)
        _send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        active_dags = {
            dag["dag_id"]: [task["task_id"] for task in dag["tasks"]]
            for dag in export_data.dags
            if dag.get("is_active", True)
        }
        logger.info("Got %d active DAGs", len(active_dags))

        airflow_instance_detail.update_airflow_server(
            airflow_version=export_data.airflow_version,
            dags_path=export_data.dags_path,
            logs_path=export_data.logs_path,
            airflow_export_version=export_data.airflow_export_version,
            synced_from=airflow_instance_detail.airflow_server_info.synced_from
            or airflow_instance_detail.since,
            active_dags=active_dags,
        )

        task_instances_end_dates = [
            pendulum.parse(str(t["end_date"]))
            for t in export_data.task_instances
            if t["end_date"] is not None
        ]
        dag_runs_end_dates = [
            pendulum.parse(str(dr["end_date"]))
            for dr in export_data.dag_runs
            if dr["end_date"] is not None
        ]
        logger.info(
            "Got %d task end dates, the last is %s and got %d dag run end dates, the last is %s",
            len(task_instances_end_dates),
            max(task_instances_end_dates) if task_instances_end_dates else None,
            len(dag_runs_end_dates),
            max(dag_runs_end_dates) if dag_runs_end_dates else None,
        )

        end_dates = (
            task_instances_end_dates if task_instances_end_dates else dag_runs_end_dates
        )
        airflow_instance_detail.airflow_server_info.synced_to = (
            max(end_dates) if end_dates else utcnow()
        )
        logger.info(
            "Using last end date %s, New synced_to date is %s",
            max(end_dates) if end_dates else None,
            airflow_instance_detail.airflow_server_info.synced_to,
        )

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logging.info(
            "Sending airflow server info: url={}, synced_from={}, synced_to={}, last_sync_time={}".format(
                airflow_instance_detail.airflow_server_info.base_url,
                airflow_instance_detail.airflow_server_info.synced_from,
                airflow_instance_detail.airflow_server_info.synced_to,
                airflow_instance_detail.airflow_server_info.last_sync_time,
            )
        )
        save_airflow_server_info(
            airflow_instance_detail.airflow_server_info, api_client
        )

        # If synced_to was set to utcnow(), keep since as it was
        if end_dates:
            logger.info(
                "Updating since, old value: {}, new value: {}".format(
                    airflow_instance_detail.since,
                    airflow_instance_detail.airflow_server_info.synced_to,
                )
            )
            airflow_instance_detail.since = (
                airflow_instance_detail.airflow_server_info.synced_to
            )
        else:
            logger.info(
                "Keeping since as it was {}".format(airflow_instance_detail.since)
            )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server".format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            )
        )

        total_fetched = max(len(task_instances_end_dates), len(dag_runs_end_dates))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _dump_unsent_data(data)
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
def do_fetching_iteration(
    airflow_config, airflow_instance_detail, api_client, tracking_store
):
    """
    Fetch from Airflow webserver, return number of items fetched
    """
    data = try_fetching_from_airflow(
        airflow_instance_detail,
        airflow_config,
        airflow_instance_detail.since,
        None,
        api_client,
        False,
    )

    if not validate_airflow_monitor_data(
        data, airflow_instance_detail, airflow_config, api_client
    ):
        return 0

    try:
        log_received_tasks(airflow_instance_detail.url, data)
        send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        active_dags = {
            dag["dag_id"]: [task["task_id"] for task in dag["tasks"]]
            for dag in export_data.dags
            if dag.get("is_active", True)
        }
        logger.info("Got %d active DAGs", len(active_dags))

        airflow_instance_detail.update_airflow_server(
            airflow_version=export_data.airflow_version,
            dags_path=export_data.dags_path,
            logs_path=export_data.logs_path,
            airflow_export_version=export_data.airflow_export_version,
            synced_from=airflow_instance_detail.airflow_server_info.synced_from
            or airflow_instance_detail.since,
            active_dags=active_dags,
        )

        task_instances_end_dates = [
            pendulum.parse(str(t["end_date"]))
            for t in export_data.task_instances
            if t["end_date"] is not None
        ]
        dag_runs_end_dates = [
            pendulum.parse(str(dr["end_date"]))
            for dr in export_data.dag_runs
            if dr["end_date"] is not None
        ]
        logger.info(
            "Got %d task end dates, the last is %s and got %d dag run end dates, the last is %s",
            len(task_instances_end_dates),
            max(task_instances_end_dates) if task_instances_end_dates else None,
            len(dag_runs_end_dates),
            max(dag_runs_end_dates) if dag_runs_end_dates else None,
        )

        end_dates = (
            task_instances_end_dates if task_instances_end_dates else dag_runs_end_dates
        )
        airflow_instance_detail.airflow_server_info.synced_to = (
            max(end_dates) if end_dates else utcnow()
        )
        logger.info(
            "Using last end date %s, New synced_to date is %s",
            max(end_dates) if end_dates else None,
            airflow_instance_detail.airflow_server_info.synced_to,
        )

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logging.info(
            "Sending airflow server info: url={}, synced_from={}, synced_to={}, last_sync_time={}".format(
                airflow_instance_detail.airflow_server_info.base_url,
                airflow_instance_detail.airflow_server_info.synced_from,
                airflow_instance_detail.airflow_server_info.synced_to,
                airflow_instance_detail.airflow_server_info.last_sync_time,
            )
        )
        save_airflow_server_info(airflow_instance_detail.airflow_server_info, api_client)

        # If synced_to was set to utcnow(), keep since as it was
        if end_dates:
            logger.info(
                "Updating since, old value: {}, new value: {}".format(
                    airflow_instance_detail.since,
                    airflow_instance_detail.airflow_server_info.synced_to,
                )
            )
            airflow_instance_detail.since = (
                airflow_instance_detail.airflow_server_info.synced_to
            )
        else:
            logger.info(
                "Keeping since as it was {}".format(airflow_instance_detail.since)
            )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server".format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            )
        )

        total_fetched = max(len(task_instances_end_dates), len(dag_runs_end_dates))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        dump_unsent_data(data)
        send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
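A minimal sketch (pure pendulum; the helper name is hypothetical) of the bookkeeping both do_fetching_iteration variants share: synced_to moves to the newest observed end date, or falls back to "now" when nothing finished, while since only advances in the first case, so a utcnow() fallback never skips unfinished work on the next iteration.

import pendulum

def advance_sync_window(since, end_dates):
    # hypothetical helper; mirrors the synced_to / since logic above
    synced_to = max(end_dates) if end_dates else pendulum.now("UTC")
    new_since = synced_to if end_dates else since  # only advance on real end dates
    return synced_to, new_since

since = pendulum.parse("2021-01-01T00:00:00+00:00")
_, since = advance_sync_window(since, [pendulum.parse("2021-01-02T00:00:00+00:00")])
print(since)  # 2021-01-02: advanced to the newest end date

_, since = advance_sync_window(since, [])
print(since)  # unchanged: nothing finished, so re-fetch from the same point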
def parse_datetime(value, default=None):
    return pendulum.parse(value) if value else default