def do_incomplete_data_fetching_iteration(
    airflow_config,
    airflow_instance_detail,
    api_client,
    tracking_store,
    incomplete_offset,
):
    """
    Fetch incomplete data from Airflow web server, return number of items fetched
    """
    # Max time to look for incomplete data, we do not update this but use pagination instead
    since = utcnow() - timedelta(days=airflow_config.oldest_incomplete_data_in_days)

    data = try_fetching_from_airflow(
        airflow_instance_detail,
        airflow_config,
        since,
        incomplete_offset,
        api_client,
        False,
    )

    if not validate_airflow_monitor_data(
        data, airflow_instance_detail, airflow_config, api_client
    ):
        return 0

    try:
        log_received_tasks(airflow_instance_detail.url, data)
        send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server".format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            )
        )

        total_fetched = max(len(data["task_instances"]), len(data["dag_runs"]))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        dump_unsent_data(data)
        send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
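
# A minimal sketch of how a caller might page through incomplete data with
# do_incomplete_data_fetching_iteration. The `since` window above is fixed, so
# pagination is driven purely by incomplete_offset. This driver loop (and the
# fetch_quantity stop condition) is an assumption for illustration, not code
# from this module.
def _sync_incomplete_data_sketch(
    airflow_config, airflow_instance_detail, api_client, tracking_store
):  # hypothetical driver, not part of the monitor
    incomplete_offset = 0
    while True:
        fetched = do_incomplete_data_fetching_iteration(
            airflow_config,
            airflow_instance_detail,
            api_client,
            tracking_store,
            incomplete_offset,
        )
        # A short (or empty) page means the incomplete-data window is exhausted.
        if fetched < airflow_config.fetch_quantity:
            break
        incomplete_offset += fetched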
# Deserialization hooks (methods of a schema class not shown in this excerpt):
# each one re-shapes a loaded payload into an attribute-access dict.
def make_run_info(self, data, **kwargs):
    return _as_dotted_dict(**data)

def make_object(self, data, **kwargs):
    return _as_dotted_dict(**data)

def make_task_definition_param(self, data, **kwargs):
    return _as_dotted_dict(**data)
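
# The hooks above all funnel their payloads through _as_dotted_dict, which
# turns a plain dict into an object with attribute access (e.g.
# export_data.task_instances). Its real implementation is not shown in this
# excerpt; a minimal sketch of the assumed behavior:
def _as_dotted_dict_sketch(**data):  # hypothetical stand-in for _as_dotted_dict
    class DottedDict(dict):
        """Dict whose keys are also readable as attributes."""

        def __getattr__(self, name):
            try:
                return self[name]
            except KeyError:
                raise AttributeError(name)

    return DottedDict(**data)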
def do_fetching_iteration(
    airflow_config, airflow_instance_detail, api_client, tracking_store
):
    """
    Fetch from Airflow webserver, return number of items fetched
    """
    try:
        log_fetching_parameters(
            airflow_instance_detail.url, airflow_instance_detail.since, airflow_config
        )

        data = airflow_instance_detail.data_fetcher.get_data(
            airflow_instance_detail.since,
            airflow_config.include_logs,
            airflow_config.include_task_args,
            airflow_config.include_xcom,
            airflow_config.dag_ids,
            airflow_config.fetch_quantity,
            None,
        )
    except JSONDecodeError:
        logger.exception("Could not decode the received data, error in json format.")
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
    except (ConnectionError, OSError, IOError) as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0

    if data is None:
        logger.warning("Didn't receive any data")
        return 0

    if "error" in data:
        logger.error("Error in Airflow Export Plugin: \n%s", data["error"])
        _save_error_message(
            airflow_instance_detail, data["error"], api_client, airflow_config
        )
        return 0

    try:
        _log_received_tasks(airflow_instance_detail.url, data)
        _send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        active_dags = {
            dag["dag_id"]: [task["task_id"] for task in dag["tasks"]]
            for dag in export_data.dags
            if dag.get("is_active", True)
        }
        logger.info("Got %d active DAGs", len(active_dags))

        airflow_instance_detail.update_airflow_server(
            airflow_version=export_data.airflow_version,
            dags_path=export_data.dags_path,
            logs_path=export_data.logs_path,
            airflow_export_version=export_data.airflow_export_version,
            synced_from=airflow_instance_detail.airflow_server_info.synced_from
            or airflow_instance_detail.since,
            active_dags=active_dags,
        )

        task_instances_end_dates = [
            pendulum.parse(str(t["end_date"]))
            for t in export_data.task_instances
            if t["end_date"] is not None
        ]

        dag_runs_end_dates = [
            pendulum.parse(str(dr["end_date"]))
            for dr in export_data.dag_runs
            if dr["end_date"] is not None
        ]
        logger.info(
            "Got %d task end dates, the last is %s and got %d dag run end dates, the last is %s",
            len(task_instances_end_dates),
            max(task_instances_end_dates) if task_instances_end_dates else None,
            len(dag_runs_end_dates),
            max(dag_runs_end_dates) if dag_runs_end_dates else None,
        )

        end_dates = (
            task_instances_end_dates if task_instances_end_dates else dag_runs_end_dates
        )
        airflow_instance_detail.airflow_server_info.synced_to = (
            max(end_dates) if end_dates else utcnow()
        )
        logger.info(
            "Using last end date %s, new synced_to date is %s",
            max(end_dates) if end_dates else None,
            airflow_instance_detail.airflow_server_info.synced_to,
        )

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logger.info(
            "Sending airflow server info: url={}, synced_from={}, synced_to={}, last_sync_time={}".format(
                airflow_instance_detail.airflow_server_info.base_url,
                airflow_instance_detail.airflow_server_info.synced_from,
                airflow_instance_detail.airflow_server_info.synced_to,
                airflow_instance_detail.airflow_server_info.last_sync_time,
            )
        )
        save_airflow_server_info(
            airflow_instance_detail.airflow_server_info, api_client
        )

        # If synced_to was set to utcnow(), keep since as it was
        if end_dates:
            logger.info(
                "Updating since, old value: {}, new value: {}".format(
                    airflow_instance_detail.since,
                    airflow_instance_detail.airflow_server_info.synced_to,
                )
            )
            airflow_instance_detail.since = (
                airflow_instance_detail.airflow_server_info.synced_to
            )
        else:
            logger.info(
                "Keeping since as it was {}".format(airflow_instance_detail.since)
            )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server".format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            )
        )

        total_fetched = max(len(task_instances_end_dates), len(dag_runs_end_dates))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _dump_unsent_data(data)
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
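
# A condensed, hypothetical restatement of the watermark logic above, useful
# for reasoning about it in isolation. The name and signature are assumptions;
# the real logic is inlined in do_fetching_iteration.
def _next_since_sketch(task_end_dates, dag_run_end_dates, current_since, now):
    # task-instance end dates win; dag-run end dates are the fallback
    end_dates = task_end_dates if task_end_dates else dag_run_end_dates
    # with no end dates at all, synced_to falls back to "now"...
    synced_to = max(end_dates) if end_dates else now
    # ...but `since` only advances when real end dates were observed, so the
    # next iteration re-fetches the same window instead of skipping data
    new_since = synced_to if end_dates else current_since
    return synced_to, new_since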
def do_incomplete_data_fetching_iteration(
    airflow_config,
    airflow_instance_detail,
    api_client,
    tracking_store,
    incomplete_offset,
):
    """
    Fetch incomplete data from Airflow web server, return number of items fetched
    """
    # Max time to look for incomplete data, we do not update this but use pagination instead
    since = utcnow() - timedelta(days=airflow_config.oldest_incomplete_data_in_days)

    try:
        log_fetching_parameters(
            airflow_instance_detail.url, since, airflow_config, incomplete_offset
        )

        data = airflow_instance_detail.data_fetcher.get_data(
            since,
            airflow_config.include_logs,
            airflow_config.include_task_args,
            airflow_config.include_xcom,
            airflow_config.dag_ids,
            airflow_config.fetch_quantity,
            incomplete_offset,
        )
    except JSONDecodeError:
        logger.exception("Could not decode the received data, error in json format.")
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0

    if data is None:
        logger.warning("Didn't receive any incomplete data")
        return 0

    if "error" in data:
        logger.error("Error in Airflow Export Plugin: \n%s", data["error"])
        _save_error_message(
            airflow_instance_detail, data["error"], api_client, airflow_config
        )
        return 0

    try:
        _log_received_tasks(airflow_instance_detail.url, data)
        _send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server".format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            )
        )

        total_fetched = max(len(data["task_instances"]), len(data["dag_runs"]))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _dump_unsent_data(data)
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
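
# _dump_unsent_data above is the last-resort path taken when pushing to the
# databand web server fails after the data was already fetched. Its definition
# is not part of this excerpt; a minimal sketch of the assumed behavior:
def _dump_unsent_data_sketch(data):  # hypothetical stand-in for _dump_unsent_data
    import json
    import tempfile

    # persist the payload locally so a failed sync can be inspected or replayed
    with tempfile.NamedTemporaryFile(
        mode="w", prefix="airflow_monitor_unsent_", suffix=".json", delete=False
    ) as f:
        json.dump(data, f)
        logger.info("Dumped unsent data to %s", f.name)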
def do_fetching_iteration(
    airflow_config, airflow_instance_detail, api_client, tracking_store
):
    """
    Fetch from Airflow webserver, return number of items fetched
    """
    data = try_fetching_from_airflow(
        airflow_instance_detail,
        airflow_config,
        airflow_instance_detail.since,
        None,
        api_client,
        False,
    )

    if not validate_airflow_monitor_data(
        data, airflow_instance_detail, airflow_config, api_client
    ):
        return 0

    try:
        log_received_tasks(airflow_instance_detail.url, data)
        send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        active_dags = {
            dag["dag_id"]: [task["task_id"] for task in dag["tasks"]]
            for dag in export_data.dags
            if dag.get("is_active", True)
        }
        logger.info("Got %d active DAGs", len(active_dags))

        airflow_instance_detail.update_airflow_server(
            airflow_version=export_data.airflow_version,
            dags_path=export_data.dags_path,
            logs_path=export_data.logs_path,
            airflow_export_version=export_data.airflow_export_version,
            synced_from=airflow_instance_detail.airflow_server_info.synced_from
            or airflow_instance_detail.since,
            active_dags=active_dags,
        )

        task_instances_end_dates = [
            pendulum.parse(str(t["end_date"]))
            for t in export_data.task_instances
            if t["end_date"] is not None
        ]

        dag_runs_end_dates = [
            pendulum.parse(str(dr["end_date"]))
            for dr in export_data.dag_runs
            if dr["end_date"] is not None
        ]
        logger.info(
            "Got %d task end dates, the last is %s and got %d dag run end dates, the last is %s",
            len(task_instances_end_dates),
            max(task_instances_end_dates) if task_instances_end_dates else None,
            len(dag_runs_end_dates),
            max(dag_runs_end_dates) if dag_runs_end_dates else None,
        )

        end_dates = (
            task_instances_end_dates if task_instances_end_dates else dag_runs_end_dates
        )
        airflow_instance_detail.airflow_server_info.synced_to = (
            max(end_dates) if end_dates else utcnow()
        )
        logger.info(
            "Using last end date %s, new synced_to date is %s",
            max(end_dates) if end_dates else None,
            airflow_instance_detail.airflow_server_info.synced_to,
        )

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logger.info(
            "Sending airflow server info: url={}, synced_from={}, synced_to={}, last_sync_time={}".format(
                airflow_instance_detail.airflow_server_info.base_url,
                airflow_instance_detail.airflow_server_info.synced_from,
                airflow_instance_detail.airflow_server_info.synced_to,
                airflow_instance_detail.airflow_server_info.last_sync_time,
            )
        )
        save_airflow_server_info(
            airflow_instance_detail.airflow_server_info, api_client
        )

        # If synced_to was set to utcnow(), keep since as it was
        if end_dates:
            logger.info(
                "Updating since, old value: {}, new value: {}".format(
                    airflow_instance_detail.since,
                    airflow_instance_detail.airflow_server_info.synced_to,
                )
            )
            airflow_instance_detail.since = (
                airflow_instance_detail.airflow_server_info.synced_to
            )
        else:
            logger.info(
                "Keeping since as it was {}".format(airflow_instance_detail.since)
            )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server".format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            )
        )

        total_fetched = max(len(task_instances_end_dates), len(dag_runs_end_dates))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        dump_unsent_data(data)
        send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
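
# How the two iterations above might be combined into one monitor pass: first
# advance the `since` watermark over completed data while full pages come
# back, then sweep the fixed incomplete-data window page by page (reusing the
# _sync_incomplete_data_sketch helper defined earlier). This driver is an
# assumption for illustration; the real scheduling loop lives elsewhere in
# the monitor.
def _sync_pass_sketch(
    airflow_config, airflow_instance_detail, api_client, tracking_store
):  # hypothetical driver, not part of the monitor
    while (
        do_fetching_iteration(
            airflow_config, airflow_instance_detail, api_client, tracking_store
        )
        >= airflow_config.fetch_quantity
    ):
        pass  # keep fetching while full pages come back

    _sync_incomplete_data_sketch(
        airflow_config, airflow_instance_detail, api_client, tracking_store
    )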