Example #1
def do_incomplete_data_fetching_iteration(
    airflow_config,
    airflow_instance_detail,
    api_client,
    tracking_store,
    incomplete_offset,
):
    """
    Fetch incomplete data from Airflow web server, return number of items fetched
    """
    # Max time to look for incomplete data, we do not update this but use pagination instead
    since = utcnow() - timedelta(
        days=airflow_config.oldest_incomplete_data_in_days)

    data = try_fetching_from_airflow(
        airflow_instance_detail,
        airflow_config,
        since,
        incomplete_offset,
        api_client,
        False,
    )

    if not validate_airflow_monitor_data(data, airflow_instance_detail,
                                         airflow_config, api_client):
        return 0

    try:
        log_received_tasks(airflow_instance_detail.url, data)
        send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server"
            .format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            ))

        total_fetched = max(len(data["task_instances"]), len(data["dag_runs"]))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        dump_unsent_data(data)
        send_exception_info(airflow_instance_detail, api_client,
                            airflow_config)
        return 0
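The `incomplete_offset` parameter and the fetched-item count returned by this function suggest a pagination loop on the caller's side (the in-code comment notes that `since` stays fixed and pagination does the work). The sketch below is a hedged illustration of such a driver loop; the function name sync_incomplete_data and the stop condition are assumptions for this example, not part of the original code.

def sync_incomplete_data(
    airflow_config, airflow_instance_detail, api_client, tracking_store
):
    # Hypothetical driver loop: advance the offset by the number of items
    # fetched in each iteration and stop once an iteration returns nothing.
    incomplete_offset = 0
    while True:
        fetched = do_incomplete_data_fetching_iteration(
            airflow_config,
            airflow_instance_detail,
            api_client,
            tracking_store,
            incomplete_offset,
        )
        if fetched == 0:
            break
        incomplete_offset += fetched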
Example #2
def make_run_info(self, data, **kwargs):
    return _as_dotted_dict(**data)
Example #3
def make_object(self, data, **kwargs):
    return _as_dotted_dict(**data)
Example #4
def make_task_definition_param(self, data, **kwargs):
    return _as_dotted_dict(**data)
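Across these examples, `_as_dotted_dict(**data)` is only ever used to turn the exported dictionary into an object whose keys can be read as attributes (export_data.task_instances, export_data.dag_runs, export_data.dags, and so on). Its actual implementation is not shown here; the following is a minimal sketch of an attribute-access wrapper that would satisfy that usage, with the class name DottedDict chosen purely for illustration.

# Illustrative approximation only: a minimal attribute-access dict that
# matches how _as_dotted_dict(**data) is used in the examples above.
class DottedDict(dict):
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)


def _as_dotted_dict(**kwargs):
    # Wrap the keyword arguments so callers can use both dict access
    # (data["dags"]) and attribute access (export_data.dags).
    return DottedDict(**kwargs)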
Example #5
def do_fetching_iteration(
    airflow_config, airflow_instance_detail, api_client, tracking_store
):
    """
    Fetch from Airflow webserver, return number of items fetched
    """
    try:
        log_fetching_parameters(
            airflow_instance_detail.url, airflow_instance_detail.since, airflow_config
        )

        data = airflow_instance_detail.data_fetcher.get_data(
            airflow_instance_detail.since,
            airflow_config.include_logs,
            airflow_config.include_task_args,
            airflow_config.include_xcom,
            airflow_config.dag_ids,
            airflow_config.fetch_quantity,
            None,
        )
    except JSONDecodeError:
        logger.exception("Could not decode the received data, error in json format.")
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
    except (ConnectionError, OSError, IOError) as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0

    if data is None:
        logger.warning("Didn't receive any data")
        return 0

    if "error" in data:
        logger.error("Error in Airflow Export Plugin: \n%s", data["error"])
        _save_error_message(
            airflow_instance_detail, data["error"], api_client, airflow_config
        )
        return 0

    try:
        _log_received_tasks(airflow_instance_detail.url, data)
        _send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        active_dags = {
            dag["dag_id"]: [task["task_id"] for task in dag["tasks"]]
            for dag in export_data.dags
            if dag.get("is_active", True)
        }
        logger.info("Got %d active DAGs", len(active_dags))

        airflow_instance_detail.update_airflow_server(
            airflow_version=export_data.airflow_version,
            dags_path=export_data.dags_path,
            logs_path=export_data.logs_path,
            airflow_export_version=export_data.airflow_export_version,
            synced_from=airflow_instance_detail.airflow_server_info.synced_from
            or airflow_instance_detail.since,
            active_dags=active_dags,
        )

        task_instances_end_dates = [
            pendulum.parse(str(t["end_date"]))
            for t in export_data.task_instances
            if t["end_date"] is not None
        ]

        dag_runs_end_dates = [
            pendulum.parse(str(dr["end_date"]))
            for dr in export_data.dag_runs
            if dr["end_date"] is not None
        ]
        logger.info(
            "Got %d task end dates, the last is %s and got %d dag run end dates, the last is %s",
            len(task_instances_end_dates),
            max(task_instances_end_dates) if task_instances_end_dates else None,
            len(dag_runs_end_dates),
            max(dag_runs_end_dates) if dag_runs_end_dates else None,
        )

        end_dates = (
            task_instances_end_dates if task_instances_end_dates else dag_runs_end_dates
        )

        airflow_instance_detail.airflow_server_info.synced_to = (
            max(end_dates) if end_dates else utcnow()
        )
        logger.info(
            "Using last end date %s, New synced_to date is %s",
            max(end_dates) if end_dates else None,
            airflow_instance_detail.airflow_server_info.synced_to,
        )

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logger.info(
            "Sending airflow server info: url={}, synced_from={}, synced_to={}, last_sync_time={}".format(
                airflow_instance_detail.airflow_server_info.base_url,
                airflow_instance_detail.airflow_server_info.synced_from,
                airflow_instance_detail.airflow_server_info.synced_to,
                airflow_instance_detail.airflow_server_info.last_sync_time,
            )
        )
        save_airflow_server_info(
            airflow_instance_detail.airflow_server_info, api_client
        )

        # If synced_to was set to utcnow(), keep since as it was
        if end_dates:
            logger.info(
                "Updating since, old value: {}, new value: {}".format(
                    airflow_instance_detail.since,
                    airflow_instance_detail.airflow_server_info.synced_to,
                )
            )
            airflow_instance_detail.since = (
                airflow_instance_detail.airflow_server_info.synced_to
            )
        else:
            logger.info(
                "Keeping since as it was {}".format(airflow_instance_detail.since)
            )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server".format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            )
        )

        total_fetched = max(len(task_instances_end_dates), len(dag_runs_end_dates))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _dump_unsent_data(data)
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
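The bookkeeping in the middle of this example reduces to a simple rule: synced_to advances to the newest end date seen (task-instance end dates are preferred over dag-run end dates), falls back to utcnow() when nothing has an end date yet, and since only moves forward when real end dates were observed. The small function below restates that rule in isolation; it is an illustrative extraction using the same utcnow() helper as the examples, not a helper that exists in the original code.

def _next_sync_window(task_end_dates, dag_run_end_dates, previous_since):
    # Illustrative restatement of the update rule used above.
    end_dates = task_end_dates if task_end_dates else dag_run_end_dates
    synced_to = max(end_dates) if end_dates else utcnow()
    # since only advances when real end dates were seen; otherwise it is
    # kept as-is, because synced_to was merely set to "now".
    since = synced_to if end_dates else previous_since
    return synced_to, since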
Example #6
def do_incomplete_data_fetching_iteration(
    airflow_config,
    airflow_instance_detail,
    api_client,
    tracking_store,
    incomplete_offset,
):
    """
    Fetch incomplete data from Airflow web server, return number of items fetched
    """
    exception_type, exception, exception_traceback = None, None, None

    # Max time to look for incomplete data, we do not update this but use pagination instead
    since = utcnow() - timedelta(days=airflow_config.oldest_incomplete_data_in_days)

    try:
        log_fetching_parameters(
            airflow_instance_detail.url, since, airflow_config, incomplete_offset,
        )

        data = airflow_instance_detail.data_fetcher.get_data(
            since,
            airflow_config.include_logs,
            airflow_config.include_task_args,
            airflow_config.include_xcom,
            airflow_config.dag_ids,
            airflow_config.fetch_quantity,
            incomplete_offset,
        )
    except JSONDecodeError:
        logger.exception("Could not decode the received data, error in json format.")
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0

    if data is None:
        logger.warning("Didn't receive any incomplete data")
        return 0

    if "error" in data:
        logger.error("Error in Airflow Export Plugin: \n%s", data["error"])
        _save_error_message(
            airflow_instance_detail, data["error"], api_client, airflow_config
        )
        return 0

    try:
        _log_received_tasks(airflow_instance_detail.url, data)
        _send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server".format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            )
        )

        total_fetched = max(len(data["task_instances"]), len(data["dag_runs"]))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _dump_unsent_data(data)
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
Example #7
def do_fetching_iteration(airflow_config, airflow_instance_detail, api_client,
                          tracking_store):
    """
    Fetch from Airflow webserver, return number of items fetched
    """
    data = try_fetching_from_airflow(
        airflow_instance_detail,
        airflow_config,
        airflow_instance_detail.since,
        None,
        api_client,
        False,
    )

    if not validate_airflow_monitor_data(data, airflow_instance_detail,
                                         airflow_config, api_client):
        return 0

    try:
        log_received_tasks(airflow_instance_detail.url, data)
        send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        active_dags = {
            dag["dag_id"]: [task["task_id"] for task in dag["tasks"]]
            for dag in export_data.dags if dag.get("is_active", True)
        }
        logger.info("Got %d active DAGs", len(active_dags))

        airflow_instance_detail.update_airflow_server(
            airflow_version=export_data.airflow_version,
            dags_path=export_data.dags_path,
            logs_path=export_data.logs_path,
            airflow_export_version=export_data.airflow_export_version,
            synced_from=airflow_instance_detail.airflow_server_info.synced_from
            or airflow_instance_detail.since,
            active_dags=active_dags,
        )

        task_instances_end_dates = [
            pendulum.parse(str(t["end_date"]))
            for t in export_data.task_instances if t["end_date"] is not None
        ]

        dag_runs_end_dates = [
            pendulum.parse(str(dr["end_date"])) for dr in export_data.dag_runs
            if dr["end_date"] is not None
        ]
        logger.info(
            "Got %d task end dates, the last is %s and got %d dag run end dates, the last is %s",
            len(task_instances_end_dates),
            max(task_instances_end_dates)
            if task_instances_end_dates else None,
            len(dag_runs_end_dates),
            max(dag_runs_end_dates) if dag_runs_end_dates else None,
        )

        end_dates = (task_instances_end_dates
                     if task_instances_end_dates else dag_runs_end_dates)

        airflow_instance_detail.airflow_server_info.synced_to = (
            max(end_dates) if end_dates else utcnow())
        logger.info(
            "Using last end date %s, New synced_to date is %s",
            max(end_dates) if end_dates else None,
            airflow_instance_detail.airflow_server_info.synced_to,
        )

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logger.info(
            "Sending airflow server info: url={}, synced_from={}, synced_to={}, last_sync_time={}"
            .format(
                airflow_instance_detail.airflow_server_info.base_url,
                airflow_instance_detail.airflow_server_info.synced_from,
                airflow_instance_detail.airflow_server_info.synced_to,
                airflow_instance_detail.airflow_server_info.last_sync_time,
            ))
        save_airflow_server_info(airflow_instance_detail.airflow_server_info,
                                 api_client)

        # If synced_to was set to utcnow(), keep since as it was
        if end_dates:
            logger.info("Updating since, old value: {}, new value: {}".format(
                airflow_instance_detail.since,
                airflow_instance_detail.airflow_server_info.synced_to,
            ))
            airflow_instance_detail.since = (
                airflow_instance_detail.airflow_server_info.synced_to)
        else:
            logger.info("Keeping since as it was {}".format(
                airflow_instance_detail.since))

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server"
            .format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            ))

        total_fetched = max(len(task_instances_end_dates),
                            len(dag_runs_end_dates))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        dump_unsent_data(data)
        send_exception_info(airflow_instance_detail, api_client,
                            airflow_config)
        return 0