def calculate_since_value(since_now, since, sync_history, history_only,
                          api_client, airflow_server_info):
    if since_now:
        final_since_value = utcnow()
    elif since:
        final_since_value = pendulum.parse(since, tz=pytz.UTC)
    elif sync_history or history_only:
        final_since_value = pendulum.datetime.min
    else:
        # Default mode
        try:
            get_sync_times_from_api(api_client, airflow_server_info)
            final_since_value = airflow_server_info.synced_to
            if final_since_value:
                logger.info("Resuming sync from latest stop at: %s" %
                            (final_since_value, ))
            else:
                logger.info(
                    "Latest sync stop not found. Starting sync from the beginning"
                )
        except Exception:
            logger.info(
                "Could not locate latest sync stop. Starting Airflow Monitor syncing from the beginning."
            )
            final_since_value = pendulum.datetime.min

    return final_since_value
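
A minimal sketch of the precedence above, using only pendulum/pytz and illustrative names (no dbnd objects): an explicit "now" wins, then an explicit since string, then "from the beginning" for the history modes, and otherwise whatever the server last synced to.

import datetime
import pendulum
import pytz

BEGINNING_OF_TIME = datetime.datetime.min.replace(tzinfo=pytz.UTC)

def resolve_since(since_now, since, full_history, stored_synced_to=None):
    # hypothetical helper mirroring the precedence of calculate_since_value above
    if since_now:
        return pendulum.now("UTC")
    if since:
        return pendulum.parse(since, tz=pytz.UTC)
    if full_history:
        return BEGINNING_OF_TIME  # sync everything from the beginning
    # default mode: resume from the last stored sync point, if any
    return stored_synced_to or BEGINNING_OF_TIME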
Example #2
def _set_tracking_config_overide(airflow_context=None):
    # Create a proper DatabandContext so we can create other objects.
    # There should be no orchestration tasks.
    # However, let's disable any orchestration side effects.
    config_for_tracking = {
        "run": {
            "skip_completed": False,
            "skip_completed_on_run": False,
            "validate_task_inputs": False,
            "validate_task_outputs": False,
        },  # we don't want to "check" as script is task_version="now"
        "task": {
            "task_in_memory_outputs": True
        },  # do not save any outputs
        "core": {
            "tracker_raise_on_error": False
        },  # do not fail on tracker errors
    }
    if airflow_context:
        import pytz

        task_target_date = pendulum.parse(airflow_context.execution_date,
                                          tz=pytz.UTC).date()
        use_dbnd_log = override_airflow_log_system_for_tracking()
        if use_dbnd_log is not None:
            config_for_tracking["log"] = {"disabled": not use_dbnd_log}

        config_for_tracking["task"]["task_target_date"] = task_target_date

    return config.set_values(
        config_values=config_for_tracking,
        priority=ConfigValuePriority.OVERRIDE,
        source="dbnd_tracking_config",
    )
Example #3
def set_tracking_config_overide(airflow_context=None, use_dbnd_log=None):
    # 1. Create a proper DatabandContext so we can create other objects
    track_with_cache = config.getboolean("run", "tracking_with_cache")
    config_for_tracking = {
        "run": {
            "skip_completed": track_with_cache,
            "skip_completed_on_run": track_with_cache,
            "validate_task_inputs": track_with_cache,
            "validate_task_outputs": track_with_cache,
        },  # we don't want to "check" as script is task_version="now"
        "task": {
            "task_in_memory_outputs": not track_with_cache
        },  # do not save any outputs
        "core": {
            "tracker_raise_on_error": False
        },  # do not fail on tracker errors
    }
    if airflow_context:
        import pytz

        task_target_date = pendulum.parse(airflow_context.execution_date,
                                          tz=pytz.UTC).date()
        use_dbnd_log = override_airflow_log_system_for_tracking()
        config_for_tracking["task"]["task_target_date"] = task_target_date

    if use_dbnd_log is not None:
        config_for_tracking["log"] = {"disabled": not use_dbnd_log}
    return config.set_values(config_values=config_for_tracking,
                             override=True,
                             source="dbnd_tracking_config")
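
Both variants above push a dictionary of values as an override layer. As a rough, hypothetical illustration of what an override-priority layer means (this is not dbnd's real config API), a later layer simply shadows earlier sources when a key is looked up:

class LayeredConfig:
    """Toy config with last-writer-wins layers (illustrative only)."""

    def __init__(self):
        self._layers = []  # list of (source, {section: {key: value}})

    def set_values(self, config_values, source):
        self._layers.append((source, config_values))

    def get(self, section, key, default=None):
        # later (override) layers take precedence over earlier ones
        for _source, values in reversed(self._layers):
            if key in values.get(section, {}):
                return values[section][key]
        return default


config_toy = LayeredConfig()
config_toy.set_values({"core": {"tracker_raise_on_error": True}}, source="defaults")
config_toy.set_values({"core": {"tracker_raise_on_error": False}}, source="dbnd_tracking_config")
assert config_toy.get("core", "tracker_raise_on_error") is False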
Example #4
def get_job_run_uid(dag_id, execution_date):
    if isinstance(execution_date, six.string_types):
        execution_date = pendulum.parse(execution_date)
    if isinstance(execution_date, datetime.datetime):
        execution_date = (
            execution_date.replace(microsecond=0).astimezone(pytz.utc).isoformat()
        )
    return uuid.uuid5(NAMESPACE_DBND_RUN, "{}:{}".format(dag_id, execution_date))
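
A self-contained sketch of why the normalization above matters: dropping microseconds and converting to UTC before hashing means the same logical execution date always maps to the same run UID. The namespace below is a stand-in, not the real NAMESPACE_DBND_RUN.

import datetime
import uuid

import pendulum
import pytz

NAMESPACE_EXAMPLE = uuid.uuid5(uuid.NAMESPACE_DNS, "example.databand.run")  # stand-in namespace

def run_uid(dag_id, execution_date):
    if isinstance(execution_date, str):
        execution_date = pendulum.parse(execution_date)
    if isinstance(execution_date, datetime.datetime):
        execution_date = execution_date.replace(microsecond=0).astimezone(pytz.utc).isoformat()
    return uuid.uuid5(NAMESPACE_EXAMPLE, "{}:{}".format(dag_id, execution_date))

# the same instant, expressed two different ways, hashes to the same UUID
assert run_uid("my_dag", "2021-01-01T03:00:00+01:00") == run_uid(
    "my_dag", datetime.datetime(2021, 1, 1, 2, 0, 0, 123456, tzinfo=pytz.utc)
)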
Example #5
    def parse_from_str(self, s):
        """
        Parses a string to a :py:class:`~datetime.datetime`.
        """
        try:
            v = datetime.datetime.strptime(s, self.date_format)
            return v.replace(tzinfo=pytz.UTC)
        except ValueError:
            return pendulum.parse(s, tz=pytz.UTC)
    def get_dag(self,
                dag_id,
                from_file_only=True,
                execution_date=None,
                session=None):
        """
        :param dag_id:
        :param execution_date: if provided, we'll try to find a specific version of the dag (using pickle)
        :param session:
        :return:
        """
        from flask import has_request_context, request, session as flask_session

        # all legacy airflow code works just with dag_id; also, some calls don't pass execution_date through
        if has_request_context():
            execution_date = execution_date or request.args.get(
                "execution_date")

            # trick to store last execution date used for the next flask call
            if execution_date:
                logger.debug("Execution date saved to session: %s, %s", dag_id,
                             execution_date)
                flask_session["ed_" + dag_id] = execution_date
            else:
                logger.debug("Execution date from previous session: %s",
                             dag_id)
                execution_date = flask_session.get("ed_" + dag_id)

            if execution_date and execution_date != "undefined":
                # we are going to return the most "active" dag
                dttm = pendulum.parse(execution_date)
                dag = self._get_pickled_dag_from_dagrun(dag_id=dag_id,
                                                        execution_date=dttm,
                                                        session=session)
                if dag:
                    return dag

        # we don't have a specific dag/execution date, so try to get the in-memory version
        dag = super(DbndAirflowDagBag,
                    self).get_dag(dag_id, from_file_only=from_file_only)
        if dag:
            return dag

        # let's try to find its latest version in the DB
        latest_execution = (session.query(TaskInstance.execution_date).filter(
            TaskInstance.task_id == dag_id).order_by(
                TaskInstance.execution_date.desc()).first())

        if latest_execution:
            return self._get_pickled_dag_from_dagrun(
                dag_id=dag_id,
                execution_date=latest_execution.execution_date,
                session=session,
            )

        return None
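
The interesting part of get_dag is the small trick of remembering the last execution_date per dag between requests. A minimal sketch of that idea, with a plain dict standing in for the flask session (names are illustrative):

_last_seen = {}  # stand-in for flask_session

def remember_execution_date(dag_id, execution_date=None):
    key = "ed_" + dag_id
    if execution_date:
        # save the date so a follow-up request without it can reuse it
        _last_seen[key] = execution_date
        return execution_date
    return _last_seen.get(key)

remember_execution_date("my_dag", "2021-01-01T00:00:00+00:00")
assert remember_execution_date("my_dag") == "2021-01-01T00:00:00+00:00"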
Example #7
def get_job_run_uid(airflow_instance_uid, dag_id, execution_date):
    # TODO_CORE: change to source_instance_uid
    if isinstance(execution_date, six.string_types):
        execution_date = pendulum.parse(execution_date)
    if isinstance(execution_date, datetime.datetime):
        # Temporary fix for existing databases with uids without microseconds
        algo_threshold = config.get("webserver",
                                    "run_uid_execution_date_threshold")
        if algo_threshold and execution_date <= pendulum.parse(algo_threshold):
            execution_date = execution_date.replace(microsecond=0)
        execution_date = execution_date.astimezone(pytz.utc).isoformat()
    if airflow_instance_uid is None:
        return uuid.uuid5(NAMESPACE_DBND_RUN,
                          "{}:{}".format(dag_id, execution_date))
    else:
        return uuid.uuid5(
            NAMESPACE_DBND_RUN,
            "{}:{}:{}".format(airflow_instance_uid, dag_id, execution_date),
        )
Example #8
def parse(string):
    """
    Parse a time string and return an aware datetime
    :param string: time string
    """
    return pendulum.parse(string, tz=TIMEZONE)
    def __init__(self, af_context):
        # type: (AirflowTaskContext) -> None
        self.run_uid = get_job_run_uid(
            dag_id=af_context.root_dag_id,
            execution_date=af_context.execution_date)
        self.dag_id = af_context.dag_id
        # this is the real operator uid, we need to connect to it with our "tracked" task,
        # so the moment monitor is on -> we can sync
        af_runtime_op_task_id = af_context.task_id
        self.af_operator_sync__task_run_uid = get_task_run_uid(
            self.run_uid, af_context.dag_id, af_runtime_op_task_id)
        # 1. Create a proper DatabandContext so we can create other objects
        set_tracking_config_overide(
            use_dbnd_log=override_airflow_log_system_for_tracking())

        # create databand context
        with new_dbnd_context(name="airflow") as dc:  # type: DatabandContext

            # now create "operator" task for current task_id,
            # we can't actually run it, and we don't even know when it's going to finish;
            # the current execution is inside the operator, which is the only thing we know
            # STATE AFTER INIT:
            # AirflowOperator__runtime ->  DAG__runtime
            task_target_date = pendulum.parse(af_context.execution_date,
                                              tz=pytz.UTC).date()
            # AIRFLOW OPERATOR RUNTIME

            af_runtime_op = AirflowOperatorRuntimeTask(
                task_family=task_name_for_runtime(af_runtime_op_task_id),
                dag_id=af_context.dag_id,
                execution_date=af_context.execution_date,
                task_target_date=task_target_date,
                task_version="%s:%s" %
                (af_runtime_op_task_id, af_context.execution_date),
            )

            # this is the real operator uid, we need to connect to it with our "tracked" task,
            # so the moment monitor is on -> we can sync
            af_db_op_task_run_uid = get_task_run_uid(self.run_uid,
                                                     af_context.dag_id,
                                                     af_runtime_op_task_id)
            af_runtime_op.task_meta.extra_parents_task_run_uids.add(
                af_db_op_task_run_uid)
            af_runtime_op.ctrl.force_task_run_uid = TaskRunUidGen_TaskAfId(
                af_context.dag_id)

            self.af_operator_runtime__task = af_runtime_op
            # AIRFLOW DAG RUNTIME
            self.af_dag_runtime__task = AirflowDagRuntimeTask(
                task_name=task_name_for_runtime(DAG_SPECIAL_TASK_ID),
                dag_id=af_context.root_dag_id,  # <- ROOT DAG!
                execution_date=af_context.execution_date,
                task_target_date=task_target_date,
            )
            _add_child(self.af_dag_runtime__task,
                       self.af_operator_runtime__task)

            # this will create a databand run with driver and root tasks.
            # we need the "root" task to be the same between different airflow task invocations;
            # since in dbnd we must have a single root task, we create a "dummy" task with the dag_id name

            # create databand run
            # we will want to preserve
            with new_databand_run(
                    context=dc,
                    task_or_task_name=self.af_dag_runtime__task,
                    run_uid=self.run_uid,
                    existing_run=False,
                    job_name=af_context.root_dag_id,
                    send_heartbeat=False,  # we don't send heartbeat in tracking
                    source=UpdateSource.airflow_tracking,
            ) as dr:
                self.dr = dr
                dr._init_without_run()
                self.airflow_operator__task_run = dr.get_task_run_by_id(
                    af_runtime_op.task_id)
Example #10
def do_fetching_iteration(
    airflow_config, airflow_instance_detail, api_client, tracking_store
):
    """
    Fetch from Airflow webserver, return number of items fetched
    """
    try:
        log_fetching_parameters(
            airflow_instance_detail.url, airflow_instance_detail.since, airflow_config
        )

        data = airflow_instance_detail.data_fetcher.get_data(
            airflow_instance_detail.since,
            airflow_config.include_logs,
            airflow_config.include_task_args,
            airflow_config.include_xcom,
            airflow_config.dag_ids,
            airflow_config.fetch_quantity,
            None,
        )
    except JSONDecodeError:
        logger.exception("Could not decode the received data, error in json format.")
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
    except (ConnectionError, OSError, IOError) as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0

    if data is None:
        logger.warning("Didn't receive any data")
        return 0

    if "error" in data:
        logger.error("Error in Airflow Export Plugin: \n%s", data["error"])
        _save_error_message(
            airflow_instance_detail, data["error"], api_client, airflow_config
        )
        return 0

    try:
        _log_received_tasks(airflow_instance_detail.url, data)
        _send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        active_dags = {
            dag["dag_id"]: [task["task_id"] for task in dag["tasks"]]
            for dag in export_data.dags
            if dag.get("is_active", True)
        }
        logger.info("Got %d active DAGs", len(active_dags))

        airflow_instance_detail.update_airflow_server(
            airflow_version=export_data.airflow_version,
            dags_path=export_data.dags_path,
            logs_path=export_data.logs_path,
            airflow_export_version=export_data.airflow_export_version,
            synced_from=airflow_instance_detail.airflow_server_info.synced_from
            or airflow_instance_detail.since,
            active_dags=active_dags,
        )

        task_instances_end_dates = [
            pendulum.parse(str(t["end_date"]))
            for t in export_data.task_instances
            if t["end_date"] is not None
        ]

        dag_runs_end_dates = [
            pendulum.parse(str(dr["end_date"]))
            for dr in export_data.dag_runs
            if dr["end_date"] is not None
        ]
        logger.info(
            "Got %d task end dates, the last is %s and got %d dag run end dates, the last is %s",
            len(task_instances_end_dates),
            max(task_instances_end_dates) if task_instances_end_dates else None,
            len(dag_runs_end_dates),
            max(dag_runs_end_dates) if dag_runs_end_dates else None,
        )

        end_dates = (
            task_instances_end_dates if task_instances_end_dates else dag_runs_end_dates
        )

        airflow_instance_detail.airflow_server_info.synced_to = (
            max(end_dates) if end_dates else utcnow()
        )
        logger.info(
            "Using last end date %s, New synced_to date is %s",
            max(end_dates) if end_dates else None,
            airflow_instance_detail.airflow_server_info.synced_to,
        )

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logging.info(
            "Sending airflow server info: url={}, synced_from={}, synced_to={}, last_sync_time={}".format(
                airflow_instance_detail.airflow_server_info.base_url,
                airflow_instance_detail.airflow_server_info.synced_from,
                airflow_instance_detail.airflow_server_info.synced_to,
                airflow_instance_detail.airflow_server_info.last_sync_time,
            )
        )
        save_airflow_server_info(
            airflow_instance_detail.airflow_server_info, api_client
        )

        # If synced_to was set to utcnow(), keep since as it was
        if end_dates:
            logger.info(
                "Updating since, old value: {}, new value: {}".format(
                    airflow_instance_detail.since,
                    airflow_instance_detail.airflow_server_info.synced_to,
                )
            )
            airflow_instance_detail.since = (
                airflow_instance_detail.airflow_server_info.synced_to
            )
        else:
            logger.info(
                "Keeping since as it was {}".format(airflow_instance_detail.since)
            )

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server".format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            )
        )

        total_fetched = max(len(task_instances_end_dates), len(dag_runs_end_dates))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        _dump_unsent_data(data)
        _send_exception_info(airflow_instance_detail, api_client, airflow_config)
        return 0
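
The cursor-advancement logic above can be summarized in a small, illustrative helper (names are hypothetical): prefer task-instance end dates, fall back to dag-run end dates, and if nothing finished in the batch keep `since` unchanged while stamping synced_to with the current time.

import pendulum

def advance_cursor(task_end_dates, dag_run_end_dates, since):
    end_dates = task_end_dates if task_end_dates else dag_run_end_dates
    if end_dates:
        synced_to = max(end_dates)
        new_since = synced_to            # next iteration continues from here
    else:
        synced_to = pendulum.now("UTC")  # nothing finished; mark only the sync time
        new_since = since                # keep since as it was
    return synced_to, new_since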
Example #11
def do_fetching_iteration(airflow_config, airflow_instance_detail, api_client,
                          tracking_store):
    """
    Fetch from Airflow webserver, return number of items fetched
    """
    data = try_fetching_from_airflow(
        airflow_instance_detail,
        airflow_config,
        airflow_instance_detail.since,
        None,
        api_client,
        False,
    )

    if not validate_airflow_monitor_data(data, airflow_instance_detail,
                                         airflow_config, api_client):
        return 0

    try:
        log_received_tasks(airflow_instance_detail.url, data)
        send_metrics(airflow_instance_detail, data)

        export_data = _as_dotted_dict(**data)

        active_dags = {
            dag["dag_id"]: [task["task_id"] for task in dag["tasks"]]
            for dag in export_data.dags if dag.get("is_active", True)
        }
        logger.info("Got %d active DAGs", len(active_dags))

        airflow_instance_detail.update_airflow_server(
            airflow_version=export_data.airflow_version,
            dags_path=export_data.dags_path,
            logs_path=export_data.logs_path,
            airflow_export_version=export_data.airflow_export_version,
            synced_from=airflow_instance_detail.airflow_server_info.synced_from
            or airflow_instance_detail.since,
            active_dags=active_dags,
        )

        task_instances_end_dates = [
            pendulum.parse(str(t["end_date"]))
            for t in export_data.task_instances if t["end_date"] is not None
        ]

        dag_runs_end_dates = [
            pendulum.parse(str(dr["end_date"])) for dr in export_data.dag_runs
            if dr["end_date"] is not None
        ]
        logger.info(
            "Got %d task end dates, the last is %s and got %d dag run end dates, the last is %s",
            len(task_instances_end_dates),
            max(task_instances_end_dates)
            if task_instances_end_dates else None,
            len(dag_runs_end_dates),
            max(dag_runs_end_dates) if dag_runs_end_dates else None,
        )

        end_dates = (task_instances_end_dates
                     if task_instances_end_dates else dag_runs_end_dates)

        airflow_instance_detail.airflow_server_info.synced_to = (
            max(end_dates) if end_dates else utcnow())
        logger.info(
            "Using last end date %s, New synced_to date is %s",
            max(end_dates) if end_dates else None,
            airflow_instance_detail.airflow_server_info.synced_to,
        )

        save_airflow_monitor_data(
            data,
            tracking_store,
            airflow_instance_detail.url,
            airflow_instance_detail.airflow_server_info.last_sync_time,
        )

        logging.info(
            "Sending airflow server info: url={}, synced_from={}, synced_to={}, last_sync_time={}"
            .format(
                airflow_instance_detail.airflow_server_info.base_url,
                airflow_instance_detail.airflow_server_info.synced_from,
                airflow_instance_detail.airflow_server_info.synced_to,
                airflow_instance_detail.airflow_server_info.last_sync_time,
            ))
        save_airflow_server_info(airflow_instance_detail.airflow_server_info,
                                 api_client)

        # If synced_to was set to utcnow(), keep since as it was
        if end_dates:
            logger.info("Updating since, old value: {}, new value: {}".format(
                airflow_instance_detail.since,
                airflow_instance_detail.airflow_server_info.synced_to,
            ))
            airflow_instance_detail.since = (
                airflow_instance_detail.airflow_server_info.synced_to)
        else:
            logger.info("Keeping since as it was {}".format(
                airflow_instance_detail.since))

        logger.info(
            "Total {} task instances, {} dag runs, {} dags saved to databand web server"
            .format(
                len(export_data.task_instances),
                len(export_data.dag_runs),
                len(export_data.dags),
            ))

        total_fetched = max(len(task_instances_end_dates),
                            len(dag_runs_end_dates))
        return total_fetched
    except Exception as e:
        logger.exception(
            "An error occurred while trying to sync data from airflow to databand: %s",
            e,
        )
        dump_unsent_data(data)
        send_exception_info(airflow_instance_detail, api_client,
                            airflow_config)
        return 0
Example #12
def parse_datetime(value, default=None):
    return pendulum.parse(value) if value else default
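
A quick usage sketch (assuming pendulum 2.x for the fallback value): a non-empty string yields an aware datetime, while None or an empty string returns the provided default.

import pendulum

fallback = pendulum.datetime(1970, 1, 1, tz="UTC")

print(parse_datetime("2021-03-01T12:00:00+00:00"))   # 2021-03-01T12:00:00+00:00
print(parse_datetime(None, default=fallback))        # 1970-01-01T00:00:00+00:00
print(parse_datetime("", default=fallback))          # 1970-01-01T00:00:00+00:00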