Beispiel #1
0
def log_xcom(context, track_config):
    """Log the leading XCom entries of the current task instance as metrics.

    :param context: Mapping that holds the task instance under ``"task_instance"``.
    :param track_config: Settings object; ``max_xcom_length`` caps how many
        XCom entries are logged.
    """
    xcoms = get_xcoms(context["task_instance"])
    if not xcoms:
        # Nothing to report for this task instance.
        return

    # Keep only the first ``max_xcom_length`` entries.
    limited = islice(xcoms, track_config.max_xcom_length)
    log_metrics(dict(limited))
Beispiel #2
0
def log_xcom(context, track_config):
    """Log the leading XCom entries of the task instance, with values truncated.

    :param context: Mapping that holds the task instance under ``"task_instance"``.
    :param track_config: Settings object; ``max_xcom_length`` caps how many
        entries are logged and ``max_xcom_size`` caps the length of each value.
    """
    xcoms = get_xcoms(context["task_instance"])
    if not xcoms:
        # Nothing to report for this task instance.
        return

    trimmed = {}
    # Only the first ``max_xcom_length`` (key, value) pairs are considered.
    for name, payload in islice(xcoms, track_config.max_xcom_length):
        # Slice each value down to at most ``max_xcom_size`` items/characters.
        trimmed[name] = payload[: track_config.max_xcom_size]
    log_metrics(trimmed)
Beispiel #3
0
 def onStageCompleted(self, stageCompleted):
     """Collect the accumulator values of a finished stage and log them as metrics.

     Metric keys have the form ``stage-<stageId>.<transformation>.<accumulator>``,
     where ``<transformation>`` is the first space-delimited word of the stage
     name. All metrics are logged with the ``"spark"`` source.

     :param stageCompleted: Event object exposing ``stageInfo()``.
     """
     stage_info = stageCompleted.stageInfo()
     # partition() keeps the whole name when it contains no space, whereas
     # slicing up to str.index(" ") raises ValueError in that case.
     transformation_name = stage_info.name().partition(" ")[0]
     metric_prefix = "stage-{}.{}".format(stage_info.stageId(), transformation_name)

     # The accumulators are exposed through a hasNext()/next() style iterator,
     # so they are walked manually instead of with a ``for`` loop.
     accumulators = stage_info.taskMetrics().accumulators().iterator()
     metrics = {}
     while accumulators.hasNext():
         accumulator = accumulators.next()
         key = "{}.{}".format(metric_prefix, accumulator.name().get())
         metrics[key] = accumulator.value()
     log_metrics(metrics, "spark")
 def save(self, result_key, analyzer_context):
     """Convert analyzer results into metrics and log them.

     When ``self.java_libs_loaded`` is falsy, only a warning is emitted and
     nothing is logged.

     :param result_key: Object whose ``tags()`` may carry a ``"name"`` entry;
         ``"data"`` is used as the name when the entry is absent.
     :param analyzer_context: Handed to ``DeequToDbnd`` together with the name.
     """
     if not self.java_libs_loaded:
         # Logger.warn is a deprecated alias; warning() is the supported method.
         logger.warning(
             "Databand Java SDK was not loaded. Deequ metrics won't be logged"
         )
         return

     # Convert the tags to a Java map so getOrDefault() can supply the fallback.
     tags = self.jvm.scala.collection.JavaConverters.mapAsJavaMapConverter(
         result_key.tags()
     ).asJava()
     df_name = tags.getOrDefault("name", "data")
     converter = self.jvm.ai.databand.deequ.DeequToDbnd(df_name, analyzer_context)
     metrics = dict(converter.metrics())
     log_metrics(metrics)
Beispiel #5
0
def log_snowflake_resource_usage(
    database: str,
    connection_string: str,
    query_ids: List[str],
    session_id: Optional[int] = None,
    key: str = "snowflake_query",
    history_window: float = 15,
    query_history_result_limit: Optional[int] = None,
    retries: int = 3,
    retry_pause: float = 0,
    raise_on_error: bool = False,
) -> None:
    """
    Search for queries previously executed by Snowflake in its QUERY_HISTORY and log cpu time,
    run time, disk read, and other resources.

    Query's metadata can appear in QUERY_HISTORY with a lag up to 45 minutes.

    :param database: Name of the database query was issued to.
    :param connection_string: Snowflake connection string to use.
    :param query_ids: Supply a list of `query_id` generated by Snowflake for search in QUERY_HISTORY.
        Must be non-empty and must not contain empty ids.
    :param session_id: Supply `session_id` generated by Snowflake for more efficient search in QUERY_HISTORY.
    :param key: Override it if you call this function twice or more within the same task/Airflow Operator.
        When more than one query id is supplied, each query is logged under `<key>.<index>`.
    :param history_window: How deep to search into QUERY_HISTORY. Set in minutes
    :param query_history_result_limit: Passed through directly to QUERY_HISTORY search function as `RESULT_LIMIT` param.
        Defaults to the value from `SnowflakeConfig` when omitted.
    :param retries: How many times to search in QUERY_HISTORY.
        Each time search is widened by increasing `RESULT_LIMIT` param.
    :param retry_pause: Set number of seconds to pause before next retry.
    :param raise_on_error: By default all exceptions are muted so your task success status
        is not affected by errors in tracking. Set to true to re-raise all exceptions.
    :raises SnowflakeError: If `query_ids` is invalid and `raise_on_error` is true.
    """

    snowflake_config = SnowflakeConfig()
    if query_history_result_limit is None:
        query_history_result_limit = snowflake_config.query_history_result_limit

    # all() is vacuously true for an empty list, so emptiness has to be checked
    # explicitly; otherwise an empty `query_ids` would slip past validation.
    if not query_ids or not all(query_ids):
        error_msg = f"query_ids cannot be empty. You supplied: {query_ids}"
        if raise_on_error:
            raise SnowflakeError(error_msg)
        logger.error(error_msg)
        return

    # Suffix the key with the query position only when several queries are logged.
    has_multiple_queries = len(query_ids) > 1
    metrics_to_log = {}
    # XXX: Do we actually need log_duration?
    with log_duration("log_snowflake_resource_usage__time_seconds", "system"):
        for i, query_id in enumerate(query_ids):
            query_key = f"{key}.{i}" if has_multiple_queries else key
            metrics_to_log.update(
                _get_snowflake_resource_usage(
                    database,
                    connection_string,
                    query_id,
                    session_id,
                    query_key,
                    history_window,
                    query_history_result_limit,
                    retries,
                    retry_pause,
                    raise_on_error,
                    snowflake_config,
                )
            )

    log_metrics(metrics_to_log, source="user")
def _log_snowflake_resource_usage(
    query_text, database, user, connection_string, session_id=None,
):
    """Look up the latest matching QUERY_HISTORY row and log its resource metrics.

    The search covers the last 15 minutes and matches case-insensitively on the
    query text and the user name, optionally narrowed to ``session_id``. When no
    row is returned, a warning metric is logged instead and the function returns.

    :param query_text: Text of the query to look for.
    :param database: Database whose ``information_schema.query_history`` is queried.
    :param user: User name the query was issued by.
    :param connection_string: Connection string passed to ``_connect_and_query``.
    :param session_id: Optional session filter; ignored when falsy.
    """
    # The optional session filter extends both the WHERE clause and the bound
    # parameters. Might be better to use SQLAlchemy expression language here.
    where_clause = "LOWER(query_text)=LOWER(%s) and LOWER(user_name)=LOWER(%s)"
    query_params = (query_text, user)
    if session_id:
        where_clause += " and session_id=%s"
        query_params += (session_id,)

    query_history = dedent(
        """\
        select *
        from table({}.information_schema.query_history(dateadd('minutes',-15,current_timestamp()),current_timestamp()))
        where {}
        order by start_time desc limit 1;"""
    ).format(database, where_clause)

    rows = _connect_and_query(connection_string, query_history, *query_params)
    if not rows:
        logger.info(
            "resource metrics were not found for query '%s', query_params=%s",
            query_text,
            query_params,
        )
        not_found_metrics = {
            "snowflake_query_warning": "No resources info found",
            "snowflake_query_text": query_text,
        }
        log_metrics(not_found_metrics, source="system")
        return

    row = rows[0]
    # Prefer the query tag for the metric key; fall back to the query id.
    key = "snowflake_query_{}".format(row["QUERY_TAG"] or row["QUERY_ID"])

    # QUERY_HISTORY column -> metric name suffix.
    column_to_ui_name = {
        "BYTES_SCANNED": "bytes_scanned",
        "COMPILATION_TIME": "compilation_time_milliseconds",
        "CREDITS_USED_CLOUD_SERVICES": "credits_used_cloud_services",
        "EXECUTION_TIME": "execution_time_milliseconds",
        "QUERY_TEXT": "query_text",
        "ROWS_PRODUCED": "rows_produced",
        "TOTAL_ELAPSED_TIME": "total_elapsed_time_milliseconds",
    }

    metrics_to_log = {}
    for column, ui_name in column_to_ui_name.items():
        if column not in row:
            continue
        value = row[column]
        # Quick hack to track decimal values; probably should be handled on a
        # serialization level.
        if isinstance(value, Decimal):
            value = float(value)
        metrics_to_log[key + "." + ui_name] = value
    log_metrics(metrics_to_log, source="system")
 def task_with_log_metrics():
     """Log one metric per lowercase ASCII letter, valued with its code point."""
     # "a" is 97 and "z" is 122 -> {"a": 97, ..., "z": 122}
     code_points = range(ord("a"), ord("z") + 1)
     log_metrics(dict(zip(map(chr, code_points), code_points)))