def log_xcom(context, track_config):
    task_instance = context["task_instance"]
    xcoms = get_xcoms(task_instance)
    if xcoms:
        # get only the first xcoms
        xcoms_head = islice(xcoms, track_config.max_xcom_length)
        log_metrics(dict(xcoms_head))
def log_xcom(context, track_config):
    task_instance = context["task_instance"]
    xcoms = get_xcoms(task_instance)
    if xcoms:
        # get only the first xcoms
        xcoms_head = islice(xcoms, track_config.max_xcom_length)
        # truncate xcom values that are too long
        shortened_xcoms = {
            key: value[: track_config.max_xcom_size]
            for key, value in xcoms_head
        }
        log_metrics(shortened_xcoms)
def onStageCompleted(self, stageCompleted):
    # called when a Spark stage finishes
    stage_info = stageCompleted.stageInfo()
    # a stage name looks like "<transformation> at <callsite>"; keep only the transformation
    transformation_name = stage_info.name()[0 : stage_info.name().index(" ")]
    metric_prefix = "stage-{}.{}".format(stage_info.stageId(), transformation_name)
    # walk the stage's task-metric accumulators and collect them into a dict
    it = stage_info.taskMetrics().accumulators().iterator()
    metrics = {}
    while it.hasNext():
        next_metric = it.next()
        key = "{}.{}".format(metric_prefix, next_metric.name().get())
        value = next_metric.value()
        metrics[key] = value
    log_metrics(metrics, "spark")
def save(self, result_key, analyzer_context):
    if not self.java_libs_loaded:
        logger.warn(
            "Databand Java SDK was not loaded. Deequ metrics won't be logged"
        )
        return
    # resolve the dataframe name from the result key tags, defaulting to "data"
    df_name = (
        self.jvm.scala.collection.JavaConverters.mapAsJavaMapConverter(
            result_key.tags()
        )
        .asJava()
        .getOrDefault("name", "data")
    )
    # convert Deequ's analyzer results into a plain metrics dict via the Java SDK
    converter = self.jvm.ai.databand.deequ.DeequToDbnd(df_name, analyzer_context)
    metrics = dict(converter.metrics())
    log_metrics(metrics)
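For context, here is a minimal sketch of how a metrics repository like the one above is typically wired into a PyDeequ analysis run. The repository class name `DbndMetricsRepository`, its constructor, and the dataframe/column names are assumptions for illustration only; `spark` and `df` are presumed to already exist.

# Usage sketch (assumptions: the repository wrapper is exposed as DbndMetricsRepository,
# and a SparkSession `spark` plus a DataFrame `df` are already available).
from pydeequ.analyzers import AnalysisRunner, Completeness, Size
from pydeequ.repository import ResultKey

repo = DbndMetricsRepository(spark)  # hypothetical wrapper around the save() shown above
result_key = ResultKey(spark, ResultKey.current_milli_time(), {"name": "orders"})

(
    AnalysisRunner(spark)
    .onData(df)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("order_id"))
    .useRepository(repo)             # Deequ will call repo.save(result_key, analyzer_context)
    .saveOrAppendResult(result_key)
    .run()
)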
def log_snowflake_resource_usage(
    database: str,
    connection_string: str,
    query_ids: List[str],
    session_id: Optional[int] = None,
    key: str = "snowflake_query",
    history_window: float = 15,
    query_history_result_limit: Optional[int] = None,
    retries: int = 3,
    retry_pause: float = 0,
    raise_on_error: bool = False,
) -> None:
    """
    Search for a query previously executed by Snowflake in its QUERY_HISTORY and log
    CPU time, run time, disk reads, and other resources.

    A query's metadata can appear in QUERY_HISTORY with a lag of up to 45 minutes.

    :param database: Name of the database the query was issued to.
    :param connection_string: Snowflake connection string to use.
    :param query_ids: List of `query_id` values generated by Snowflake to search for in QUERY_HISTORY.
    :param session_id: `session_id` generated by Snowflake, for a more efficient search in QUERY_HISTORY.
    :param key: Override this if you call the function more than once within the same task/Airflow operator.
    :param history_window: How far back to search in QUERY_HISTORY, in minutes.
    :param query_history_result_limit: Passed through directly to the QUERY_HISTORY search function as the `RESULT_LIMIT` param.
    :param retries: How many times to search QUERY_HISTORY. Each retry widens the search by increasing the `RESULT_LIMIT` param.
    :param retry_pause: Number of seconds to pause before the next retry.
    :param raise_on_error: By default all exceptions are muted so your task's success status is not affected by errors in tracking. Set to True to re-raise all exceptions.
    """
    snowflake_config = SnowflakeConfig()

    if query_history_result_limit is None:
        query_history_result_limit = snowflake_config.query_history_result_limit

    if not all(query_id for query_id in query_ids):
        error_msg = f"query_ids cannot be empty. You supplied: {query_ids}"
        if raise_on_error:
            raise SnowflakeError(error_msg)
        else:
            logger.error(error_msg)
            return

    metrics_to_log = {}
    # XXX: Do we actually need log_duration?
    with log_duration("log_snowflake_resource_usage__time_seconds", "system"):
        for i, query_id in enumerate(query_ids):
            query_key = f"{key}.{i}" if len(query_ids) > 1 else key
            metrics_to_log.update(
                _get_snowflake_resource_usage(
                    database,
                    connection_string,
                    query_id,
                    session_id,
                    query_key,
                    history_window,
                    query_history_result_limit,
                    retries,
                    retry_pause,
                    raise_on_error,
                    snowflake_config,
                )
            )
    log_metrics(metrics_to_log, source="user")
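A hedged usage sketch of the function above: run a query with the Snowflake Python connector, grab its query id, and pass it to `log_snowflake_resource_usage`. The import path, connection string format, and the account/credential placeholders are assumptions, not taken from the snippet itself.

# Usage sketch (assumed import path and connection string format).
import snowflake.connector
from dbnd_snowflake import log_snowflake_resource_usage  # assumed import path

conn = snowflake.connector.connect(user="<user>", password="<password>", account="<account>")
cursor = conn.cursor()
cursor.execute("select count(*) from customers")
query_id = cursor.sfqid  # query id Snowflake assigned to the executed statement

log_snowflake_resource_usage(
    database="MY_DB",
    connection_string="snowflake://<user>:<password>@<account>/MY_DB",  # assumed format
    query_ids=[query_id],
    session_id=conn.session_id,
    key="customers_count_query",
)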
def _log_snowflake_resource_usage(
    query_text,
    database,
    user,
    connection_string,
    session_id=None,
):
    # Quick and dirty way to handle optional clause element.
    # Might be better to use SQLAlchemy expression language here
    if session_id:
        query_history = dedent(
            """\
            select *
            from table({}.information_schema.query_history(dateadd('minutes',-15,current_timestamp()),current_timestamp()))
            where LOWER(query_text)=LOWER(%s) and LOWER(user_name)=LOWER(%s) and session_id=%s
            order by start_time desc limit 1;"""
        ).format(database, session_id)
        query_params = (query_text, user, session_id)
    else:
        query_history = dedent(
            """\
            select *
            from table({}.information_schema.query_history(dateadd('minutes',-15,current_timestamp()),current_timestamp()))
            where LOWER(query_text)=LOWER(%s) and LOWER(user_name)=LOWER(%s)
            order by start_time desc limit 1;"""
        ).format(database)
        query_params = (query_text, user)

    result = _connect_and_query(connection_string, query_history, *query_params)
    if not result:
        logger.info(
            "resource metrics were not found for query '%s', query_params=%s",
            query_text,
            query_params,
        )
        log_metrics(
            {
                "snowflake_query_warning": "No resources info found",
                "snowflake_query_text": query_text,
            },
            source="system",
        )
        return

    metrics = result[0]
    key = "snowflake_query_{}".format(
        metrics["QUERY_TAG"] if metrics["QUERY_TAG"] else metrics["QUERY_ID"]
    )

    snowflake_metric_to_ui_name = {
        "BYTES_SCANNED": "bytes_scanned",
        "COMPILATION_TIME": "compilation_time_milliseconds",
        "CREDITS_USED_CLOUD_SERVICES": "credits_used_cloud_services",
        "EXECUTION_TIME": "execution_time_milliseconds",
        "QUERY_TEXT": "query_text",
        "ROWS_PRODUCED": "rows_produced",
        "TOTAL_ELAPSED_TIME": "total_elapsed_time_milliseconds",
    }
    metrics_to_log = {}
    for metric, ui_name in snowflake_metric_to_ui_name.items():
        if metric in metrics:
            value = metrics[metric]
            # Quick hack to track decimal values. Probably should be handled on a serialization level
            if isinstance(value, Decimal):
                value = float(value)
            metrics_to_log[key + "." + ui_name] = value
    log_metrics(metrics_to_log, source="system")
def task_with_log_metrics():
    # all lowercase alphabet chars -> {"a": 97, ..., "z": 122}
    log_metrics({chr(i): i for i in range(97, 123)})