def execute(self, context):
    """Run ``select_query`` under the Snowflake query tracker and publish its ids.

    The session id and query id that the tracker reports for the last
    query are pushed to XCom under the keys ``session_id`` and ``query_id``.

    Args:
        context: task context; ``context["ti"]`` is used for the XCom pushes.
    """
    snowflake = SnowflakeHook(snowflake_conn_id=snowflake_conn_id)

    with snowflake_query_tracker(database=database) as tracker:
        snowflake.run(select_query)
        # many=False: a single (session_id, query_id) pair is unpacked here.
        session_id, query_id = tracker.get_last_session_with_query_id(many=False)

    task_instance = context["ti"]
    task_instance.xcom_push(key="session_id", value=session_id)
    task_instance.xcom_push(key="query_id", value=query_id)
def snowflake_db_monitor(**op_kwarg):
    """Log row-count and column-count statistics for the tables of ``SCHEMA``.

    Runs ``GET_COLUMNS`` and ``GET_DB_ROW_INFO`` through a Snowflake hook
    while ``snowflake_query_tracker`` is active, then reports through
    ``log_metric``:

    * max / min / mean / median row count over the rows returned by
      ``GET_DB_ROW_INFO`` (skipped when that query returns nothing);
    * one ``"<table> shape"`` metric per table, holding the tuple
      ``(column_count, row_count)``;
    * max / min / mean / median column count.

    Args:
        **op_kwarg: accepted so the function can be called with arbitrary
            keyword arguments; none of them are read.
    """
    snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn")

    with snowflake_query_tracker(database=DATABASE, schema=SCHEMA):
        snowflake_tables = snowflake_hook.get_pandas_df(GET_COLUMNS)
        snowflake_tables = snowflake_tables[
            snowflake_tables["schema_name"] == "{}".format(SCHEMA)
        ]

        # Distinct column names per table. The result is indexed by table
        # name, so the name column is taken from that same index; filling
        # it from ``unique()`` (appearance order) could pair a count with
        # the wrong table because groupby sorts its keys.
        column_counts = snowflake_tables.groupby("table_name")["column_name"].nunique()
        snowflake_shapes = DataFrame({"column_count": column_counts})
        snowflake_shapes["table_name"] = snowflake_shapes.index

        snowflake_rows = snowflake_hook.get_records(GET_DB_ROW_INFO)

    table_row_info = {
        tablename: row_count for tablename, row_count in snowflake_rows
    }

    row_counts = list(table_row_info.values())
    # max()/min()/mean() raise on an empty sequence; an empty result should
    # not fail the whole monitoring task.
    if row_counts:
        log_metric("Max table row count", max(row_counts))
        log_metric("Min table row count", min(row_counts))
        log_metric("Mean table row count", round(mean(row_counts), 2))
        log_metric("Median table row count", median(row_counts))

    # Tables without an entry in the row-count result are reported as 0 rows.
    snowflake_shapes["row_count"] = (
        snowflake_shapes["table_name"].map(table_row_info).fillna(0).astype(int)
    )

    for _, row in snowflake_shapes.iterrows():
        log_metric(
            "{} shape".format(row["table_name"]),
            (row["column_count"], row["row_count"]),
        )

    column_count = snowflake_shapes["column_count"]
    log_metric("Max table column count", column_count.max())
    # Previously logged .max() under the "Min" label.
    log_metric("Min table column count", column_count.min())
    log_metric("Mean table column count", round(column_count.mean(), 2))
    log_metric("Median table column count", column_count.median())
def _diff_against_previous_state(full_table_path, record_count, column_names):
    """Compare the current table state with the one stored in Airflow Variables.

    Reads ``"<full_table_path>_record_cnt"`` and
    ``"<full_table_path>_column_names"``. Stored values are treated as text
    and parsed back before comparing. The comparison is best-effort: a
    missing or unreadable stored value leaves the corresponding result at
    its neutral default instead of raising.

    Args:
        full_table_path: prefix of the Variable keys.
        record_count: current record count.
        column_names: current column names.

    Returns:
        ``(table_delta, column_diff)`` where ``table_delta`` is
        ``previous - current`` record count (0 when unknown) and
        ``column_diff`` lists names present in the stored state but not in
        ``column_names`` (empty when unknown).
    """
    import ast
    import logging

    log = logging.getLogger(__name__)

    def _stored(key):
        # A Variable that was never set is expected on the first run.
        try:
            return Variable.get(key)
        except KeyError:
            return None

    table_delta = 0
    column_diff = []

    previous_count = _stored("{}_record_cnt".format(full_table_path))
    if previous_count is not None:
        try:
            table_delta = int(previous_count) - record_count
        except (TypeError, ValueError):
            log.warning(
                "Cannot compare previous record count %r of %s with %r",
                previous_count, full_table_path, record_count,
            )

    previous_names = _stored("{}_column_names".format(full_table_path))
    if isinstance(previous_names, str):
        # The names are stored as the text form of a Python list; turn them
        # back into a list, otherwise set() would split the text into
        # single characters.
        try:
            previous_names = ast.literal_eval(previous_names)
        except (ValueError, SyntaxError):
            log.warning(
                "Cannot parse previous column names of %s: %r",
                full_table_path, previous_names,
            )
            previous_names = None
    if isinstance(previous_names, (list, tuple, set, frozenset)):
        try:
            column_diff = list(set(previous_names) - set(column_names))
        except TypeError:
            log.warning(
                "Cannot compare column names of %s", full_table_path
            )

    return table_delta, column_diff


def _log_sample_metrics(data):
    """Log per-column metrics of a sampled DataFrame.

    Every column gets a null-record count; columns with a numeric dtype
    additionally get mean, median, min, max and standard deviation.
    """
    for column in data.columns:
        values = data[column]
        log_metric("{} null record count".format(column), int(values.isna().sum()))

        if issubdtype(values.dtype, number):
            log_metric("{} mean".format(column), round(values.mean(), 2))
            log_metric("{} median".format(column), values.median())
            log_metric("{} min".format(column), values.min())
            log_metric("{} max".format(column), values.max())
            log_metric("{} std".format(column), round(values.std(), 2))


def snowflake_table_monitor(**context):
    """Log metrics for one Snowflake table and track changes between runs.

    Steps:

    1. Under ``snowflake_query_tracker``: log the record count and column
       metadata, optionally fetch a random sample, and call
       ``log_snowflake_table``.
    2. Compare record count and column names with the values stored in
       Airflow Variables by the previous run; log ``table_delta`` and
       ``columns_changed``; store the current values.
    3. When sampling is enabled, log per-column metrics of the sample.
    4. Push ``"<path>_table_delta"`` and ``"<path>_record_count"`` to XCom.

    Args:
        **context: must contain ``target_table`` in the form
            ``"database.schema.table"`` and ``ti`` for the XCom pushes.

    Raises:
        ValueError: if ``target_table`` does not split into exactly three
            dot-separated parts.
    """
    full_table_path = context["target_table"]
    database, schema, table = full_table_path.split(".")
    snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn")

    # Only the Snowflake-facing calls are kept inside the tracker context.
    with snowflake_query_tracker(database=database, schema=schema):
        record_count = get_record_count(snowflake_hook, database, table)
        log_metric("records", record_count)

        col_metadata = get_column_info(snowflake_hook, database, schema, table)
        log_metric("column metadata", col_metadata)

        if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
            data = get_random_sample(
                snowflake_hook,
                database,
                schema,
                table,
                SNOWFLAKE_TABLE_SAMPLE_ROW_PROB,
            )

        log_snowflake_table(
            table,
            connection_string=snowflake_hook.get_uri(),
            database=database,
            schema=schema,
            with_preview=True,
            with_schema=True,
        )

    # Difference between the last known state of the table and the current
    # state. Only columns that disappeared since the last run are detected.
    table_delta, column_diff = _diff_against_previous_state(
        full_table_path, record_count, col_metadata["column_names"]
    )
    col_changed = bool(column_diff)

    log_metric("table_delta", table_delta)
    log_metric("columns_changed", col_changed)

    Variable.set("{}_record_cnt".format(full_table_path), record_count)
    Variable.set(
        "{}_column_names".format(full_table_path), col_metadata["column_names"]
    )

    # Metrics of the sampled data (if sampled).
    if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
        log_metric("sample_size(%)", SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)
        _log_sample_metrics(data)

    task_instance = context["ti"]
    task_instance.xcom_push(
        key="{}_table_delta".format(full_table_path), value=table_delta
    )
    task_instance.xcom_push(
        key="{}_record_count".format(full_table_path), value=record_count
    )
def execute(self, context):
    """Run ``select_query`` through a Snowflake hook with query tracking on.

    Args:
        context: task context; not read by this method.
    """
    snowflake = SnowflakeHook(snowflake_conn_id=snowflake_conn_id)
    tracking = snowflake_query_tracker(database=database)
    with tracking:
        snowflake.run(sql=select_query)
def update_customers_with_monitoring(**kwargs):
    """Run ``update_query`` on Snowflake while the query tracker is active.

    The tracker is created with ``log_tables=False``.

    Args:
        **kwargs: accepted but not read.
    """
    hook = SnowflakeHook(snowflake_conn_id=SNOWFLAKE_CONNECTION_ID)
    tracking = snowflake_query_tracker(log_tables=False, database=database)
    with tracking:
        hook.run(update_query)
def process_customers_with_monitoring(**kwargs):
    """Fetch customers with ``select_query`` under tracking, then process them.

    The records are read while ``snowflake_query_tracker`` (created with
    ``log_tables=False``) is active and are then handed to
    ``process_records``.

    Args:
        **kwargs: accepted but not read.
    """
    hook = SnowflakeHook(snowflake_conn_id=SNOWFLAKE_CONNECTION_ID)
    tracking = snowflake_query_tracker(log_tables=False, database=database)
    with tracking:
        fetched_customers = hook.get_records(select_query)

    # Process records - Same code
    process_records(fetched_customers)