Beispiel #1
0
    def execute(self, context):
        """Run ``select_query`` under the Snowflake query tracker and publish its ids.

        The connection id, database and query are taken from the enclosing
        scope (``snowflake_conn_id``, ``database``, ``select_query``), not
        from ``self``.

        Args:
            context: Task context; its ``"ti"`` entry is used to push the
                ``session_id`` and ``query_id`` XCom values.
        """
        snowflake_hook = SnowflakeHook(snowflake_conn_id=snowflake_conn_id)
        with snowflake_query_tracker(database=database) as tracker:
            snowflake_hook.run(select_query)
            # Ask the tracker for the single most recent (session, query) pair.
            last_ids = tracker.get_last_session_with_query_id(many=False)
            session_id, query_id = last_ids

        task_instance = context["ti"]
        task_instance.xcom_push(key="session_id", value=session_id)
        task_instance.xcom_push(key="query_id", value=query_id)
Beispiel #2
0
def snowflake_db_monitor(**op_kwarg):
    """Log shape metrics (column and row counts) for the tables in ``SCHEMA``.

    Column metadata is fetched with ``GET_COLUMNS`` inside a Snowflake query
    tracker and filtered down to rows whose ``schema_name`` equals ``SCHEMA``.
    Row counts are fetched with ``GET_DB_ROW_INFO``. The function then logs:

    * max / min / mean / median of the table row counts (skipped when the
      row-count query returns nothing),
    * a ``"<table> shape"`` metric per table as ``(column_count, row_count)``,
    * max / min / mean / median of the table column counts.

    Args:
        **op_kwarg: Accepted for caller compatibility; not read here.
    """
    snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn")

    with snowflake_query_tracker(database=DATABASE, schema=SCHEMA):
        snowflake_tables = snowflake_hook.get_pandas_df(GET_COLUMNS)
        snowflake_tables = snowflake_tables[
            snowflake_tables["schema_name"] == "{}".format(SCHEMA)]

    # Build table_name and column_count from the same groupby result so each
    # count stays attached to its own table. Assigning ``unique()`` output
    # positionally next to a groupby result mis-pairs them whenever the query
    # rows are not already sorted by table name.
    snowflake_shapes = (
        snowflake_tables.groupby("table_name")["column_name"]
        .nunique()
        .rename("column_count")
        .reset_index()
    )

    snowflake_rows = snowflake_hook.get_records(GET_DB_ROW_INFO)
    table_row_info = {
        table_name: row_count for table_name, row_count in snowflake_rows
    }

    row_counts = list(table_row_info.values())
    # max()/min()/mean()/median() all raise on an empty sequence.
    if row_counts:
        log_metric("Max table row count", max(row_counts))
        log_metric("Min table row count", min(row_counts))
        log_metric("Mean table row count", round(mean(row_counts), 2))
        log_metric("Median table row count", median(row_counts))

    # Tables without an entry in the row-count result are reported as 0 rows.
    snowflake_shapes["row_count"] = (
        snowflake_shapes["table_name"]
        .map(table_row_info)
        .fillna(0)
        .astype(int)
    )

    for _, row in snowflake_shapes.iterrows():
        log_metric(
            "{} shape".format(row["table_name"]),
            (row["column_count"], row["row_count"]),
        )

    column_counts = snowflake_shapes["column_count"]
    log_metric("Max table column count", column_counts.max())
    # Previously logged .max() under the "Min" label.
    log_metric("Min table column count", column_counts.min())
    log_metric("Mean table column count", round(column_counts.mean(), 2))
    log_metric("Median table column count", column_counts.median())
def snowflake_table_monitor(**context):
    """Log record count, column metadata and optional sample stats for a table.

    The table is identified by ``context["target_table"]``, a dotted
    ``database.schema.table`` path. The function:

    1. Logs the record count and column metadata, optionally draws a random
       sample (when ``ENABLE_SNOWFLAKE_TABLE_SAMPLE`` is truthy) and logs the
       table via ``log_snowflake_table`` -- all inside a query tracker.
    2. Compares the record count and column names with the values stored in
       the ``<path>_record_cnt`` / ``<path>_column_names`` Variables by the
       previous run, logs ``table_delta`` and ``columns_changed``, then
       overwrites both Variables with the current state.
    3. Logs per-column statistics for the sample, if one was taken.
    4. Pushes ``<path>_table_delta`` and ``<path>_record_count`` to XCom.

    Column names are stored as JSON so they round-trip as a list. A value
    written in the older ``str(list)`` form fails to parse; it is treated as
    "no previous state" for that run and then replaced.

    Args:
        **context: Must contain ``"target_table"`` and ``"ti"``.

    Raises:
        KeyError: If ``"target_table"`` or ``"ti"`` is missing from context.
        ValueError: If the target path does not have exactly three
            dot-separated parts.
    """
    full_table_path = context["target_table"]
    database, schema, table = full_table_path.split(".")
    snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn")
    with snowflake_query_tracker(database=database, schema=schema):
        record_count = get_record_count(snowflake_hook, database, table)
        log_metric("records", record_count)

        col_metadata = get_column_info(snowflake_hook, database, schema, table)
        log_metric("column metadata", col_metadata)

        if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
            data = get_random_sample(snowflake_hook, database, schema, table,
                                     SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)

        log_snowflake_table(table,
                            connection_string=snowflake_hook.get_uri(),
                            database=database,
                            schema=schema,
                            with_preview=True,
                            with_schema=True)

    record_cnt_key = "{}_record_cnt".format(full_table_path)
    column_names_key = "{}_column_names".format(full_table_path)
    # Plain list so it can be JSON-serialized and compared as a set.
    column_names = list(col_metadata['column_names'])

    # get difference between last known state of table and the current state
    table_delta = 0
    column_diff = []

    # The two lookups are independent: a missing or unreadable record count
    # must not prevent the column comparison, and vice versa.
    try:
        # Variable.get returns the stored value as a string, so it has to be
        # converted before arithmetic; a missing key raises KeyError.
        previous_record_count = int(Variable.get(record_cnt_key))
        table_delta = previous_record_count - record_count
    except (KeyError, TypeError, ValueError):
        # No usable previous state (first run or malformed value).
        pass

    try:
        previous_col_names = Variable.get(column_names_key,
                                          deserialize_json=True)
        # NOTE(review): only columns that disappeared since the last run are
        # reported; newly added columns do not set columns_changed -- confirm
        # this is intended.
        column_diff = list(set(previous_col_names) - set(column_names))
    except (KeyError, TypeError, ValueError):
        # Missing key, legacy non-JSON value, or a non-iterable payload.
        pass

    col_changed = bool(column_diff)
    log_metric("table_delta", table_delta)
    log_metric("columns_changed", col_changed)
    Variable.set(record_cnt_key, record_count)
    Variable.set(column_names_key, column_names, serialize_json=True)

    # log metrics of the sampled data (if sampled)
    if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
        log_metric("sample_size(%)", SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)
        for column in data.columns:
            series = data[column]
            log_metric("{} null record count".format(column),
                       int(series.isna().sum()))

            # Distribution statistics only make sense for numeric columns.
            if issubdtype(series.dtype, number):
                log_metric("{} mean".format(column), round(series.mean(), 2))
                log_metric("{} median".format(column), series.median())
                log_metric("{} min".format(column), series.min())
                log_metric("{} max".format(column), series.max())
                log_metric("{} std".format(column), round(series.std(), 2))

    context['ti'].xcom_push(key="{}_table_delta".format(full_table_path),
                            value=table_delta)
    context['ti'].xcom_push(key="{}_record_count".format(full_table_path),
                            value=record_count)
Beispiel #4
0
 def execute(self, context):
     """Run ``select_query`` against Snowflake inside a query tracker.

     ``snowflake_conn_id``, ``database`` and ``select_query`` come from the
     enclosing scope; ``context`` is accepted but not read.
     """
     snowflake_hook = SnowflakeHook(snowflake_conn_id=snowflake_conn_id)
     with snowflake_query_tracker(database=database):
         snowflake_hook.run(sql=select_query)
def update_customers_with_monitoring(**kwargs):
    """Execute ``update_query`` on Snowflake inside a query tracker.

    The tracker is opened with ``log_tables=False`` for ``database``.
    ``**kwargs`` is accepted for caller compatibility and is not read.
    """
    hook = SnowflakeHook(snowflake_conn_id=SNOWFLAKE_CONNECTION_ID)
    tracker = snowflake_query_tracker(log_tables=False, database=database)
    with tracker:
        hook.run(update_query)
def process_customers_with_monitoring(**kwargs):
    """Fetch customers with ``select_query`` under a tracker, then process them.

    Only the fetch runs inside the query tracker (opened with
    ``log_tables=False`` for ``database``); ``process_records`` is called
    after the tracker has exited. ``**kwargs`` is accepted but not read.
    """
    hook = SnowflakeHook(snowflake_conn_id=SNOWFLAKE_CONNECTION_ID)
    tracker = snowflake_query_tracker(log_tables=False, database=database)
    with tracker:
        customer_rows = hook.get_records(select_query)
    # Record processing is deliberately kept outside the tracked section.
    process_records(customer_rows)