def execute(self, context): self.log.info(f'Checking DataQuality for {self._table}') snowflake_hook = SnowflakeHook(self._conn_id) self.log.info(f'Running query: {self._query}') records = snowflake_hook.get_records(self._query) if len(records) < 1 or len(records[0] < 1): raise ValueError( f"Data quality check failed. {self._table} returned no results" ) logging.info(f'Passed Data Quality Check for {self._table}')
def execute(self, context): self.log.info(f'Checking HasRows for {self._table}') snowflake_hook = SnowflakeHook(self._conn_id) self.log.info(f'Running rcord count: {self._table}') records = snowflake_hook.get_records( f"SELECT COUNT(1) FROM {self._table}") if len(records) < 1 or len(records[0] < 1): raise ValueError( f"Data quality check failed. {self._table} returned no results" ) logging.info(f'Passed Has Rows Check for {self._table}')
def snowflake_db_monitor(**op_kwarg): snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn") with snowflake_query_tracker(database=DATABASE, schema=SCHEMA) as st: snowflake_tables = snowflake_hook.get_pandas_df(GET_COLUMNS) snowflake_shapes = DataFrame() snowflake_tables = snowflake_tables[snowflake_tables["schema_name"] == "{}".format(SCHEMA)] snowflake_shapes["column_count"] = snowflake_tables.groupby( "table_name").nunique("column_name")["column_name"] snowflake_shapes["table_name"] = snowflake_tables["table_name"].unique() table_row_info = {} snowflake_rows = snowflake_hook.get_records(GET_DB_ROW_INFO) for tablename, row_count in snowflake_rows: table_row_info[tablename] = row_count row_counts = list(table_row_info.values()) log_metric("Max table row count", max(row_counts)) log_metric("Min table row count", min(row_counts)) log_metric("Mean table row count", round(mean(row_counts), 2)) log_metric("Median table row count", median(row_counts)) snowflake_shapes["row_count"] = (snowflake_shapes["table_name"].map( table_row_info).fillna(0).astype(int)) for _, row in snowflake_shapes.iterrows(): log_metric( "{} shape".format(row["table_name"]), (row["column_count"], row["row_count"]), ) log_metric("Max table column count", snowflake_shapes["column_count"].max()) log_metric("Min table column count", snowflake_shapes["column_count"].max()) log_metric("Mean table column count", round(snowflake_shapes["column_count"].mean(), 2)) log_metric("Median table column count", snowflake_shapes["column_count"].median())
def process_customers(**kwargs): snowflake_hook = SnowflakeHook(snowflake_conn_id=SNOWFLAKE_CONNECTION_ID) customers = snowflake_hook.get_records(select_query) # Process records process_records(customers)
def process_customers_with_monitoring(**kwargs): snowflake_hook = SnowflakeHook(snowflake_conn_id=SNOWFLAKE_CONNECTION_ID) with snowflake_query_tracker(log_tables=False, database=database): customers = snowflake_hook.get_records(select_query) # Process records - Same code process_records(customers)