コード例 #1
0
ファイル: snowflake_tracker.py プロジェクト: databand-ai/dbnd
def log_all_snowflake_tables(cd, tables, reset=True, **kwargs):
    # type: (ConnectionData, Union[bool, List[str]], bool, ...) -> Optional[List[str]]
    """

    :param cd: ConnectionData object
    :param tables: if True - will log automatically discovered tables; if list of strings - will log only mentioned tables
    :param with_preview:
    :param with_schema:
    :param raise_on_error:
    :param reset: should reset the ConnectionData.tables once done
    :return:
    """
    if tables is True:
        tables = [table_op.name for table_op in cd.tables_ops]
    else:
        # don't reset if explicit tables provided
        reset = False

    if tables:
        for table in tables:
            parts = table.split(".")
            table_name = parts.pop()
            schema = parts.pop() if parts else cd.schema
            database = parts.pop() if parts else cd.database

            log_snowflake_table(
                table_name=table_name,
                schema=schema,
                database=database,
                connection_string=cd.connection,
                **kwargs,
            )
    if reset:
        cd.reset_tables()
    return tables
コード例 #2
0
 def execute(self, context):
     hook = self.get_hook()
     return log_snowflake_table(
         table_name=self.table,
         connection_string=hook.get_conn(),
         database=hook.database,
         schema=hook.schema,
         key=self.key,
         with_preview=self.with_preview,
         with_schema=self.with_schema,
         raise_on_error=self.raise_on_error,
     )
コード例 #3
0
    def execute(self, context):
        hook = SnowflakeHook(snowflake_conn_id=snowflake_conn_id)
        session_id, query_ids = snowflake_run(hook, select_query)
        connection_string = hook.get_uri()

        log_snowflake_table(
            table_name=table,
            connection_string=connection_string,
            database=database,
            schema=schema,
            key=f"example1.{table}",
            with_preview=False,
            raise_on_error=False,
        )

        log_snowflake_resource_usage(
            database=database,
            key=f"example1.{session_id}{query_ids[0]}",
            connection_string=connection_string,
            query_ids=query_ids,
            session_id=int(session_id),
            raise_on_error=True,
        )
コード例 #4
0
def snowflake_table_monitor(**context):
    full_table_path = context["target_table"]
    database, schema, table = full_table_path.split(".")
    snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn")
    with snowflake_query_tracker(database=database,
                                 schema=schema) as snowflake_qt:
        record_count = get_record_count(snowflake_hook, database, table)
        log_metric("records", record_count)

        col_metadata = get_column_info(snowflake_hook, database, schema, table)
        log_metric("column metadata", col_metadata)

        if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
            data = get_random_sample(snowflake_hook, database, schema, table,
                                     SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)

        log_snowflake_table(table,
                            connection_string=snowflake_hook.get_uri(),
                            database=database,
                            schema=schema,
                            with_preview=True,
                            with_schema=True)

    # get difference between last known state of table and the current state
    table_delta = 0
    column_diff = []
    try:
        previous_record_count = Variable.get(
            "{}_record_cnt".format(full_table_path))
        table_delta = previous_record_count - record_count

        previous_col_names = Variable.get(
            "{}_column_names".format(full_table_path))
        column_diff = list(
            set(previous_col_names) - set(col_metadata['column_names']))
    except:
        pass

    col_changed = True if column_diff else False
    log_metric("table_delta", table_delta)
    log_metric("columns_changed", col_changed)
    Variable.set("{}_record_cnt".format(full_table_path), record_count)
    Variable.set("{}_column_names".format(full_table_path),
                 col_metadata['column_names'])

    # log metrics of the sampled data (if sampled)
    if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
        log_metric("sample_size(%)", SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)
        for column in data.columns:
            log_metric("{} null record count".format(column),
                       int(data[column].isna().sum()))

            if issubdtype(data[column].dtype, number):
                log_metric("{} mean".format(column),
                           round(data[column].mean(), 2))
                log_metric("{} median".format(column), data[column].median())
                log_metric("{} min".format(column), data[column].min())
                log_metric("{} max".format(column), data[column].max())
                log_metric("{} std".format(column),
                           round(data[column].std(), 2))

    context['ti'].xcom_push(key="{}_table_delta".format(full_table_path),
                            value=table_delta)
    context['ti'].xcom_push(key="{}_record_count".format(full_table_path),
                            value=record_count)