def execute(self, context):
        snowflake_hook = SnowflakeHook(
            snowflake_conn_id=self.snowflake_conn_id)

        # Snowflake won't accept list of files it has to be tuple only.
        # but in python tuple([1]) = (1,) => which is invalid for snowflake
        files = str(self.s3_keys)
        files = files.replace('[', '(')
        files = files.replace(']', ')')

        # we can extend this based on stage
        base_sql = """
                    FROM @{stage}/
                    files={files}
                    file_format={file_format}
                """.format(stage=self.stage,
                           files=files,
                           file_format=self.file_format)

        if self.columns_array:
            copy_query = """
                COPY INTO {schema}.{table}({columns}) {base_sql}
            """.format(schema=self.schema,
                       table=self.table,
                       columns=",".join(self.columns_array),
                       base_sql=base_sql)
        else:
            copy_query = """
                COPY INTO {schema}.{table} {base_sql}
            """.format(schema=self.schema, table=self.table, base_sql=base_sql)

        self.log.info('Executing COPY command...')
        print(snowflake_hook.get_uri())
        snowflake_hook.run(copy_query, self.autocommit)
        self.log.info("COPY command completed")
def _log_snowflake_resources(
    query_text,
    snowflake_conn_id,
    session_id=None,
    warehouse=None,
    database=None,
    role=None,
    schema=None,
    account=None,
):
    hook = SnowflakeHook(
        snowflake_conn_id,
        warehouse=warehouse,
        account=account,
        database=database,
        role=role,
        schema=schema,
    )
    conn = hook.get_uri()
    conn_params = hook._get_conn_params()
    log_snowflake_resource_usage(
        query_text,
        database=hook.database,
        user=conn_params["user"],
        connection_string=conn,
        session_id=session_id,
    )
def _log_snowflake_table(
    table,
    snowflake_conn_id,
    warehouse=None,
    database=None,
    role=None,
    schema=None,
    account=None,
):
    hook = SnowflakeHook(
        snowflake_conn_id=snowflake_conn_id,
        warehouse=warehouse,
        account=account,
        database=database,
        role=role,
        schema=schema,
    )
    connection_string = hook.get_uri()
    return log_snowflake_table(table, connection_string, database, schema)
Example #4
0
    def execute(self, context):
        hook = SnowflakeHook(snowflake_conn_id=snowflake_conn_id)
        session_id, query_ids = snowflake_run(hook, select_query)
        connection_string = hook.get_uri()

        log_snowflake_table(
            table_name=table,
            connection_string=connection_string,
            database=database,
            schema=schema,
            key=f"example1.{table}",
            with_preview=False,
            raise_on_error=False,
        )

        log_snowflake_resource_usage(
            database=database,
            key=f"example1.{session_id}{query_ids[0]}",
            connection_string=connection_string,
            query_ids=query_ids,
            session_id=int(session_id),
            raise_on_error=True,
        )
def snowflake_table_monitor(**context):
    full_table_path = context["target_table"]
    database, schema, table = full_table_path.split(".")
    snowflake_hook = SnowflakeHook(snowflake_conn_id="test_snowflake_conn")
    with snowflake_query_tracker(database=database,
                                 schema=schema) as snowflake_qt:
        record_count = get_record_count(snowflake_hook, database, table)
        log_metric("records", record_count)

        col_metadata = get_column_info(snowflake_hook, database, schema, table)
        log_metric("column metadata", col_metadata)

        if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
            data = get_random_sample(snowflake_hook, database, schema, table,
                                     SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)

        log_snowflake_table(table,
                            connection_string=snowflake_hook.get_uri(),
                            database=database,
                            schema=schema,
                            with_preview=True,
                            with_schema=True)

    # get difference between last known state of table and the current state
    table_delta = 0
    column_diff = []
    try:
        previous_record_count = Variable.get(
            "{}_record_cnt".format(full_table_path))
        table_delta = previous_record_count - record_count

        previous_col_names = Variable.get(
            "{}_column_names".format(full_table_path))
        column_diff = list(
            set(previous_col_names) - set(col_metadata['column_names']))
    except:
        pass

    col_changed = True if column_diff else False
    log_metric("table_delta", table_delta)
    log_metric("columns_changed", col_changed)
    Variable.set("{}_record_cnt".format(full_table_path), record_count)
    Variable.set("{}_column_names".format(full_table_path),
                 col_metadata['column_names'])

    # log metrics of the sampled data (if sampled)
    if ENABLE_SNOWFLAKE_TABLE_SAMPLE:
        log_metric("sample_size(%)", SNOWFLAKE_TABLE_SAMPLE_ROW_PROB)
        for column in data.columns:
            log_metric("{} null record count".format(column),
                       int(data[column].isna().sum()))

            if issubdtype(data[column].dtype, number):
                log_metric("{} mean".format(column),
                           round(data[column].mean(), 2))
                log_metric("{} median".format(column), data[column].median())
                log_metric("{} min".format(column), data[column].min())
                log_metric("{} max".format(column), data[column].max())
                log_metric("{} std".format(column),
                           round(data[column].std(), 2))

    context['ti'].xcom_push(key="{}_table_delta".format(full_table_path),
                            value=table_delta)
    context['ti'].xcom_push(key="{}_record_count".format(full_table_path),
                            value=record_count)