def execute(self, context):
        self.log.info(
            f"LoadDimensionOperator starts execution for table '{self.dim_table_name}'"
        )

        # Connection to Redshift
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.log.info("Connection to Redshift has been made.")

        # Create dim table
        self.log.info(
            f"Create dim table '{self.dim_table_name}' if not exists")
        redshift.run(f"{self.dim_table_sql_create}")
        self.log.info(
            f"Dimension table '{self.dim_table_name}' has been created")

        # LOG information
        self.log.info(
            f"Inserting entries to dim table '{self.dim_table_name}'")

        # Check for the STATS output to see in log file how many rows have been inserted
        entries_before = redshift.get_first(
            f"SELECT COUNT(1) FROM {self.dim_table_name};")

        # Insert data into dim table. Operation_mode should be in 'truncate_load' mode
        if self.operation_mode == "truncate_load":
            self.log.info(
                f"Data for dim table '{self.dim_table_name}' works in "
                f"'{self.operation_mode}' mode.")

            redshift.run(f"{self.dim_table_sql_truncate}")
            redshift.run(f"{self.dim_table_sql_insert}")

            self.log.info(
                f"Data for dim table '{self.dim_table_name}' has been inserted."
            )

        elif self.operation_mode == "append_only":
            self.log.info(
                f"Data for dim table '{self.dim_table_name}' works in "
                f"'{self.operation_mode}' mode.")

            redshift.run(f"{self.dim_table_sql_insert}")
            self.log.info(
                f"Adding data to dimension table '{self.dim_table_name}' should in "
                f"operation_mode = 'truncate_load'. This mode inserts ONLY new entries!"
            )
        else:
            raise ValueError(
                f"Invalid operation_mode '{self.operation_mode}'. "
                "Configure operation_mode as 'truncate_load' or 'append_only'."
            )

        # Check for the STATS output to see in log file how many rows have been inserted
        entries_after = redshift.get_first(
            f"SELECT COUNT(1) FROM {self.dim_table_name};")
        entries_inserted = entries_after[0] - entries_before[0]

        self.log.info(
            f"STATS: Before insert: {entries_before[0]}; After insert: {entries_after[0]}; "
            f"Diff: {entries_inserted}")
Example #2
def create_common_countries_table():
    '''
    Creates a common country_or_area table from commodities_staging and temperature_staging
    '''

    table = "country_or_area"
    postgres_hook = PostgresHook(postgres_conn_id='postgres', schema='world')
    engine = postgres_hook.get_sqlalchemy_engine()

    min_year_commodities = postgres_hook.get_first(
        "select min(year) from commodities_staging;")[0]
    max_year_commodities = postgres_hook.get_first(
        "select max(year) from commodities_staging;")[0]

    get_countries_from_commodities_staging = "select distinct(country_or_area) from commodities_staging;"
    get_countries_from_temperature_staging = f"select distinct(country_or_area) from temperature_staging where year >= {min_year_commodities} and year <= {max_year_commodities};"

    commodities_countries_records = postgres_hook.get_records(
        get_countries_from_commodities_staging)
    temperature_countries_records = postgres_hook.get_records(
        get_countries_from_temperature_staging)

    commodities_countries_set = set(
        reduce(operator.concat, commodities_countries_records))
    temperature_countries_set = set(
        reduce(operator.concat, temperature_countries_records))

    common_country_set = commodities_countries_set.union(
        temperature_countries_set)
    print(f"common_country_set: {common_country_set}")

    country_or_area_df = pd.DataFrame(list(common_country_set),
                                      columns=['country_or_area'])
    country_or_area_df.to_sql(table, engine, index=False, if_exists="append")
Example #3
    def execute(self, context):
        redshift_hook = PostgresHook("redshift")

        for stmt in self.check_stmts:
            result = int(redshift_hook.get_first(sql=stmt['sql'])[0])

            # check if equal
            if stmt['op'] == 'eq':
                if result != stmt['val']:
                    raise AssertionError(
                        f"Data Quality Check failed: {result} {stmt['op']} {stmt['val']}"
                    )
            # check if not equal
            elif stmt['op'] == 'ne':
                if result == stmt['val']:
                    raise AssertionError(
                        f"Data Quality Check failed: {result} {stmt['op']} {stmt['val']}"
                    )
            # check if greater than
            elif stmt['op'] == 'gt':
                if result <= stmt['val']:
                    raise AssertionError(
                        f"Data Quality Check failed: {result} {stmt['op']} {stmt['val']}"
                    )

            self.log.info(
                f"Data Quality Check Passed: {result} {stmt['op']} {stmt['val']}"
            )
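The loop above expects each entry of check_stmts to be a dict with a query, an operator code ('eq', 'ne' or 'gt') and an expected value; a minimal sketch of that structure (table and column names are illustrative only):

check_stmts = [
    {'sql': 'SELECT COUNT(*) FROM songplays WHERE playid IS NULL', 'op': 'eq', 'val': 0},
    {'sql': 'SELECT COUNT(*) FROM users', 'op': 'gt', 'val': 0},
]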
Example #4
    def execute(self, context):
        """
        Perform data quality checks on resulting fact and dimension tables.

        Parameters:
        ----------
        redshift_conn_id: string
            airflow connection to redshift cluster
        table: string
            table located in redshift cluster
        test_stmt: string
            test SQL command to check validity of target table
        result: string
            result of test_stmt to check validity
        """
        pg_hook = PostgresHook(self.redshift_conn_id)
        records = pg_hook.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(f"Fail: No results for {self.table}")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(f"Fail: 0 rows in {self.table}")

        if self.test_stmt:
            output = pg_hook.get_first(self.test_stmt)
            if self.result != output:
                raise ValueError(f"Fail: {output} != {self.result}")
        self.log.info(f"Success: {self.table} has {records[0][0]} records")
Example #5
    def execute(self, context):
        """
        Perform data quality checks on resulting fact and dimension tables.
        Parameters:
        ----------
        redshift_conn_id: string
            airflow connection to redshift cluster
        table: string
            table located in redshift cluster
        test_stmt: string
            test SQL command to check validity of target table
        result: string
            result of test_stmt to check validity
        """

        aws_hook = PostgresHook(self.redshift_conn_id)
        for table in self.tables:
            records = aws_hook.get_records(f"SELECT COUNT(*) FROM {table}")

            if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
                self.log.error(
                    f"Data quality check failed. {table} returned no results")
                raise ValueError(
                    f"Data quality check failed. {table} returned no results")

            self.log.info(
                f"Data quality on table {table} check passed with {records[0][0]} records"
            )

        if self.test_stmt:
            output = aws_hook.get_first(self.test_stmt)
            if self.result != output:
                raise ValueError(f"Fail: {output} != {self.result}")
Example #6
    def execute(self, context):
        self.log.info(
            'Detect number of entries per table, optionally compare to expected numbers'
        )

        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        for idx, table in enumerate(self.tables):
            query = f"SELECT COUNT(*) FROM {table};"
            count = redshift.get_first(query)[0]
            print(f"result of {query} is {count}")
            if len(self.expected_counts) > 0:
                expected_count = self.expected_counts[idx]
                if count != expected_count:
                    print(
                        f"Validation error: table {table} contains {count} records while {expected_count} where expected."
                    )
                else:
                    print(
                        f"Validation success: table {table} contains {count} records as expected."
                    )
            else:
                if count > 0:
                    print(
                        f"Validation success: table {table} contains {count} records."
                    )
                else:
                    print(
                        f"Validation error: table {table} contains no records while some records where expected."
                    )

        return True
Example #7
    def execute(self, context):
        self.log.info('DataQualityOperator starting data quality checks')
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        for table in self.tables:
            for stmt in self.stmts_checks:
                self.log.info(stmt['sql'].format(self.sql_schema, table))
                sql = stmt['sql'].format(self.sql_schema, table)
                result = int(redshift.get_first(sql)[0])
                strError = 'Check failed: {} {} {}'.format(
                    result, stmt['op'], stmt['val'])
                # Check greater than
                if stmt['op'] == 'gt' and result <= stmt['val']:
                    raise AssertionError(strError)
                # Check equal
                elif stmt['op'] == 'eq' and result != stmt['val']:
                    raise AssertionError(strError)
                # Check if not equal
                elif stmt['op'] == 'ne' and result == stmt['val']:
                    raise AssertionError(strError)

                self.log.info('Passed check: {} {} {}'.format(
                    result, stmt['op'], stmt['val']))
Example #8
    def execute(self, context):
        '''
        Perform data quality checks by running a list of queries.
        Parameters:
        ----------
        conn_id (string) : Airflow connection to redshift cluster
        queries   (list) : List of check queries, specified as {'sql':'SELECT COUNT(*) FROM time WHERE hour < 0', 'expect':0}           
        '''
        redshift = PostgresHook(postgres_conn_id=self.conn_id)

        for query in self.queries:

            sql = query.get('sql')
            if sql is None:
                self.log.error(
                    'Data quality check: no SQL expression specified.')
                break

            expect = query.get('expect')
            if expect is None:
                expect = 0

            count = redshift.get_first(sql)[0]  # https://stackoverflow.com/a/59420411
            if (count != expect):
                self.log.error(
                    f'Check failed: {sql} returns {count}, expected: {expect}')
            else:
                self.log.info(f'Check passed: {sql} returns {count}.')
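A hypothetical instantiation using the queries format described in the docstring above (the operator class name and connection id are assumptions):

quality_checks = DataQualityOperator(
    task_id="data_quality_checks",
    conn_id="redshift",
    queries=[
        {'sql': 'SELECT COUNT(*) FROM time WHERE hour < 0', 'expect': 0},
        {'sql': 'SELECT COUNT(*) FROM users WHERE userid IS NULL'},   # 'expect' defaults to 0
    ],
    dag=dag,
)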
Example #9
    def execute(self, context) -> None:
        postgres_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
        credentials = s3_hook.get_credentials()
        credentials_block = build_credentials_block(credentials)
        copy_options = '\n\t\t\t'.join(self.copy_options)

        copy_statement = self._build_copy_query(credentials_block, copy_options)

        self.log.info("Creating the staging table...")
        postgres_hook.run(self.create_table_sql)
        self.log.info("Creating the staging table complete...")

        self.log.info('Executing COPY command...')
        postgres_hook.run(copy_statement)
        self.log.info("COPY command complete...")

        self.log.info("Logging the number of rows and files on S3 affected...")
        number_of_rows = postgres_hook.get_first(f"SELECT count(*) FROM {self.schema}.{self.table}")[0]
        number_of_keys_s3 = s3_hook.list_keys(bucket_name=self.s3_bucket, prefix=self.s3_key)

        self.log.info(f"{self.schema}.{self.table} has {number_of_rows} rows")
        self.log.info(f"{self.s3_bucket}/{self.s3_key} has {len(number_of_keys_s3)} files")

        self.log.info("Logging the number of rows and files on S3 affected complete...")
Example #10
 def execute(self, context):
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     for quality_check in self.queries_and_results:
         self.log.info("Running data validation query")
         result = redshift.get_first(quality_check['query'])
         self.log.info(f"result: {result}")
         if result[0] != quality_check['result']:
             raise ValueError(
                 f"Data quality check failed: {quality_check['query']} "
                 f"returned {result[0]}, expected {quality_check['result']}")
Example #11
 def execute(self, context):
     redshift_hook=PostgresHook(self.redshift_conn_id)
     for (sql_query,result) in self.sql_queries:
         row=redshift_hook.get_first(sql_query)
         if row is not None and row[0] == result:
             self.log.info('The result of {} matched the expected result {}\n==================================='.format(sql_query, result))
         else:
             raise ValueError('Test failed: {} did not match the expected result {}\n==================================='.format(sql_query, result))
Example #12
 def test_writer(self):
     rows = [{"foo": "%s" % i} for i in range(0, 100)]
     with self.dataset.get_writer(chunksize=10) as writer:
         for row in rows:
             writer.write_row_dict(row)
     db = PostgresHook("postgres_test")
     count = db.get_first("SELECT COUNT(*) FROM test.test")[0]
     self.assertEqual(count, 100)
Example #13
def check_table_exists(check_table_exists_sql):
    print("checking sql={}".format(check_table_exists_sql))
    hook = PostgresHook()
    records = hook.get_first(check_table_exists_sql)
    if records is None:
        return "create_table"
    else:
        return "skip_table_creation"
Example #14
def monitor_redshift_db(**op_kwarg):
    """Redshift database monitor collects the following metrics:
        - Number of tables in database
        - Shape of each table in the database
        - Min, max, mean, median number of rows across all tables,
        - Min, max, mean, median number of columns across all tables,
        - Total number of rows and columns
        - Largest tables by row and column
        - Disk capacity, Free space on disk, Used space on disk (in GB)
        - Disk percent usage
    """
    hook = PostgresHook(REDSHIFT_CONN_ID)
    num_redshift_tables = hook.get_first(COUNT_TABLES, parameters=[TARGET_SCHEMA])[0]
    log_metric("table count", num_redshift_tables)

    table_row_counts = hook.get_records(COUNT_TABLE_ROWS, parameters=[TARGET_SCHEMA])
    num_rows_per_table = {}
    for tablename, row_count in table_row_counts:
        num_rows_per_table[tablename] = int(round(row_count))

    row_counts = list(num_rows_per_table.values())
    log_metric("Max table row count", max(row_counts))
    log_metric("Min table row count", min(row_counts))
    log_metric("Mean table row count", round(mean(row_counts), 2))
    log_metric("Median table row count", median(row_counts))

    tables = hook.get_pandas_df(DESCRIBE_TABLES, parameters=[TARGET_SCHEMA])
    table_shapes = DataFrame()
    table_shapes["columns"] = tables.groupby("tablename").nunique("column")["column"]
    table_shapes["tablename"] = tables["tablename"].unique()
    table_shapes["rows"] = (
        table_shapes["tablename"].map(num_rows_per_table).fillna(0).astype(int)
    )

    for _, row in table_shapes.iterrows():
        log_metric("{} shape".format(row["tablename"]), (row["columns"], row["rows"]))

    log_metric("Max table column count", table_shapes["columns"].max())
    log_metric("Min table column count", table_shapes["columns"].max())
    log_metric("Mean table column count", round(table_shapes["columns"].mean(), 2))
    log_metric("Median table column count", table_shapes["columns"].median())

    log_metric("Total columns", table_shapes["columns"].sum())
    log_metric("Total rows", table_shapes["rows"].sum())

    max_row_table = table_shapes[table_shapes["rows"] == table_shapes["rows"].max()]
    max_col_table = table_shapes[
        table_shapes["columns"] == table_shapes["columns"].max()
    ]
    log_metric("Largest table (by row count)", max_row_table["tablename"][0])
    log_metric("Largest table (by col count)", max_col_table["tablename"][0])

    disk_stats = hook.get_records(DISK_USAGE).pop()
    disk_capacity, disk_used, disk_free = disk_stats
    log_metric("Disk capacity (GB)", disk_capacity)
    log_metric("Disk used (GB)", disk_used)
    log_metric("Disk free (GB)", disk_free)
    log_metric("Percent Disk usage", round((disk_used / disk_capacity) * 100, 2))
Example #15
 def execute(self, context):
     self.log.info(f"Running Data Quality {'tests' if len(self.sql_test) > 1 else 'test'}")
     hook = PostgresHook(self.conn_id)
     for (test_sql, expectation) in zip(self.sql_test,self.expected_result):
         result = hook.get_first(test_sql)[0]
         if str(result)!=expectation:
             self.log.info(f"Running test SQL: \n{test_sql}")
             raise ValueError(f'This test did not pass \n{test_sql}')
     self.log.info("All tests passed!")
Example #16
class PostgresXcomOperator(PostgresOperator):
    """ The regular PostgresOperator does not return a value,
        so it cannot push an XCom.
    """
    def execute(self, context):
        self.log.info("Executing: %s", self.sql)
        self.hook = PostgresHook(postgres_conn_id=self.postgres_conn_id,
                                 schema=self.database)
        return self.hook.get_first(self.sql, parameters=self.parameters)
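Because execute() returns the result of get_first(), that value is pushed to XCom (with the default do_xcom_push) and can be pulled by a downstream task; a hypothetical usage sketch with illustrative task, table and column names:

get_max_id = PostgresXcomOperator(
    task_id="get_max_id",
    postgres_conn_id="postgres",
    sql="SELECT MAX(id) FROM events;",
    dag=dag,
)

def use_max_id(**context):
    row = context["ti"].xcom_pull(task_ids="get_max_id")  # e.g. (42,)
    print(row[0])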
Example #17
def last_timestamp_loaded(ops_type):
    pg_hook = PostgresHook(postgres_conn_id='ctd')
    ret = pg_hook.get_first(
        'select max(ops_timestamp) from ods.tz_data where ops_type = %s',
        parameters=(ops_type, ))
    if not ret[0]:
        return utc_timestamp(INITIAL_LOAD_UTC)
    else:
        return ret[0]
Example #18
 def runSql(self, x):
     myObj = self.tables
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     self.log.info('connection to Redshift successful...')
     formatted_sql = DataQualityOperator.statement.format(
         myObj["tables"][x], myObj["fields"][x])
     num = redshift.get_first(formatted_sql)
     num = num[0]
     return num
Example #19
    def execute(self, context):
        self.log.info('Checking data quality')

        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        result = redshift.get_first(self.sql_test)
        result = result[0]

        if result != self.expected_result:
            self.log.info(f'Generated result: {result}')
            raise ValueError("Data validation fails")
Example #20
    def execute(self, context):
        self.log.info('Running DataQualityOperator')

        redshift = PostgresHook(postgres_conn_id=self.conn_id)

        for f in self.fmt:
            query = self.query.format(f)
            res = redshift.get_first(query)[0]

            if res == self.failure_value:
                raise ValueError(f"failed query {query}, failure {self.failure_value}")
Example #21
    def init():
        hook = PostgresHook()
        query = '''
                    SELECT 1
                    FROM information_schema.tables
                    WHERE table_schema = %s
                          AND table_name = %s
        '''
        is_exist = hook.get_first(sql=query, parameters=(schema, table))

        return ['do_nothing' if is_exist else 'create_table']
Example #22
 def execute(self, context):
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     for sql, expected_result in self.checks:
         result = redshift.get_first(sql)[0]
         assert (result == expected_result), f"""
             Data quality check failed!
             Query: {sql}
             Expected result: {expected_result}
             Actual result: {result}
         """
     self.log.info("All checks passed!")
Example #23
def execute_data_quality_checks(query, tables):
  logger = logging.getLogger(__name__)
  pg_hook = PostgresHook('redshift_lake')

  for table in tables:
    statement = query + table
    result = pg_hook.get_first(statement)
    if result is None:
      raise Exception(f'Load of data into table {table} failed, please review')

  return True
Example #24
    def _extract_last_updated_value(self) -> str:
        hook = PostgresHook(postgres_conn_id=self.destination_conn_id)
        last_updated_field = hook.get_first(sql=self.last_updated_sql)[0]
        if not last_updated_field:
            self.log.info(
                f'Last event value not found, '
                f'using default value - {self.default_last_updated_value}'
            )
            return self.default_last_updated_value

        self.log.info(f'Last event value was {last_updated_field}')
        return last_updated_field
Example #25
class PostgreSQLCountRowsOperator(BaseOperator):
    @apply_defaults
    def __init__(self, table_name, postgres_conn_id, *args, **kwargs):
        super(PostgreSQLCountRowsOperator, self).__init__(*args, **kwargs)
        self.table_name = table_name
        self.postgres_conn_id = postgres_conn_id

    def execute(self, context):
        # Create the hook at execution time rather than in __init__,
        # so no database connection is set up while the DAG file is parsed.
        hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        result = hook.get_first(
            sql=f'SELECT COUNT(*) FROM {self.table_name};')
        return result
Example #26
    def execute(self, context):
        redshift_hook = PostgresHook(self.redshift_conn_id)
        self.log.info('DataQualityOperator started.')
        for (sql_stmt, answer) in self.sql_stmts:
            row = redshift_hook.get_first(sql_stmt)
            if row is not None:
                if row[0] == answer:
                    self.log.info("Test {} Passed.".format(sql_stmt))
                else:
                    raise ValueError("Test {} Failed.".format(sql_stmt))

        self.log.info('DataQualityOperator finished.')
Example #27
    def execute(self, context):
        # Establish Connection Hooks
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        # Execute Tests & Compare to Expected Results
        for test in range(len(self.test_queries)):
            test_result = redshift.get_first(self.test_queries[test])
            if test_result[0] != self.expected_results[test]:
                raise ValueError('Test no. {} failed;\n {}'.format(
                    test, self.test_queries[test]))
            else:
                self.log.info("Test no. {} passed".format(test))
Example #28
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        test_result = redshift.get_first(self.sql)

        # Test result is a tuple, extract only the first element, which should be a number
        if self.test_operator(test_result[0], self.result):
            self.log.info(f"Data quality check passed!")
        else:
            raise ValueError(
                f"Test did not pass. Test result: {test_result}, expected: {self.result}."
            )
Example #29
 def f_check_table_exists(table_name):
     connect = PostgresHook(postgres_conn_id=database)
     query = """
         select count(1) 
           from information_schema.tables 
          where table_schema not like  %s
            and table_name = %s
         """
     res = connect.get_first(query, parameters=('', table_name))
     if res[0] == 0:
         return 'create_table'
     else:
         return 'table_exists'
Example #30
def query_latest_id(task_type, etl_hook: PostgresHook):
    """
    Query the latest id for the given task type.
    :param task_type: task type key used to filter cn_spider_snapshot
    :param etl_hook: PostgresHook for the ETL database
    :return: the latest id, or None if no matching row has one
    """

    la = etl_hook.get_first(
        "select task_type,latest_id from cn_spider_snapshot where task_type=%s",
        parameters=(task_type, ))
    if la and la[1]:
        return la[1]
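A short sketch of how this helper might be called; the connection id and task_type value are illustrative:

hook = PostgresHook(postgres_conn_id="etl")
latest_id = query_latest_id("company_announcement", hook)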