def execute(self, context):
     self.log.info('DataQualityOperator Started!')
     redshift_hook = PostgresHook(self.redshift_conn_id)
     records = redshift_hook.get_records(
         f"SELECT COUNT(*) FROM {self.table}")
     if len(records) < 1 or len(records[0]) < 1:
         raise ValueError(
             f"Data quality check failed. {self.table} has no values")
     num_records = records[0][0]
     if num_records < 1:
         raise ValueError(
             f"Data quality check failed. {self.table} contained 0 rows")
     self.log.info(
         f"Data quality on table {self.table} check passed with {records[0][0]} records"
     )
Esempio n. 2
0
    def execute(self, context):
        redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        for table in self.tables:
            records = redshift_hook.get_records(
                f"select count(*) from {table};")

            if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
                self.log.error(
                    f"Data Quality validation failed for table : {table}.")
                raise ValueError(
                    f"Data Quality validation failed for table : {table}")

            self.log.info(
                f"Data Quality Validation Passed on table : {table}!!!")
    def execute(self, context):
        """
        The execute function performs the following steps:
        - Executes the SQL Query for Data Quality check and gets the output of sql query
        - Compares output of SQL query with Expected output
        - If expected output is same as actual output, we see success message in the logs and if not we see an error
        """
        redshift_hook = PostgresHook("redshift")

        for check in self.dq_checks:
            sql = check["check_sql"]
            exp = check["expected_result"]
            records = redshift_hook.get_records(sql)
            exprecords = redshift_hook.get_records(exp)
            num_records = records[0][0]
            exp_records = exprecords[0][0]
            if num_records != exp_records:
                raise ValueError(
                    f"Data quality check failed. Expected: {exp_records} | Got: {num_records}"
                )
            else:
                self.log.info(
                    f"Data quality on SQL {sql} check passed with {records[0][0]} records"
                )
    def execute(self, context):
        redshift = PostgresHook(self.redshift_conn_id)

        for check in self.checks:
            query = check['query']
            expected_result = check['expected_result']

            results = redshift.get_records(query)

            self.log.info(f'Results {results}')

            if results and results[0][0] != expected_result:
                raise ValueError(
                    f"Data quality check failed. "
                    f"The query `{query}` returned {expected_result} records")
Esempio n. 5
0
 def execute(self, context):
     redshift_hook = PostgresHook(self.redshift_conn_id)
     for table in self.tables:
         records = redshift_hook.get_records(
             f"SELECT COUNT(*) FROM {table}")
         if len(records) < 1 or len(records[0]) < 1:
             raise ValueError(
                 f"Data quality check failed. {table} returned no results")
         num_records = records[0][0]
         if num_records < 1:
             raise ValueError(
                 f"Data quality check failed. {table} contained 0 rows")
         logging.info(
             f"Data quality on table {table} check passed with {records[0][0]} records"
         )
    def execute(self, context):
        redshift = PostgresHook(self.redshift_conn_id)
        #table = context['task_instance'].xcom_pull(key='table_name', task_ids='load_public.artists_table')

        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(
                f"Data quality check failed. {self.table} returned no results")
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError(
                f"Data quality check failed. {self.table} contained 0 rows")
        self.log.info(
            f"Data quality on table {self.table} check passed with {records[0][0]} records"
        )
 def execute(self, context):
     redshift_hook = PostgresHook(self.redshift_conn_id)
     
     for table in self.tables:
         self.log.info(f"Data Quality check for {table['name']} table in progress.")
         
         records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table['name']}")
         num_records = records[0][0]
         
         if len(records) < table['expected_result'] or len(records[0]) < table['expected_result']:
             raise ValueError(f"Data quality check failed. {table['name']} returned no results.")                                       
         elif num_records < table['expected_result']:
             raise ValueError(f"Data quality check failed. {table['name']} contained 0 rows.")             
         else:
             self.log.info(f"Data quality check on table {table['name']} passed with {records[0][0]} records.")
    def execute(self, context):
        """
        This function count rows in destinaton tables and flag if records are populated.
        :param context: defined above
        :return: log messages
        """
        redshift_hook = PostgresHook(self.redshift_conn_id)

        for test in self.sql_test:
            record = redshift_hook.get_records(self.sql_test)
            if record == 0:
                self.log.info(f'zero records running: {self.sql_test}')
            else:
                self.log.info(f'records present running: {self.sql_test}')
                raise ValueError("ValueError exception thrown")
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        for check in self.quality_check_tables:
            records = redshift.get_records(check['sql'])[0]
            if check['type'] == 'eq' and records[0] != check['comparison']:
                raise ValueError(
                    f"Data quality check failed! The query {check['sql']} expected {check['comparison']}, but received {records[0]}"
                )
            elif check['type'] == 'gt' and records[0] <= check['comparison']:
                raise ValueError(
                    f"Data quality check failed! The query {check['sql']} expected a value greater the  {check['comparison']}, but received {records[0]}"
                )

        self.log.info("Data Quality Check Completed Succesfully")
Esempio n. 10
0
 def execute(self, context):
     redshift_hook = PostgresHook(self.redshift_conn_id)
     for table in self.tables:
         records = redshift_hook.get_records(
             f"SELECT COUNT(*) FROM {table}")
         if not records or len(records) < 1 or len(records[0]) < 1:
             raise ValueError(
                 f"Data Quality check failed: {table} has no results")
         num_records = records[0][0]
         if num_records < 1:
             raise ValueError(
                 f"Data Quality Check failed: {table} has 0 records")
         self.log.info(
             f"Data Quality on table {table} check passed with {num_records} records"
         )
Esempio n. 11
0
 def execute(self, context):
     redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     for check, expectation in self.sql_checks_and_expects:
         records = redshift.get_records(check)
         if len(records) < 1 or len(records[0]) < 1:
             raise ValueError(
                 f"Data quality check failed. \"{check}\" returned no results"
             )
         result = records[0][0]
         if result != expectation:
             raise ValueError(
                 f"Data quality check failed. "
                 "{check}\" expected {expectation} but returned {result} ")
         self.log.info(f"Data quality check \"{check}\" passed.")
     self.log.info("Successfully completed all data quality checks")
    def execute(self, context):
        if not self.sw_skip_data_quality_checks:

            conn = PostgresHook(self.postgres_conn_id)

            errors = []
            info = []
            for table in self.data_quality_tables:
                records = conn.get_records(f"select count(*) from {table}")
                if len(records) < 1 or len(records[0]) < 1:
                    errors.append(f"Data quality check failed. {table} returned no results")
                else:
                    num_records = records[0][0]
                    if num_records < 1:
                        errors.append(f"ERROR - Data quality check failed. {table} contains {num_records} records")
                    else:
                        info.append(f"SUCCESS - Data quality check passed. {table} contains {num_records} records")

                if table == "artists":
                    columns = ['latitude','location','longitude']
                    expected = [0,0,0]
                    for idx, column in enumerate(columns):
                        records = conn.get_records(f"select count(*) from {table} where 1 = 1 and {column} is null")
                        actual = records[0][0]
                        if actual != expected[idx]:
                            errors.append(f"ERROR - Data quality check failed. {table}.{column} is null...actual {actual} does not equal expected {expected[idx]}")

            for i in info:
                logging.info(i)
        
            errors_encountered = False
            for e in errors:
                errors_encountered = True
                logging.info(e)
            if errors_encountered:
                raise ValueError("Data quality check failed.")
Esempio n. 13
0
    def execute(self, context):
        # AWS Hook
        aws_hook = AwsHook(self.aws_credentials_id)
        credentials = aws_hook.get_credentials()
        # RedShift Hook
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        # Test each table
        for table_dict in self.table_info_dict:
            table_name = table_dict["table_name"]
            column_that_should_not_be_null = table_dict["not_null"]
            # Check number of records (pass if > 0, else fail)
            records = redshift.get_records(
                f"SELECT COUNT(*) FROM {table_name}")
            if len(records) < 1 or len(records[0]) < 1:
                raise ValueError(
                    f"Data quality check failed. {table_name} returned no results"
                )
            elif records[0][0] < 1:
                raise ValueError(
                    f"Data quality check failed. {table_name} contained 0 rows"
                )
            else:
                # Now check is NOT NULL columns contain NULL
                null_records = redshift.get_records(
                    f"SELECT COUNT(*) FROM {table_name} WHERE {column_that_should_not_be_null} IS NULL"
                )
                if null_records[0][0] > 0:
                    col = column_that_should_not_be_null
                    raise ValueError(
                        f"Data quality check failed. {table_name} contained {null_records[0][0]} null records for {col}"
                    )
                else:
                    self.log.info(
                        f"Data quality on table {table_name} check passed with {records[0][0]} records"
                    )
Esempio n. 14
0
    def execute(self, context):
        redshift_hook = PostgresHook(self.redshift_conn_id)
        records = redshift_hook.get_records("SELECT COUNT(*) FROM %s " %
                                            self.table_name)
        if len(records) < 1 or len(records[0]) < 1:
            raise ValueError(
                "Data quality check failed. %s returned no results" %
                self.table_name)
        num_records = records[0][0]
        if num_records < 1:
            raise ValueError("Data quality check failed. %s contained 0 rows" %
                             self.table_name)
        self.log.info("Data quality on table %s check passed with %s records" %
                      (self.table_name, records[0][0]))

        column_records = redshift_hook.get_records(
            "SELECT COUNT(*) FROM %s WHERE %s = NULL" %
            (self.table_name, self.column))
        if len(column_records[0]) > 1:
            raise ValueError(
                "Data quality check failed. %s returned NULL value" %
                self.column)

        self.log.info('Data Quality check done')
Esempio n. 15
0
    def execute(self, context):
        redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        for table in self.tables:
            records = redshift_hook.get_records(self.select_sql.format(table))

            if records is None or len(records[0]) < 1:
                self.log.error(
                    f"No Records Presents in destination table {table}")
                raise ValueError(
                    f"No Records Presents in destination table {table}")

            self.log.info(f"""
                Data Quality Check on table {table} check
                passed with {records[0][0]} records
                """)
Esempio n. 16
0
 def execute(self, context):
     '''
     Checks if the primary key column has any null value. If so throws an exception
     '''
     redshift_hook = PostgresHook(self.redshift_conn_id)
     records = redshift_hook.get_records(
         f"SELECT COUNT(*) FROM {self.table} WHERE {self.pkey_col} IS NULL")
     num_records = records[0][0]
     if num_records > 0:
         raise ValueError(
             f"Data quality check failed. Primary key column {self.pkey_col} in table {self.table} contained {records[0][0]} null values"
         )
     self.log.info(
         f"Data quality check on primary key column {self.pkey_col} in table {self.table} passed with 0 null values"
     )
Esempio n. 17
0
    def execute(self, context):
        self.log.info('DataQualityOperator implemented and running')
        redshift_hook_conn = PostgresHook(
            postgres_conn_id=self.redshift_conn_id)

        self.log.info("Running Data Quality test")
        for test_query in self.test_queries:
            rows = redshift_hook_conn.get_records(test_query)
            if rows[0][0] != self.expected_result:
                raise ValueError(f"""
                    Data quality test failed. \
                    {rows[0][0]} does not equal {self.expected_result}
                """)
            else:
                self.log.info("Data quality test passed")
Esempio n. 18
0
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.conn_id)

        for table in self.non_empty_tables:
            self.log.info("Counting rows in {} table...".format(table))

            sql = DataQualityOperator.count_sql.format(table_name=table)            
            records = redshift.get_records(sql)
            if len(records) < 1 or len(records[0]) < 1:
                raise ValueError(f"Data quality check failed. {table} returned no results.")

            num_records = records[0][0]
            if num_records < 1:
                raise ValueError(f"Data quality check failed. {table} contained 0 rows.")
            self.log.info(f"Data quality on table {table} check passed with {records[0][0]} records.")
    def execute(self, context):
        self.log.info('LoadDimensionOperator not implemented yet')
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        #Delete the table
        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")
        num_records = records[0][0]
        if num_records > 0:
            redshift.run("Delete from {}".format(self.table))

        #Insert statement
        sql_statement = ("Insert into {} ({})".format(self.table, self.sql))

        #Run the insert statement
        redshift.run(sql_statement)
Esempio n. 20
0
    def execute(self, context):
        self.log.info(
            'DataQualityOperator: Checking the data itself for nulls')

        redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        records = redshift_hook.get_records(self.sql_test)

        # check if test failed
        if records[0] != self.sql_result:
            raise ValueError("""
                Too many Nulls found. \
                expected {}, but {} found 
            """.format(self.sql_result, records[0]))
        else:
            self.log.info("Data quality check passed")
Esempio n. 21
0
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        for table in self.tables:
            recs = redshift.get_records(self.check_sql.format(table))
            if (len(recs) < 1) or (len(recs[0]) < 0):
                self.log.error("table %s is empty".format(table))
                raise ValueError(
                    "Table %s failed data quality operator".format(table))
            num_records = recs[0][0]
            if num_records == 0:
                self.log.error("No records found in {}".format(table))
                raise ValueError("No records found in {}".format(table))
            pass_msg = "Data Quality check passed of table {}".format(table)
            self.log.info(pass_msg)
Esempio n. 22
0
    def execute(self, context):
        postgres = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.log.info("Extracting data from Redshift: %s", self.sql)
        results = postgres.get_records(self.sql)

        mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            self.log.info("Running MySQL preoperator")
            self.log.info(self.mysql_preoperator)
            mysql.run(self.mysql_preoperator)

        self.log.info("Inserting rows into MySQL")
        mysql.insert_rows(table=self.mysql_table,
                          rows=results,
                          replace=self.replace)
Esempio n. 23
0
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        for table in self.tables:
            records = redshift.get_records(DQC.count_check.format(table))
            if len(records) < 1 or len(records[0]) < 1:
                self.log.error("{} returned no results".format(table))
                raise ValueError(
                    "Data quality check failed. {} returned no results".format(
                        table))
            num_records = records[0][0]
            if num_records == 0:
                self.log.error(
                    "No records present in destination table {}".format(table))
                raise ValueError(
                    "No records present in destination {}".format(table))
            else:
                self.log.info(
                    "Data quality on table {} check passed with {} records".
                    format(table, num_records))

        staging_counts = redshift.get_records(
            DQC.count_check.format(self.tables[0]) + " WHERE page='NextSong'")
        fact_counts = redshift.get_records(
            DQC.count_check.format(self.tables[2]))
        self.log.info("Staging Table and Fact Table Comparision:")
        self.log.info("Staging Table: {} - {}".format(self.tables[0],
                                                      staging_counts[0][0]))
        self.log.info("Fact Table: {} - {}".format(self.tables[2],
                                                   fact_counts[0][0]))
        if (staging_counts[0][0] == fact_counts[0][0]):
            self.log.info(
                "Count between Staging Table and Fact Table matches.")
        else:
            self.log.info("Incorrect data in Fact Table.")
            raise ValueError("Incorrect data in Fact Table.")
        self.log.info("data quality check completed")
Esempio n. 24
0
    def execute(self, context):
        redshift_hook = PostgresHook(self._redshift_conn_id)

        for tb in self._tables:
            self.log.info(f"Checking data quality for table '{tb}'")
            records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {tb}")
            if len(records[0]) < 1 or len(records) < 1:
                raise ValueError(f"Data quality check failed: "
                                 f"Query of table '{tb}' returned no results")
            num_records = records[0][0]
            if num_records < 1:
                raise ValueError(f"Data quality check failed. "
                                 f"Table '{tb}' contained 0 rows")
            self.log.info(f"Data quality check on table '{tb}' "
                          f"passed with {records[0][0]} records")
Esempio n. 25
0
 def execute(self, context):
     redshift = PostgresHook(self.conn_id)
     for table in self.tables:
         records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")
         if len(records) < 1 or len(records[0]) < 1:
             raise ValueError(
                 f"Data quality check failed. {table} returned no results")
         num_records = records[0][0]
         if num_records < 1:
             raise ValueError(
                 f"Data quality check failed. {table} contained 0 rows")
         self.log.info(
             f"Data quality on table {table} check passed with {records[0][0]} records"
         )
     self.log.info('DataQualityOperator not implemented yet')
 def execute(self, context):
     redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
     for table in self.tables:
         records = redshift_hook.get_records(
             f"SELECT COUNT(*) FROM {table}")
         if len(records) < 1 or len(records[0]) < 1:
             raise ValueError(
                 f"A Data Quality Check has failed for table {table}. No results were returned."
             )
         num_records = records[0][0]
         if num_records < 1:
             raise ValueError(
                 f"A Data Quality Check has failed for table {table}. It contains 0 rows."
             )
         self.log.info('Data Quality Operator has passed the checks')
    def execute(self, context):
        """
        execute - encapsulates functionality of operator-task
        """
        self.log.info(f'DataQualityOperator starting for table: {self.table}')

        redshift = PostgresHook(self.redshift_conn_id)
        records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}")

        if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
            raise ValueError(
                f"Data quality check failed. {self.table} returned no results")
        self.log.info(
            f"Data quality on table {self.table} check passed with {records[0][0]} records"
        )
 def execute(self, context):
     redshift_hook = PostgresHook(self.redshift_conn_id)
     records = redshift_hook.get_records(
         f"SELECT COUNT({self.field}) FROM {self.table};")
     self.log.info(f"Records: " + str(records))
     if len(records) < 1 or len(records[0]) < 1:
         raise ValueError(
             f"Data quality check failed. {self.table} returned no results")
     num_records = records[0][0]
     if num_records < 1:
         raise ValueError(
             f"Data quality check failed. {self.table} contained 0 rows")
     self.log.info(
         f"Data quality on table {self.table} check passed with {records[0][0]} records"
     )
Esempio n. 29
0
    def execute(self, context):
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        for field in self.fields:
            num_rows_query = f"""SELECT COUNT(*) FROM {self.table} WHERE {field} IS NULL;"""
            records = redshift.get_records(num_rows_query)
            num_records = records[0][0]

            if num_records != 0:
                raise ValueError(
                    f"Data quality check failed. {self.table}.{field} contained NULL values"
                )

            self.log.info(
                f"Data quality on {self.table}.{field} check passed with {num_records} NULL record"
            )
Esempio n. 30
0
    def execute(self, context):
        hook_redshift = PostgresHook(postgres_connect_id = self.redshift_connect_id)
        
        for table in self.tables:
            
            self.log.info(f"Starting data quality validation on : {table}")
            records = hook_redshift.get_records(f"select count(*) from {table};")

            if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
                self.log.error(f"Data Quality validation failed for table : {table}.")
                raise ValueError(f"Data Quality validation failed for table : {table}")
                
            self.log.info(f"Data Quality Validation Passed on table : {table}") 
            
        self.log.info("DataQualityOperator not implemented yet")