def execute(self, context): self.log.info('DataQualityOperator Started!') redshift_hook = PostgresHook(self.redshift_conn_id) records = redshift_hook.get_records( f"SELECT COUNT(*) FROM {self.table}") if len(records) < 1 or len(records[0]) < 1: raise ValueError( f"Data quality check failed. {self.table} has no values") num_records = records[0][0] if num_records < 1: raise ValueError( f"Data quality check failed. {self.table} contained 0 rows") self.log.info( f"Data quality on table {self.table} check passed with {records[0][0]} records" )
def execute(self, context): redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) for table in self.tables: records = redshift_hook.get_records( f"select count(*) from {table};") if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1: self.log.error( f"Data Quality validation failed for table : {table}.") raise ValueError( f"Data Quality validation failed for table : {table}") self.log.info( f"Data Quality Validation Passed on table : {table}!!!")
def execute(self, context): """ The execute function performs the following steps: - Executes the SQL Query for Data Quality check and gets the output of sql query - Compares output of SQL query with Expected output - If expected output is same as actual output, we see success message in the logs and if not we see an error """ redshift_hook = PostgresHook("redshift") for check in self.dq_checks: sql = check["check_sql"] exp = check["expected_result"] records = redshift_hook.get_records(sql) exprecords = redshift_hook.get_records(exp) num_records = records[0][0] exp_records = exprecords[0][0] if num_records != exp_records: raise ValueError( f"Data quality check failed. Expected: {exp_records} | Got: {num_records}" ) else: self.log.info( f"Data quality on SQL {sql} check passed with {records[0][0]} records" )
def execute(self, context): redshift = PostgresHook(self.redshift_conn_id) for check in self.checks: query = check['query'] expected_result = check['expected_result'] results = redshift.get_records(query) self.log.info(f'Results {results}') if results and results[0][0] != expected_result: raise ValueError( f"Data quality check failed. " f"The query `{query}` returned {expected_result} records")
def execute(self, context): redshift_hook = PostgresHook(self.redshift_conn_id) for table in self.tables: records = redshift_hook.get_records( f"SELECT COUNT(*) FROM {table}") if len(records) < 1 or len(records[0]) < 1: raise ValueError( f"Data quality check failed. {table} returned no results") num_records = records[0][0] if num_records < 1: raise ValueError( f"Data quality check failed. {table} contained 0 rows") logging.info( f"Data quality on table {table} check passed with {records[0][0]} records" )
def execute(self, context): redshift = PostgresHook(self.redshift_conn_id) #table = context['task_instance'].xcom_pull(key='table_name', task_ids='load_public.artists_table') records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}") if len(records) < 1 or len(records[0]) < 1: raise ValueError( f"Data quality check failed. {self.table} returned no results") num_records = records[0][0] if num_records < 1: raise ValueError( f"Data quality check failed. {self.table} contained 0 rows") self.log.info( f"Data quality on table {self.table} check passed with {records[0][0]} records" )
def execute(self, context): redshift_hook = PostgresHook(self.redshift_conn_id) for table in self.tables: self.log.info(f"Data Quality check for {table['name']} table in progress.") records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {table['name']}") num_records = records[0][0] if len(records) < table['expected_result'] or len(records[0]) < table['expected_result']: raise ValueError(f"Data quality check failed. {table['name']} returned no results.") elif num_records < table['expected_result']: raise ValueError(f"Data quality check failed. {table['name']} contained 0 rows.") else: self.log.info(f"Data quality check on table {table['name']} passed with {records[0][0]} records.")
def execute(self, context): """ This function count rows in destinaton tables and flag if records are populated. :param context: defined above :return: log messages """ redshift_hook = PostgresHook(self.redshift_conn_id) for test in self.sql_test: record = redshift_hook.get_records(self.sql_test) if record == 0: self.log.info(f'zero records running: {self.sql_test}') else: self.log.info(f'records present running: {self.sql_test}') raise ValueError("ValueError exception thrown")
def execute(self, context): redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) for check in self.quality_check_tables: records = redshift.get_records(check['sql'])[0] if check['type'] == 'eq' and records[0] != check['comparison']: raise ValueError( f"Data quality check failed! The query {check['sql']} expected {check['comparison']}, but received {records[0]}" ) elif check['type'] == 'gt' and records[0] <= check['comparison']: raise ValueError( f"Data quality check failed! The query {check['sql']} expected a value greater the {check['comparison']}, but received {records[0]}" ) self.log.info("Data Quality Check Completed Succesfully")
def execute(self, context): redshift_hook = PostgresHook(self.redshift_conn_id) for table in self.tables: records = redshift_hook.get_records( f"SELECT COUNT(*) FROM {table}") if not records or len(records) < 1 or len(records[0]) < 1: raise ValueError( f"Data Quality check failed: {table} has no results") num_records = records[0][0] if num_records < 1: raise ValueError( f"Data Quality Check failed: {table} has 0 records") self.log.info( f"Data Quality on table {table} check passed with {num_records} records" )
def execute(self, context): redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) for check, expectation in self.sql_checks_and_expects: records = redshift.get_records(check) if len(records) < 1 or len(records[0]) < 1: raise ValueError( f"Data quality check failed. \"{check}\" returned no results" ) result = records[0][0] if result != expectation: raise ValueError( f"Data quality check failed. " "{check}\" expected {expectation} but returned {result} ") self.log.info(f"Data quality check \"{check}\" passed.") self.log.info("Successfully completed all data quality checks")
def execute(self, context): if not self.sw_skip_data_quality_checks: conn = PostgresHook(self.postgres_conn_id) errors = [] info = [] for table in self.data_quality_tables: records = conn.get_records(f"select count(*) from {table}") if len(records) < 1 or len(records[0]) < 1: errors.append(f"Data quality check failed. {table} returned no results") else: num_records = records[0][0] if num_records < 1: errors.append(f"ERROR - Data quality check failed. {table} contains {num_records} records") else: info.append(f"SUCCESS - Data quality check passed. {table} contains {num_records} records") if table == "artists": columns = ['latitude','location','longitude'] expected = [0,0,0] for idx, column in enumerate(columns): records = conn.get_records(f"select count(*) from {table} where 1 = 1 and {column} is null") actual = records[0][0] if actual != expected[idx]: errors.append(f"ERROR - Data quality check failed. {table}.{column} is null...actual {actual} does not equal expected {expected[idx]}") for i in info: logging.info(i) errors_encountered = False for e in errors: errors_encountered = True logging.info(e) if errors_encountered: raise ValueError("Data quality check failed.")
def execute(self, context): # AWS Hook aws_hook = AwsHook(self.aws_credentials_id) credentials = aws_hook.get_credentials() # RedShift Hook redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) # Test each table for table_dict in self.table_info_dict: table_name = table_dict["table_name"] column_that_should_not_be_null = table_dict["not_null"] # Check number of records (pass if > 0, else fail) records = redshift.get_records( f"SELECT COUNT(*) FROM {table_name}") if len(records) < 1 or len(records[0]) < 1: raise ValueError( f"Data quality check failed. {table_name} returned no results" ) elif records[0][0] < 1: raise ValueError( f"Data quality check failed. {table_name} contained 0 rows" ) else: # Now check is NOT NULL columns contain NULL null_records = redshift.get_records( f"SELECT COUNT(*) FROM {table_name} WHERE {column_that_should_not_be_null} IS NULL" ) if null_records[0][0] > 0: col = column_that_should_not_be_null raise ValueError( f"Data quality check failed. {table_name} contained {null_records[0][0]} null records for {col}" ) else: self.log.info( f"Data quality on table {table_name} check passed with {records[0][0]} records" )
def execute(self, context): redshift_hook = PostgresHook(self.redshift_conn_id) records = redshift_hook.get_records("SELECT COUNT(*) FROM %s " % self.table_name) if len(records) < 1 or len(records[0]) < 1: raise ValueError( "Data quality check failed. %s returned no results" % self.table_name) num_records = records[0][0] if num_records < 1: raise ValueError("Data quality check failed. %s contained 0 rows" % self.table_name) self.log.info("Data quality on table %s check passed with %s records" % (self.table_name, records[0][0])) column_records = redshift_hook.get_records( "SELECT COUNT(*) FROM %s WHERE %s = NULL" % (self.table_name, self.column)) if len(column_records[0]) > 1: raise ValueError( "Data quality check failed. %s returned NULL value" % self.column) self.log.info('Data Quality check done')
def execute(self, context): redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) for table in self.tables: records = redshift_hook.get_records(self.select_sql.format(table)) if records is None or len(records[0]) < 1: self.log.error( f"No Records Presents in destination table {table}") raise ValueError( f"No Records Presents in destination table {table}") self.log.info(f""" Data Quality Check on table {table} check passed with {records[0][0]} records """)
def execute(self, context): ''' Checks if the primary key column has any null value. If so throws an exception ''' redshift_hook = PostgresHook(self.redshift_conn_id) records = redshift_hook.get_records( f"SELECT COUNT(*) FROM {self.table} WHERE {self.pkey_col} IS NULL") num_records = records[0][0] if num_records > 0: raise ValueError( f"Data quality check failed. Primary key column {self.pkey_col} in table {self.table} contained {records[0][0]} null values" ) self.log.info( f"Data quality check on primary key column {self.pkey_col} in table {self.table} passed with 0 null values" )
def execute(self, context): self.log.info('DataQualityOperator implemented and running') redshift_hook_conn = PostgresHook( postgres_conn_id=self.redshift_conn_id) self.log.info("Running Data Quality test") for test_query in self.test_queries: rows = redshift_hook_conn.get_records(test_query) if rows[0][0] != self.expected_result: raise ValueError(f""" Data quality test failed. \ {rows[0][0]} does not equal {self.expected_result} """) else: self.log.info("Data quality test passed")
def execute(self, context): redshift = PostgresHook(postgres_conn_id=self.conn_id) for table in self.non_empty_tables: self.log.info("Counting rows in {} table...".format(table)) sql = DataQualityOperator.count_sql.format(table_name=table) records = redshift.get_records(sql) if len(records) < 1 or len(records[0]) < 1: raise ValueError(f"Data quality check failed. {table} returned no results.") num_records = records[0][0] if num_records < 1: raise ValueError(f"Data quality check failed. {table} contained 0 rows.") self.log.info(f"Data quality on table {table} check passed with {records[0][0]} records.")
def execute(self, context): self.log.info('LoadDimensionOperator not implemented yet') redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) #Delete the table records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}") num_records = records[0][0] if num_records > 0: redshift.run("Delete from {}".format(self.table)) #Insert statement sql_statement = ("Insert into {} ({})".format(self.table, self.sql)) #Run the insert statement redshift.run(sql_statement)
def execute(self, context): self.log.info( 'DataQualityOperator: Checking the data itself for nulls') redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) records = redshift_hook.get_records(self.sql_test) # check if test failed if records[0] != self.sql_result: raise ValueError(""" Too many Nulls found. \ expected {}, but {} found """.format(self.sql_result, records[0])) else: self.log.info("Data quality check passed")
def execute(self, context): redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) for table in self.tables: recs = redshift.get_records(self.check_sql.format(table)) if (len(recs) < 1) or (len(recs[0]) < 0): self.log.error("table %s is empty".format(table)) raise ValueError( "Table %s failed data quality operator".format(table)) num_records = recs[0][0] if num_records == 0: self.log.error("No records found in {}".format(table)) raise ValueError("No records found in {}".format(table)) pass_msg = "Data Quality check passed of table {}".format(table) self.log.info(pass_msg)
def execute(self, context): postgres = PostgresHook(postgres_conn_id=self.redshift_conn_id) self.log.info("Extracting data from Redshift: %s", self.sql) results = postgres.get_records(self.sql) mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id) if self.mysql_preoperator: self.log.info("Running MySQL preoperator") self.log.info(self.mysql_preoperator) mysql.run(self.mysql_preoperator) self.log.info("Inserting rows into MySQL") mysql.insert_rows(table=self.mysql_table, rows=results, replace=self.replace)
def execute(self, context): redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) for table in self.tables: records = redshift.get_records(DQC.count_check.format(table)) if len(records) < 1 or len(records[0]) < 1: self.log.error("{} returned no results".format(table)) raise ValueError( "Data quality check failed. {} returned no results".format( table)) num_records = records[0][0] if num_records == 0: self.log.error( "No records present in destination table {}".format(table)) raise ValueError( "No records present in destination {}".format(table)) else: self.log.info( "Data quality on table {} check passed with {} records". format(table, num_records)) staging_counts = redshift.get_records( DQC.count_check.format(self.tables[0]) + " WHERE page='NextSong'") fact_counts = redshift.get_records( DQC.count_check.format(self.tables[2])) self.log.info("Staging Table and Fact Table Comparision:") self.log.info("Staging Table: {} - {}".format(self.tables[0], staging_counts[0][0])) self.log.info("Fact Table: {} - {}".format(self.tables[2], fact_counts[0][0])) if (staging_counts[0][0] == fact_counts[0][0]): self.log.info( "Count between Staging Table and Fact Table matches.") else: self.log.info("Incorrect data in Fact Table.") raise ValueError("Incorrect data in Fact Table.") self.log.info("data quality check completed")
def execute(self, context): redshift_hook = PostgresHook(self._redshift_conn_id) for tb in self._tables: self.log.info(f"Checking data quality for table '{tb}'") records = redshift_hook.get_records(f"SELECT COUNT(*) FROM {tb}") if len(records[0]) < 1 or len(records) < 1: raise ValueError(f"Data quality check failed: " f"Query of table '{tb}' returned no results") num_records = records[0][0] if num_records < 1: raise ValueError(f"Data quality check failed. " f"Table '{tb}' contained 0 rows") self.log.info(f"Data quality check on table '{tb}' " f"passed with {records[0][0]} records")
def execute(self, context): redshift = PostgresHook(self.conn_id) for table in self.tables: records = redshift.get_records(f"SELECT COUNT(*) FROM {table}") if len(records) < 1 or len(records[0]) < 1: raise ValueError( f"Data quality check failed. {table} returned no results") num_records = records[0][0] if num_records < 1: raise ValueError( f"Data quality check failed. {table} contained 0 rows") self.log.info( f"Data quality on table {table} check passed with {records[0][0]} records" ) self.log.info('DataQualityOperator not implemented yet')
def execute(self, context): redshift_hook = PostgresHook(postgres_conn_id=self.redshift_conn_id) for table in self.tables: records = redshift_hook.get_records( f"SELECT COUNT(*) FROM {table}") if len(records) < 1 or len(records[0]) < 1: raise ValueError( f"A Data Quality Check has failed for table {table}. No results were returned." ) num_records = records[0][0] if num_records < 1: raise ValueError( f"A Data Quality Check has failed for table {table}. It contains 0 rows." ) self.log.info('Data Quality Operator has passed the checks')
def execute(self, context): """ execute - encapsulates functionality of operator-task """ self.log.info(f'DataQualityOperator starting for table: {self.table}') redshift = PostgresHook(self.redshift_conn_id) records = redshift.get_records(f"SELECT COUNT(*) FROM {self.table}") if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1: raise ValueError( f"Data quality check failed. {self.table} returned no results") self.log.info( f"Data quality on table {self.table} check passed with {records[0][0]} records" )
def execute(self, context): redshift_hook = PostgresHook(self.redshift_conn_id) records = redshift_hook.get_records( f"SELECT COUNT({self.field}) FROM {self.table};") self.log.info(f"Records: " + str(records)) if len(records) < 1 or len(records[0]) < 1: raise ValueError( f"Data quality check failed. {self.table} returned no results") num_records = records[0][0] if num_records < 1: raise ValueError( f"Data quality check failed. {self.table} contained 0 rows") self.log.info( f"Data quality on table {self.table} check passed with {records[0][0]} records" )
def execute(self, context): redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id) for field in self.fields: num_rows_query = f"""SELECT COUNT(*) FROM {self.table} WHERE {field} IS NULL;""" records = redshift.get_records(num_rows_query) num_records = records[0][0] if num_records != 0: raise ValueError( f"Data quality check failed. {self.table}.{field} contained NULL values" ) self.log.info( f"Data quality on {self.table}.{field} check passed with {num_records} NULL record" )
def execute(self, context): hook_redshift = PostgresHook(postgres_connect_id = self.redshift_connect_id) for table in self.tables: self.log.info(f"Starting data quality validation on : {table}") records = hook_redshift.get_records(f"select count(*) from {table};") if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1: self.log.error(f"Data Quality validation failed for table : {table}.") raise ValueError(f"Data Quality validation failed for table : {table}") self.log.info(f"Data Quality Validation Passed on table : {table}") self.log.info("DataQualityOperator not implemented yet")