def execute(self, context):
        """
        Run data quality checks against Redshift.

        For each table in ``self.tables``:
          1. assert the count query returns a result and the table has at
             least one row;
          2. assert every column listed in ``self.columns`` for that table
             contains no NULL values.

        Args:
            context: Airflow task context (unused).

        Raises:
            ValueError: if a table returns no result, is empty, or a
                checked column contains a NULL.
        """
        self.log.info('Connecting to redshift!')
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)

        for table in self.tables:
            records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")
            if len(records) < 1 or len(records[0]) < 1:
                raise ValueError(
                    f"Data quality check failed. {table} returned no results")
            num_rows = records[0][0]
            if num_rows < 1:
                raise ValueError(
                    f"Data quality check failed. {table} contained 0 rows")

            # .get avoids a KeyError for tables with no configured
            # NULL-check columns; such tables only get the row-count check.
            for col in self.columns.get(table, []):
                null_records = redshift.get_records(
                    f"SELECT COUNT(*) FROM {table} WHERE {col} IS NULL")
                num_nulls = null_records[0][0]
                if num_nulls > 0:
                    raise ValueError(
                        f"The column {col} in table {table} had a NULL value!")

            # BUG FIX: previously this logged the NULL count of the last
            # checked column; report the table's actual row count instead.
            self.log.info(
                f"Data quality on table {table} check passed with {num_rows} records"
            )
    def execute(self, context):
        """
        Data Quality Checks:
        1. Check the target table has a positive number of rows
        2. Check the target table has no duplicate primary key

        Args:
            context: Airflow task context (unused).

        Returns:
            None

        Raises:
            ValueError: if either quality check query returns no result,
                the table is empty, or the primary key has duplicates.
        """
        hook = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        self.log.info('Starting Data Quality Checks')

        # Test for presence of any records.
        qf_row_count = self.q_row_count.format(schema=self.schema,
                                               table=self.table)
        records = hook.get_records(qf_row_count)
        # BUG FIX: the original used any([...]) which eagerly evaluated
        # records[0] and raised IndexError on an empty result set; the
        # short-circuiting `or` chain raises the intended ValueError.
        if len(records) < 1 or len(records[0]) < 1 or records[0][0] < 1:
            self.log.error("{} returned no lines".format(self.table))
            raise ValueError("{} returned no lines".format(self.table))

        # Test for no duplicates on the primary key.
        qf_dupes = self.q_dupes.format(schema=self.schema,
                                       table=self.table,
                                       pkey=self.pkey)
        records = hook.get_records(qf_dupes)
        # Guard before indexing so an empty result fails loudly and clearly.
        if len(records) < 1 or len(records[0]) < 1:
            self.log.error("{} dupes query returned no result".format(self.table))
            raise ValueError("{} dupes query returned no result".format(self.table))
        if records[0][0] > 1:
            self.log.error("{} returned duplicates".format(self.table))
            raise ValueError("{} returned duplicates".format(self.table))
        self.log.info("Data Quality checked passed on {}".format(self.table))
    def execute(self, context):
        """
        Run configurable data quality checks against Redshift.

        For every entry in ``self.tables_list`` that declares a
        ``data_quality`` section:
          1. assert the table's row count meets the optional
             ``minimum_records`` threshold (default 0);
          2. assert each column in the optional ``not_null_columns`` list
             contains no NULL values (only when the table has rows).

        Args:
            context: Airflow task context (unused).

        Raises:
            ValueError: if a check query returns no result, the row count
                is below the minimum, or a checked column contains NULLs.
        """
        self.log.info('Start DataQualityOperator')
        redshift = PostgresHook(postgres_conn_id=self.redshift_conn_id)
        error_messages = "Data quality check failed"

        for table_dict in self.tables_list:
            # Guard clause: tables without a data_quality section are skipped.
            if 'data_quality' not in table_dict:
                continue

            table = table_dict['name']
            data_quality = table_dict['data_quality']
            task_name = f"Check for records in table {table}"
            min_records = data_quality.get('minimum_records', 0)
            records_count = 0

            self.log.info(task_name)
            records = redshift.get_records(f"SELECT COUNT(*) FROM {table}")

            if len(records) >= 1 and len(records[0]) >= 1:
                records_count = records[0][0]
                if records_count < min_records:
                    # BUG FIX: the original message contained a stray "$"
                    # ("found ${records_count}") left over from another
                    # templating syntax.
                    raise ValueError(
                        f"""
                        {error_messages}
                        for {task_name}
                        found {records_count},
                        Expected a minimum of {min_records} records.
                        """
                        )
            else:
                raise ValueError(f"{error_messages}. No result for {task_name}")

            # NULL checks only make sense when the table has rows.
            if 'not_null_columns' in data_quality and records_count > 0:
                for column in data_quality['not_null_columns']:
                    check_null_task_name = f"Check for null values in {table}.{column}"
                    self.log.info(check_null_task_name)
                    count_nulls = redshift.get_records(
                        f"SELECT COUNT(*) FROM {table} WHERE {column} is null")

                    if len(count_nulls) >= 1 and len(count_nulls[0]) >= 1:
                        null_values = count_nulls[0][0]
                        if null_values > 0:
                            raise ValueError(
                                f"""
                                {error_messages}
                                for {check_null_task_name},
                                Found {null_values} null records in {table}.{column}.
                                """
                                )
                    else:
                        raise ValueError(f"{error_messages}. No result for {check_null_task_name}")
def transfer_oltp_olap(**kwargs):
    """Get records from OLTP and transfer to OLAP database"""
    target_table = kwargs.get('dest_table')
    query = kwargs.get('sql')
    query_params = kwargs.get('params')

    # Source (transactional) and destination (analytical) connections.
    source_hook = PostgresHook(postgres_conn_id='oltp')
    dest_hook = PostgresHook(postgres_conn_id='olap')

    # Extract from OLTP, then bulk-load into OLAP committing every 1000 rows.
    rows = source_hook.get_records(sql=query, parameters=query_params)
    dest_hook.insert_rows(target_table, rows, commit_every=1000)
    def execute(self, context):
        """
        Description: This custom function implements one or more data quality checks that are passed as
                     SQL commands in the data_quality_checks list, executes them and checks the
                     return value for correctness. If everything fits, this function works without any problems.
                     If there is a disagreement, an error is thrown.

        Arguments:
            self: Instance of the class
            context: Context dictionary

        Returns:
            None

        Raises:
            ValueError: if a check's first result value differs from its
                ``expected_result``.
            Exception: any database error raised while executing a check
                query is logged and re-raised.
        """

        # Build connection
        postgres = PostgresHook(postgres_conn_id=self.postgres_conn_id)

        # If no quality checks were specified, the function is terminated.
        # Idiomatic emptiness test instead of `len(...) <= 0`.
        if not self.data_quality_checks:
            self.log.info(
                'No data quality checks were specified. Data quality checks canceled.'
            )
            return

        # Run each quality check: execute its SQL and compare the first
        # value of the first row against the expected result.
        for check in self.data_quality_checks:
            sql_query = check.get('sql_query')
            expected_result = check.get('expected_result')

            try:
                self.log.info(
                    'Starting SQL query for data check - {}'.format(sql_query))
                records = postgres.get_records(sql_query)
                num_records = records[0][0]

                if num_records != expected_result:
                    # BUG FIX: corrected misspelled "excpected" in the
                    # error message.
                    raise ValueError(
                        'Data quality check failed. {} entries expected. {} given'
                        .format(expected_result, num_records))
                else:
                    self.log.info(
                        'Data Check passed for query - {}. Result: {}'.format(
                            sql_query, num_records))

            except ValueError as v:
                # Failures are errors, not informational messages.
                self.log.error(v.args)
                raise
            except Exception as e:
                self.log.error(
                    'SQL query for data check failed - {}. Exception: {}'.
                    format(sql_query, e))
                raise
    def execute(self, context):
        """Find the latest cdc_case_earliest_dt and push it to XCom.

        The value is truncated to midnight of its day and serialized as an
        ISO-8601 string with a ".000" millisecond suffix under the XCom key
        ``last_cdc_date``.
        """
        hook = PostgresHook(postgres_conn_id=self.postgres_conn_id)
        rows = hook.get_records(
            "SELECT max(cdc_case_earliest_dt) FROM covid_per_popgroup")
        latest = rows[0][0]

        # Rebuild as a datetime at midnight of the same calendar day.
        day_start = datetime(latest.year, latest.month, latest.day)
        self.log.info(f"last_date : {day_start}, {type(day_start)}")

        # ".000" suffix kept so downstream consumers get a millisecond field.
        iso_stamp = day_start.isoformat() + ".000"
        self.log.info(f"str_date : {iso_stamp}, {type(iso_stamp)}")
        context["task_instance"].xcom_push(key="last_cdc_date", value=iso_stamp)
 def execute(self, context):
     """
     Run each query in ``self.sql_check`` and compare its first result
     value against the matching entry in ``self.expected_results``.

     Args:
         context: Airflow task context (unused).

     Raises:
         ValueError: if a query returns no rows or its result differs
             from the expected value.
     """
     redshift = PostgresHook(self.redshift_conn_id)
     # zip pairs each query with its expected value directly instead of
     # indexing a parallel list via enumerate.
     for query, expected in zip(self.sql_check, self.expected_results):
         self.log.info("Executing data quality check query:")
         self.log.info(query)
         result = redshift.get_records(query)
         if len(result) < 1 or len(result[0]) < 1:
             # BUG FIX: corrected grammar in error message
             # ("No rows returns" -> "No rows returned").
             raise ValueError(
                 "Data quality check failed. No rows returned for query {}".
                 format(query))
         num_records = result[0][0]
         if expected != num_records:
             raise ValueError(
                 "Data quality check failed: {}. Expecting num of records {}, but returned {}"
                 .format(query, expected, num_records))
     self.log.info("All data quality checks are passed")