Example #1
    def test_data_quality_info_length_check(self):
        json_config = {
            'specific_dq_function': 'is_valid_length',
            'specific_dq_function_param_1': '15',
            'suspend_record': 'Yes',
            'type': 'Length Check',
            'always': '',
            'only_if_data_exists': 'x',
            'exception_message': 'Incorrect Length',
            'replacement_value': 'N/A'
        }

        dq = DataQualityInfo(json_config)

        self.assertTrue(dq.suspend_record)
        self.assertEqual('Incorrect Length', dq.exception_message)
        self.assertEqual(None, dq.replacement_value)

        # Too short
        self.assertFalse(dq.validate(''))
        self.assertFalse(dq.validate('ABCD'))
        self.assertFalse(dq.validate('12345678901234'))

        # Correct length
        self.assertTrue(dq.validate('ABCDEFGHIJKLMNO'))
        self.assertTrue(dq.validate('123456789012345'))

        # Too long
        self.assertFalse(dq.validate('ABCDEFGHIJKLMNOP'))
        self.assertFalse(dq.validate('1234567890123456'))
        self.assertFalse(dq.validate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
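
The test above pins down the behaviour implied by the config: 'Yes'/'No' flags become booleans, 'N/A' maps to no replacement value, and specific_dq_function_param_1 carries the required length. Below is a minimal sketch of a validator consistent with those assertions; the class internals are an assumption for illustration, not the project's actual DataQualityInfo.

class LengthCheckSketch:
    """Hypothetical stand-in mirroring the attributes asserted in the test."""

    def __init__(self, json_config):
        self.type = json_config['type']
        self.exception_message = json_config['exception_message']
        # 'Yes'/'No' flags become booleans
        self.suspend_record = json_config['suspend_record'] == 'Yes'
        self.only_if_data_exists = json_config['only_if_data_exists'] == 'x'
        # 'N/A' is treated as "no replacement value", per the assertion above
        replacement = json_config['replacement_value']
        self.replacement_value = None if replacement == 'N/A' else replacement
        self._required_length = int(json_config['specific_dq_function_param_1'])

    def validate(self, value):
        # pass only when the value is exactly the configured length
        return len(value) == self._required_length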
Example #2
    def test_data_quality_info_no_leading_spaces(self):
        json_config = {
            'specific_dq_function': 'no_leading_space',
            'specific_dq_function_param_1': '',
            'suspend_record': 'No',
            'type': 'Leading Spaces',
            'always': '',
            'only_if_data_exists': 'x',
            'exception_message': 'Leading Spaces',
            'replacement_value': 'Leading Spaces in Value'
        }

        dq = DataQualityInfo(json_config)

        self.assertFalse(dq.suspend_record)
        self.assertEqual('Leading Spaces', dq.exception_message)
        self.assertEqual('Leading Spaces in Value', dq.replacement_value)

        self.assertTrue(dq.validate('ABCD'))
        self.assertFalse(dq.validate(' ABCD'))
        self.assertFalse(dq.validate('   '))
        self.assertTrue(dq.validate('0'))
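
The leading-space check takes no parameter; the expected semantics are simply that the value must not begin with a space. A hedged one-function sketch consistent with the assertions above (the name no_leading_space comes from the config; the body is an assumption):

def no_leading_space(value):
    # fail any value whose first character is a space, including all-blank strings
    return not value.startswith(' ')

assert no_leading_space('ABCD')
assert not no_leading_space(' ABCD')
assert not no_leading_space('   ')
assert no_leading_space('0')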
Example #3
    def test_data_quality_info_is_numeric(self):
        json_config = {
            'specific_dq_function': 'is_numeric',
            'specific_dq_function_param_1': '',
            'suspend_record': 'Yes',
            'type': 'Numeric-only check',
            'always': '',
            'only_if_data_exists': 'x',
            'exception_message': 'Non-numeric Value',
            'replacement_value': 'N/A'
        }

        dq = DataQualityInfo(json_config)

        self.assertTrue(dq.suspend_record)
        self.assertEqual('Non-numeric Value', dq.exception_message)
        self.assertEqual(None, dq.replacement_value)

        self.assertFalse(dq.validate('ABCD'))
        self.assertTrue(dq.validate('1236717823681'))
        self.assertTrue(dq.validate('0000'))
        self.assertFalse(dq.validate(''))
        self.assertTrue(dq.validate('0'))
        self.assertFalse(dq.validate('a1d3'))
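
The numeric check rejects the empty string and any value containing a non-digit, which str.isdigit() covers for every case asserted above. A hedged sketch (again, the body is an assumption, not the project's implementation):

def is_numeric(value):
    # non-empty and digits only; ''.isdigit() is False, so the empty string fails
    return value.isdigit()

assert not is_numeric('ABCD')
assert is_numeric('1236717823681')
assert is_numeric('0000')
assert not is_numeric('')
assert is_numeric('0')
assert not is_numeric('a1d3')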
Example #4
    def check_data_quality(cls, item, json_config, pk_list, logger):
        """
        Takes values from 'pp_' fields and runs DQ checks, adding replacement
        values if needed.

        Suspends record if needed.
        """
        # out dict to hold the processed item
        out_dict = {}
        invalid_keys = ['rec_type_cd', 'rec_trigger_key', '_sa_instance_state']

        for key, value in item.items():

            if key in invalid_keys:
                continue

            # add the pks to the out_dict so the row can be inserted later
            if key in pk_list:
                out_dict[key] = value

            # only 'pp_' fields are subject to DQ checks
            if not key.startswith('pp_'):
                continue

            # get DQ checks for current key
            dq_list = DataQualityProcessor.get_dq_checks_for_key(
                key, json_config)
            dq_key = key.replace('pp_', 'dq_')

            # keep track of the DQ exception count
            dq_exception_count = 0

            # run the DQ checks, if any are configured for this field
            if dq_list:
                for dq_check in dq_list:
                    # create a DataQualityInfo for each DQ check
                    data_quality_info = DataQualityInfo(dq_check)

                    if dq_check['type'] == 'Date check' and key != 'pp_z13_open_date':
                        continue
                    elif dq_check['type'] == 'Date check' and key == 'pp_z13_open_date':
                        val = value.rstrip()

                        # an exception count of 1 likely means a missing value, so
                        # skip checks with the "only if data exists" flag
                        if dq_exception_count == 1 and data_quality_info.only_if_data_exists:
                            continue

                        is_passing = data_quality_info.validate(val)
                        if is_passing:
                            # the value passes, so write it to out_dict
                            out_dict[dq_key] = val
                            out_dict['rm_dq_check_excptn_cnt'] = dq_exception_count
                        else:
                            dq_exception_count += 1
                            out_dict['rm_dq_check_excptn_cnt'] = dq_exception_count

                            logger.error(
                                f'\t{dq_key} failed {data_quality_info.type}. '
                                f'Replacement value is {data_quality_info.replacement_value}.')
                            # fall back to the configured replacement value
                            out_dict[dq_key] = data_quality_info.replacement_value

                    else:
                        # trim trailing spaces; this might cause problems for
                        # sublibrary code and collection code
                        val = value.rstrip()
                        # determine whether the value passes the check
                        is_passing = data_quality_info.validate(val)

                        # an exception count of 1 likely means a missing value, so
                        # skip checks with the "only if data exists" flag
                        if dq_exception_count == 1 and data_quality_info.only_if_data_exists:
                            continue

                        if is_passing:
                            # the value passes, so write it to out_dict
                            out_dict[dq_key] = val
                            out_dict['rm_dq_check_excptn_cnt'] = dq_exception_count
                        else:
                            dq_exception_count += 1
                            out_dict['rm_dq_check_excptn_cnt'] = dq_exception_count

                            if data_quality_info.suspend_record:
                                logger.error(
                                    f'\t{dq_key} with value of {val} failed '
                                    f'{data_quality_info.type}. SUSPENDED')
                                # mark the dq_ field as suspended
                                out_dict[dq_key] = 'SUS'

                                # flag the record as suspended
                                out_dict['rm_suspend_rec_flag'] = 'Y'

                                # record why the record was suspended
                                suspend_record_code = DataQualityProcessor.get_suspend_record_code(
                                    dq_key, data_quality_info)
                                out_dict['rm_suspend_rec_reason_cd'] = suspend_record_code
                            else:
                                logger.error(
                                    f'\t{dq_key} failed {data_quality_info.type}. '
                                    f'Replacement value is {data_quality_info.replacement_value}.')
                                # fall back to the configured replacement value
                                out_dict[dq_key] = data_quality_info.replacement_value
            else:
                # no DQ checks configured for this field: copy the trimmed
                # pp_ value straight through to the dq_ field
                out_dict[dq_key] = value.rstrip()
        return out_dict
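
For orientation, a self-contained illustration of the simplest path through check_data_quality, a 'pp_' field with no DQ checks configured: primary keys are copied through and the trimmed value lands under the matching 'dq_' key. The field names below are invented for the example.

# Invented field names; mirrors only the no-checks path of check_data_quality.
item = {'record_id': 42, 'pp_z30_call_no': 'PS3545 .H16  '}
pk_list = ['record_id']

out_dict = {}
for key, value in item.items():
    if key in pk_list:
        out_dict[key] = value
    if key.startswith('pp_'):
        out_dict[key.replace('pp_', 'dq_')] = value.rstrip()

print(out_dict)  # {'record_id': 42, 'dq_z30_call_no': 'PS3545 .H16'}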