def test_data_quality_info_suspect_flag(self):
    json_config = {
        'specific_dq_function': 'is_valid_length',
        'specific_dq_function_param_1': '15',
        'suspend_record': 'Yes',
        'type': 'Length Check',
        'always': '',
        'only_if_data_exists': 'x',
        'exception_message': 'Incorrect Length',
        'replacement_value': 'N/A'
    }
    dq = DataQualityInfo(json_config)
    self.assertTrue(dq.suspend_record)

    json_config = {
        'specific_dq_function': 'is_valid_length',
        'specific_dq_function_param_1': '15',
        'suspend_record': 'No',
        'type': 'Length Check',
        'always': '',
        'only_if_data_exists': 'x',
        'exception_message': 'Incorrect Length',
        'replacement_value': 'N/A'
    }
    dq = DataQualityInfo(json_config)
    self.assertFalse(dq.suspend_record)

def test_data_quality_info_length_check(self):
    json_config = {
        'specific_dq_function': 'is_valid_length',
        'specific_dq_function_param_1': '15',
        'suspend_record': 'Yes',
        'type': 'Length Check',
        'always': '',
        'only_if_data_exists': 'x',
        'exception_message': 'Incorrect Length',
        'replacement_value': 'N/A'
    }
    dq = DataQualityInfo(json_config)
    self.assertTrue(dq.suspend_record)
    self.assertEqual('Incorrect Length', dq.exception_message)
    self.assertEqual(None, dq.replacement_value)

    # Too short
    self.assertFalse(dq.validate(''))
    self.assertFalse(dq.validate('ABCD'))
    self.assertFalse(dq.validate('12345678901234'))

    # Correct length
    self.assertTrue(dq.validate('ABCDEFGHIJKLMNO'))
    self.assertTrue(dq.validate('123456789012345'))

    # Too long
    self.assertFalse(dq.validate('ABCDEFGHIJKLMNOP'))
    self.assertFalse(dq.validate('1234567890123456'))
    self.assertFalse(dq.validate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

def test_get_suspend_record_code(self):
    # z00 test for MIS
    key = "pp_z00_doc_number"
    mis_json_config = self.bib_rec_json_config['z00_doc_number']['dataquality_info'][0]
    dq = DataQualityInfo(mis_json_config)
    result = DataQualityProcessor.get_suspend_record_code(key, dq)
    self.assertEqual("MIS", result)

    # z00 test for LEN
    len_json_config = self.bib_rec_json_config['z00_doc_number']['dataquality_info'][1]
    dq = DataQualityInfo(len_json_config)
    result = DataQualityProcessor.get_suspend_record_code(key, dq)
    self.assertEqual("LEN", result)

    # z13 test for MIS
    key = 'pp_z13_open_date'
    json_config = self.bib_rec_json_config['z13_open_date']['dataquality_info'][0]
    dq = DataQualityInfo(json_config)
    result = DataQualityProcessor.get_suspend_record_code(key, dq)
    self.assertEqual("MIS", result)

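# The reason-code mapping exercised above is not shown in this section; the
# following is a minimal, hypothetical sketch of how a suspend-reason code
# ("MIS", "LEN", ...) could be derived from a DataQualityInfo. The helper name,
# the dictionary, and the idea of keying off the check type are assumptions for
# illustration only and cover just the codes asserted in this test.

SUSPEND_REASON_CODES_SKETCH = {
    'Missing Value': 'MIS',
    'Length Check': 'LEN',
}


def get_suspend_record_code_sketch(key, data_quality_info):
    # `key` (e.g. 'pp_z00_doc_number') mirrors the real call signature, but this
    # sketch derives the code from the check type alone.
    return SUSPEND_REASON_CODES_SKETCH.get(data_quality_info.type, 'UNK')
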
def test_data_quality_info_is_numeric(self):
    json_config = {
        'specific_dq_function': 'is_numeric',
        'specific_dq_function_param_1': '',
        'suspend_record': 'Yes',
        'type': 'Numeric-only check',
        'always': '',
        'only_if_data_exists': 'x',
        'exception_message': 'Non-numeric Value',
        'replacement_value': 'N/A'
    }
    dq = DataQualityInfo(json_config)
    self.assertTrue(dq.suspend_record)
    self.assertEqual('Non-numeric Value', dq.exception_message)
    self.assertEqual(None, dq.replacement_value)

    self.assertFalse(dq.validate('ABCD'))
    self.assertTrue(dq.validate('1236717823681'))
    self.assertTrue(dq.validate('0000'))
    self.assertFalse(dq.validate(''))
    self.assertTrue(dq.validate('0'))
    self.assertFalse(dq.validate('a1d3'))

def test_data_quality_info_no_missing_values(self):
    json_config = {
        'specific_dq_function': 'no_missing_values',
        'specific_dq_function_param_1': '',
        'suspend_record': 'No',
        'type': 'Missing Value',
        'always': 'x',
        'only_if_data_exists': '',
        'exception_message': 'Missing Value',
        'replacement_value': 'Missing Value'
    }
    dq = DataQualityInfo(json_config)
    self.assertFalse(dq.suspend_record)
    self.assertEqual('Missing Value', dq.exception_message)
    self.assertEqual('Missing Value', dq.replacement_value)

    self.assertTrue(dq.validate('ABCD'))
    self.assertFalse(dq.validate(None))
    self.assertFalse(dq.validate(' '))
    self.assertFalse(dq.validate(''))
    self.assertFalse(dq.validate('0'))

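# A minimal sketch, assuming the json_config keys used in the tests above, of how
# DataQualityInfo could be built: 'Yes'/'No' and 'x'/'' flags become booleans,
# 'N/A' means "no replacement value", and validate() dispatches on
# specific_dq_function. The class name is hypothetical and the sketch covers only
# the length and numeric validators; the project's actual implementation
# (including the no_missing_values semantics, e.g. why '0' counts as missing in
# the test above) may differ.

class DataQualityInfoSketch:
    def __init__(self, config):
        self.type = config['type']
        self.suspend_record = config.get('suspend_record') == 'Yes'
        self.only_if_data_exists = config.get('only_if_data_exists') == 'x'
        self.exception_message = config.get('exception_message')
        replacement = config.get('replacement_value')
        self.replacement_value = None if replacement == 'N/A' else replacement
        self._function = config['specific_dq_function']
        self._param_1 = config.get('specific_dq_function_param_1')

    def validate(self, value):
        if self._function == 'is_valid_length':
            # exact-length check against the configured parameter
            return value is not None and len(value) == int(self._param_1)
        if self._function == 'is_numeric':
            # str.isdigit() is False for '' and for mixed strings like 'a1d3'
            return value is not None and value.isdigit()
        raise NotImplementedError(self._function)
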
def check_data_quality(cls, item, json_config, pk_list, logger):
    """
    Takes values from 'pp_' fields and runs DQ checks, adding replacement
    values if needed. Suspends the record if needed.
    """
    # out dict to hold the processed item
    out_dict = {}
    invalid_keys = ['rec_type_cd', 'rec_trigger_key', '_sa_instance_state']

    for key, value in item.items():
        # skip keys from invalid_keys
        if key in invalid_keys:
            continue

        # add the pks to the out_dict so the row can be inserted later
        if key in pk_list:
            out_dict[key] = value

        # skip keys that aren't 'pp_'
        if not key.startswith('pp_'):
            continue

        # get DQ checks for current key
        dq_list = DataQualityProcessor.get_dq_checks_for_key(key, json_config)
        dq_key = key.replace('pp_', 'dq_')

        # keep track of the dq exception count
        dq_exception_count = 0

        # do DQ checks if they exist
        if dq_list:
            for dq_check in dq_list:
                # create DataQualityInfo for each DQ check
                data_quality_info = DataQualityInfo(dq_check)

                if dq_check['type'] == 'Date check' and key != 'pp_z13_open_date':
                    continue
                elif dq_check['type'] == 'Date check' and key == 'pp_z13_open_date':
                    val = value.rstrip()

                    # if the value already has an exception, it likely has a missing value;
                    # skip the check if it has the "only if data exists" flag
                    if dq_exception_count == 1 and data_quality_info.only_if_data_exists:
                        continue

                    is_passing = data_quality_info.validate(val)
                    if is_passing:
                        # write value to out_dict because it passes
                        out_dict[dq_key] = val
                        out_dict['rm_dq_check_excptn_cnt'] = dq_exception_count
                    else:
                        dq_exception_count += 1
                        out_dict['rm_dq_check_excptn_cnt'] = dq_exception_count
                        logger.error(
                            f'\t{dq_key} failed {data_quality_info.type}. '
                            f'Replacement value is {data_quality_info.replacement_value}.'
                        )
                        # find replacement and use it if needed
                        out_dict[dq_key] = data_quality_info.replacement_value
                else:
                    # trim trailing spaces of the value;
                    # might cause problems for sublibrary code and collection code
                    val = value.rstrip()

                    # determine if value passes the check
                    is_passing = data_quality_info.validate(val)

                    # if the value has an exception count of 1, it likely has a missing value;
                    # skip the check if it has the "only if data exists" flag
                    if dq_exception_count == 1 and data_quality_info.only_if_data_exists:
                        continue

                    if is_passing:
                        # write value to out_dict because it passes
                        out_dict[dq_key] = val
                        out_dict['rm_dq_check_excptn_cnt'] = dq_exception_count
                    else:
                        dq_exception_count += 1
                        out_dict['rm_dq_check_excptn_cnt'] = dq_exception_count

                        # check whether the record should be suspended
                        if data_quality_info.suspend_record:
                            logger.error(
                                f'\t{dq_key} with value of {val} failed '
                                f'{data_quality_info.type}. SUSPENDED'
                            )
                            # mark the current dq_ key as suspended
                            out_dict[dq_key] = 'SUS'

                            # set the suspend record flag
                            out_dict['rm_suspend_rec_flag'] = 'Y'

                            # get the suspend record reason code
                            suspend_record_code = DataQualityProcessor.get_suspend_record_code(
                                dq_key, data_quality_info)
                            out_dict['rm_suspend_rec_reason_cd'] = suspend_record_code
                        else:
                            logger.error(
                                f'\t{dq_key} failed {data_quality_info.type}. '
                                f'Replacement value is {data_quality_info.replacement_value}.'
                            )
                            # find replacement and use it if needed
                            out_dict[dq_key] = data_quality_info.replacement_value
        else:
            # if there are no dq checks, output the pp value to dq
            out_dict[dq_key] = value

    return out_dict

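# Hypothetical usage sketch of check_data_quality, assuming it is a classmethod
# on DataQualityProcessor (as the `cls` parameter suggests). The item field
# names, the config file path, and the primary-key list are assumptions for
# illustration; json_config is expected to be the same bib-record DQ
# configuration the tests above load into self.bib_rec_json_config.

import json
import logging

logger = logging.getLogger(__name__)

with open('bib_rec_dq_config.json') as fh:   # hypothetical path
    json_config = json.load(fh)

item = {
    'rec_key': '000000001',                    # primary key, copied through as-is
    'pp_z00_doc_number': '000123456789012',    # 15 characters, passes the length check
    'pp_z13_open_date': '20200101  ',          # trailing spaces are stripped before checks
}
pk_list = ['rec_key']

out = DataQualityProcessor.check_data_quality(item, json_config, pk_list, logger)
# `out` holds the dq_* fields plus rm_dq_check_excptn_cnt and, when a failing
# check is configured with suspend_record, rm_suspend_rec_flag and
# rm_suspend_rec_reason_cd.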