def test_40_test_impute(self):
    """(40) Test impute validation"""
    msgt(self.test_40_test_impute.__doc__)

    spec_props = self.spec_props

    dp_mean = DPMeanSpec(spec_props)
    self.assertTrue(dp_mean.is_chain_valid())

    bad_impute_info = [(-10, astatic.ERR_IMPUTE_PHRASE_MIN),
                       (45, astatic.ERR_IMPUTE_PHRASE_MAX),
                       (5.2, astatic.ERR_IMPUTE_PHRASE_MAX)]

    for bad_impute, stat_err_msg in bad_impute_info:
        print(f'> bad impute: {bad_impute}')
        new_props = spec_props.copy()
        new_props['fixed_value'] = bad_impute
        dp_mean2 = DPMeanSpec(new_props)

        self.assertFalse(dp_mean2.is_chain_valid())
        err_dict = dp_mean2.get_error_msg_dict()
        print(f" - {err_dict['message']}")
        self.assertTrue(err_dict['message'].find(stat_err_msg) > -1)

    good_impute_info = [-8, 5, '-8.0', '5.0000', -7, 0, '0.0']

    for good_impute in good_impute_info:
        print(f'> good impute: {good_impute}')
        new_props = spec_props.copy()
        new_props['fixed_value'] = good_impute
        dp_mean = DPMeanSpec(new_props)
        self.assertTrue(dp_mean.is_chain_valid())
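
# For context: the bad/good cases above are consistent with a plain range
# check of the imputation value against variable_info's min/max (-8 and 5
# in these specs). A hypothetical sketch -- not DPMeanSpec's actual code:
def _sketch_validate_fixed_value(fixed_value, var_min, var_max):
    """Return an error phrase or None; strings like '-8.0' and '5.0000'
    are accepted if they cast cleanly to float."""
    try:
        val = float(fixed_value)
    except (TypeError, ValueError):
        return 'cannot convert to float'
    if val < var_min:
        return 'below min'   # cf. astatic.ERR_IMPUTE_PHRASE_MIN
    if val > var_max:
        return 'above max'   # cf. astatic.ERR_IMPUTE_PHRASE_MAX
    return None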
def test_30_bad_confidence_levels(self):
    """(30) Bad confidence level vals"""
    msgt(self.test_30_bad_confidence_levels.__doc__)

    spec_props = self.spec_props

    def float_range(start, stop, step):
        """Yield floats from start (inclusive) to stop (exclusive),
        stepping by a Decimal to avoid floating-point drift."""
        while start < stop:
            yield float(start)
            start += decimal.Decimal(step)

    for cl_val in list(float_range(-1, 3, '0.08')):
        # print(f'> Invalid cl val: {cl_val}')
        spec_props['cl'] = cl_val
        dp_mean = DPMeanSpec(spec_props)
        self.assertFalse(dp_mean.is_chain_valid())
        self.assertTrue(dp_mean.get_single_err_msg().find(
            VALIDATE_MSG_NOT_VALID_CL_VALUE) > -1)

    for cl_val in ['alphabet', 'soup', 'c']:
        # print(f'> Invalid cl val: {cl_val}')
        spec_props['cl'] = cl_val
        dp_mean = DPMeanSpec(spec_props)
        self.assertFalse(dp_mean.is_chain_valid())
        self.assertTrue(dp_mean.get_single_err_msg().find(
            'Failed to convert "cl" to a float') > -1)
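
# Why float_range uses a Decimal step: repeatedly adding a raw float 0.08
# accumulates representation error, while Decimal('0.08') steps exactly.
# Standalone demo of the same generator (duplicated here so it runs on
# its own):
import decimal

def _float_range_demo(start, stop, step):
    while start < stop:
        yield float(start)
        start += decimal.Decimal(step)

_vals = list(_float_range_demo(-1, 3, '0.08'))
# (3 - -1) / 0.08 == 50 exact steps: first value -1.0, last 2.92
assert len(_vals) == 50 and _vals[0] == -1.0 and _vals[-1] == 2.92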
def test_100_run_dpmean_calculation(self):
    """(100) Run DP mean calculation"""
    msgt(self.test_100_run_dpmean_calculation.__doc__)

    spec_props = self.spec_props

    dp_mean = DPMeanSpec(spec_props)
    self.assertTrue(dp_mean.is_chain_valid())
    if dp_mean.has_error():
        print(dp_mean.get_error_messages())
        return
    # print('\nUI info:', json.dumps(dp_mean.get_success_msg_dict()))

    # ------------------------------------------------------
    # Run the actual mean
    # ------------------------------------------------------

    # Column indexes - We know this data has 20 columns
    #
    col_indexes = list(range(20))

    # File object
    #
    eye_fatigue_filepath = join(TEST_DATA_DIR, 'Fatigue_data.tab')
    # print('eye_fatigue_filepath', eye_fatigue_filepath)
    self.assertTrue(isfile(eye_fatigue_filepath))

    # Call run_chain, closing the file when done
    #
    with open(eye_fatigue_filepath, 'r') as file_obj:
        dp_mean.run_chain(col_indexes, file_obj, sep_char="\t")

    final_dict = dp_mean.get_release_dict()
    self.assertIn('description', final_dict)
    self.assertIn('text', final_dict['description'])
    self.assertIn('html', final_dict['description'])

    print('Actual mean: -0.9503854412185792')
def test_10_valid_spec(self):
    """(10) Run DP Mean valid spec"""
    msgt(self.test_10_valid_spec.__doc__)

    spec_props = {
        'variable': 'EyeHeight',
        'col_index': 19,
        'statistic': astatic.DP_MEAN,
        'dataset_size': 183,
        'epsilon': 1.0,
        'delta': 0.0,
        'cl': astatic.CL_95,
        # 'accuracy': None,
        'missing_values_handling': astatic.MISSING_VAL_INSERT_FIXED,
        'fixed_value': '5',
        'variable_info': {
            'min': -8,
            'max': 5,
            'type': 'Float',
        },
    }

    dp_mean = DPMeanSpec(spec_props)
    self.assertTrue(dp_mean.is_chain_valid())

    for epsilon_val in [0.1, .25, .65, .431, 1.0]:
        print(f'> Valid epsilon val: {epsilon_val}')
        spec_props['epsilon'] = epsilon_val
        dp_mean = DPMeanSpec(spec_props)
        self.assertTrue(dp_mean.is_chain_valid())

    print('   --------')
    for cl_val in [x[0] for x in astatic.CL_CHOICES]:
        print(f'> Valid CL val: {cl_val}')
        spec_props['cl'] = cl_val
        dp_mean = DPMeanSpec(spec_props)
        self.assertTrue(dp_mean.is_chain_valid())

    print('   --------')
    for good_ds in [1, 2, 10, 100, 56 ** 3]:
        spec_props['dataset_size'] = good_ds
        dp_mean = DPMeanSpec(spec_props)
        print(f'> Valid dataset_size: {good_ds}')
        self.assertTrue(dp_mean.is_chain_valid())
def test_10_debug_mean(self):
    """(10) Test DP Mean Spec"""
    msgt(self.test_10_debug_mean.__doc__)

    spec_props = self.spec_props

    dp_mean = DPMeanSpec(spec_props)

    print('(1) Run initial check, before using the OpenDP library')
    print('    - Error found?', dp_mean.has_error())
    if dp_mean.has_error():
        print('\n-- Errors --')
        print(dp_mean.get_error_messages())
        print('\nUI info:', json.dumps(dp_mean.get_error_msg_dict()))
        return

    print('(2) Use the OpenDP library to check validity')
    print('    - Is valid?', dp_mean.is_chain_valid())
    if dp_mean.has_error():
        print('\n-- Errors --')
        print(dp_mean.get_error_messages())
        print('\nUI info:', json.dumps(dp_mean.get_error_msg_dict()))
    else:
        print('\n-- Looks good! --')
        print('\nUI info:', json.dumps(dp_mean.get_success_msg_dict()))
def test_110_run_dpmean_calculation(self):
    """(110) Run another DP mean calculation"""
    msgt(self.test_110_run_dpmean_calculation.__doc__)

    spec_props = self.spec_props_income

    dp_mean = DPMeanSpec(spec_props)
    print('Is this spec valid?', dp_mean.is_chain_valid())
    if dp_mean.has_error():
        print(dp_mean.get_error_messages())
        print(dp_mean.get_error_msg_dict())
        return
    self.assertTrue(dp_mean.is_chain_valid())

    # ------------------------------------------------------
    # Run the actual mean
    # ------------------------------------------------------

    # Column indexes - We know this data has 11 columns
    #
    col_indexes = list(range(11))

    # File object
    #
    pums_filepath = join(TEST_DATA_DIR, 'PUMS5extract10000.csv')
    self.assertTrue(isfile(pums_filepath))

    # Call run_chain, closing the file when done
    #
    with open(pums_filepath, 'r') as file_like_obj:
        dp_mean.run_chain(col_indexes, file_like_obj)

    if dp_mean.has_error():
        print(dp_mean.get_error_messages())
        return

    final_dict = dp_mean.get_release_dict()
    json_str = json.dumps(final_dict, indent=4)
    print(json_str)

    print('-- actual vals --')
    print(('mean: 30,943.4566'
           '\nmin: -10,000.0'
           '\nmax: 713,000.0'))

    self.assertIn('description', final_dict)
    self.assertIn('text', final_dict['description'])
    self.assertIn('html', final_dict['description'])
def test_35_check_confidence_level_alpha(self):
    """(35) Check accuracy with bad confidence level"""
    msgt(self.test_35_check_confidence_level_alpha.__doc__)

    # These scenarios shouldn't happen in practice: "cl" is changed
    # *after* the spec has passed validation.
    #
    spec_props_income = self.spec_props_income.copy()

    dp_mean = DPMeanSpec(spec_props_income)
    self.assertTrue(dp_mean.is_chain_valid())
    self.assertEqual(dp_mean.get_confidence_level_alpha(), astatic.CL_99_ALPHA)

    # Set CL to None -- shouldn't happen, would be caught in the __init__
    #
    dp_mean.cl = None
    cl_alpha = dp_mean.get_confidence_level_alpha()
    self.assertIsNone(cl_alpha)
    self.assertTrue(dp_mean.has_error())
    self.assertTrue(dp_mean.get_single_err_msg().startswith(
        astatic.ERR_MSG_CL_ALPHA_CL_NOT_SET))

    # Set CL to non-numeric -- shouldn't happen, would be caught in the __init__
    #
    spec_props_income2 = self.spec_props_income.copy()
    dp_mean = DPMeanSpec(spec_props_income2)
    self.assertTrue(dp_mean.is_chain_valid())

    dp_mean.cl = 'zebra'
    cl_alpha = dp_mean.get_confidence_level_alpha()
    self.assertIsNone(cl_alpha)
    self.assertTrue(dp_mean.has_error())
    self.assertTrue(dp_mean.get_single_err_msg().startswith(
        astatic.ERR_MSG_CL_ALPHA_CL_NOT_NUMERIC))

    # Set CL to 2.0 -- shouldn't happen, would be caught in the __init__
    #
    spec_props_income3 = self.spec_props_income.copy()
    dp_mean = DPMeanSpec(spec_props_income3)
    self.assertTrue(dp_mean.is_chain_valid())

    dp_mean.cl = 2.0
    cl_alpha = dp_mean.get_confidence_level_alpha()
    self.assertIsNone(cl_alpha)
    self.assertTrue(dp_mean.has_error())
    self.assertTrue(dp_mean.get_single_err_msg().startswith(
        astatic.ERR_MSG_CL_ALPHA_CL_LESS_THAN_0))

    # Set CL to -1 -- shouldn't happen, would be caught in the __init__
    #
    spec_props_income4 = self.spec_props_income.copy()
    dp_mean = DPMeanSpec(spec_props_income4)
    self.assertTrue(dp_mean.is_chain_valid())

    dp_mean.cl = -1.0
    cl_alpha = dp_mean.get_confidence_level_alpha()
    self.assertIsNone(cl_alpha)
    self.assertTrue(dp_mean.has_error())
    self.assertTrue(dp_mean.get_single_err_msg().startswith(
        astatic.ERR_MSG_CL_ALPHA_CL_GREATER_THAN_1))
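
# The four failure modes above suggest get_confidence_level_alpha() derives
# alpha as 1 - cl and rejects out-of-range results. A hypothetical sketch
# (illustrative names only; the real method records the error state on the
# spec and returns None rather than a tuple):
def _sketch_confidence_level_alpha(cl):
    if cl is None:
        return None, 'cl not set'        # cf. ERR_MSG_CL_ALPHA_CL_NOT_SET
    try:
        alpha = 1.0 - float(cl)
    except (TypeError, ValueError):
        return None, 'cl not numeric'    # cf. ERR_MSG_CL_ALPHA_CL_NOT_NUMERIC
    if alpha < 0.0:
        return None, 'alpha < 0'         # cl = 2.0 lands here
    if alpha > 1.0:
        return None, 'alpha > 1'         # cl = -1.0 lands here
    return alpha, None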
def test_20_bad_epsilon(self):
    """(20) Bad epsilon"""
    msgt(self.test_20_bad_epsilon.__doc__)

    spec_props = self.spec_props

    for epsilon_val in [1.01, -0.01, 10]:
        print(f'> Bad epsilon val: {epsilon_val}')
        spec_props['epsilon'] = epsilon_val
        dp_mean = DPMeanSpec(spec_props)
        self.assertFalse(dp_mean.is_chain_valid())

        err_info = dp_mean.get_error_msg_dict()
        self.assertFalse(err_info['valid'])
        print(err_info['message'])
        self.assertTrue(err_info['message'].find(VALIDATE_MSG_EPSILON) > -1)

    for epsilon_val in ['a', 'carrot', 'cake']:
        print(f'> Bad epsilon val: {epsilon_val}')
        spec_props['epsilon'] = epsilon_val
        dp_mean = DPMeanSpec(spec_props)
        self.assertFalse(dp_mean.is_chain_valid())

        err_info = dp_mean.get_error_msg_dict()
        self.assertFalse(err_info['valid'])
        print(err_info['message'])
        self.assertTrue(err_info['message'].find('Failed to convert') > -1)

    spec_props['epsilon'] = 1
    for bad_ds in [-1, 0, 1.0, .03, 'brick', 'cookie']:
        print(f'> Bad dataset_size: {bad_ds}')
        spec_props['dataset_size'] = bad_ds
        dp_mean = DPMeanSpec(spec_props)
        self.assertFalse(dp_mean.is_chain_valid())
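
# Taken together with test_10_valid_spec, these cases imply epsilon is
# accepted only in (0, 1] and dataset_size must be a positive integer.
# A hypothetical precondition check, inferred from the test data above
# (not the library's actual validator):
def _sketch_quick_precheck(epsilon, dataset_size):
    try:
        eps_ok = 0.0 < float(epsilon) <= 1.0
    except (TypeError, ValueError):
        eps_ok = False                   # 'carrot', 'cake', etc.
    size_ok = isinstance(dataset_size, int) and dataset_size > 0
    return eps_ok and size_ok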
def build_stat_specs(self):
    """
    Build a list of StatSpec subclasses that can be used for
    chain validation or running computations
    """
    # Iterate through the stats!
    #
    self.stat_spec_list = []
    stat_num = 0

    # track total epsilon
    #
    running_epsilon = 0

    for dp_stat in self.dp_statistics:
        stat_num += 1  # not used yet...
        """
        We're putting together lots of properties to pass to
        statistic-specific classes such as DPMeanSpec.
        These classes take care of most error checking and validation.

        - Some sample input from the UI -- e.g. contents of "dp_stat":
            {
              "statistic": astatic.DP_MEAN,
              "variable": "EyeHeight",
              "epsilon": 1,
              "delta": 0,
              "error": "",
              "missing_values_handling": astatic.MISSING_VAL_INSERT_FIXED,
              "handle_as_fixed": False,
              "fixed_value": "5.0",
              "locked": False,
              "label": "EyeHeight"
            }
        """
        # -------------------------------------
        # (1) Begin building the property dict
        # -------------------------------------
        props = dp_stat  # start with what is in dp_stat--the UI input
        props['dataset_size'] = self.dataset_size  # add dataset size

        # Some high-level error checks, before making the StatSpec
        #
        variable = props.get('variable')
        statistic = props.get('statistic', 'shrug?')
        epsilon = props.get('epsilon')

        # (1) Is the variable defined?
        #
        if not variable:
            props['error_message'] = ('"variable" is missing from this'
                                      ' DP Stat specification.')
            self.add_stat_spec(DPSpecError(props))
            continue  # to the next dp_stat specification

        # (2) Is this a known statistic? If not, stop here.
        #
        if statistic not in astatic.DP_STATS_CHOICES:
            # also checked in the DPStatisticSerializer
            props['error_message'] = f'Statistic "{statistic}" is not supported'
            self.add_stat_spec(DPSpecError(props))
            continue  # to the next dp_stat specification

        # (3) Add variable_info which has min/max/categories, variable type, etc.
        #
        variable_info = self.analysis_plan.variable_info.get(variable)
        if not variable_info:
            # Temp workaround!!! See Issue #300
            # https://github.com/opendp/dpcreator/issues/300
            variable_info = self.analysis_plan.variable_info.get(camel_to_snake(variable))

        if variable_info:
            props['variable_info'] = variable_info
        else:
            props['error_message'] = 'Variable info not found.'
            self.add_stat_spec(DPSpecError(props))
            continue  # to the next dp_stat specification

        # (4) Retrieve the column index
        #
        col_idx_info = self.analysis_plan.dataset.get_variable_index(variable)
        if col_idx_info.success:
            props['col_index'] = col_idx_info.data
        else:
            props['error_message'] = col_idx_info.message
            self.add_stat_spec(DPSpecError(props))
            continue  # to the next dp_stat specification

        # Okay, "props" is built! Let's see if it works!
        #
        if statistic == astatic.DP_COUNT:
            # DP Count!
            self.add_stat_spec(DPCountSpec(props))

        elif statistic in astatic.DP_HISTOGRAM:
            # DP Histogram!
            self.add_stat_spec(DPHistogramSpec(props))

        elif statistic == astatic.DP_MEAN:
            # DP Mean!
            self.add_stat_spec(DPMeanSpec(props))

        elif statistic == astatic.DP_SUM:
            # DP Sum!
            self.add_stat_spec(DPSumSpec(props))

        elif statistic in astatic.DP_STATS_CHOICES:
            # Stat not yet available or an error
            props['error_message'] = (f'Statistic "{statistic}" will be supported'
                                      f' soon!')
            self.add_stat_spec(DPSpecError(props))
        else:
            # Shouldn't reach here; unknown statistics are caught in check (2) above
            pass
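
# Design note on the if/elif dispatch above: the equality-tested cases could
# be collapsed into a mapping, which grows more easily as statistics are
# added (DP_HISTOGRAM stays separate because it is tested with `in`, i.e.
# membership rather than equality). Illustrative only -- the names mirror
# the imports this module already uses:
_STAT_SPEC_CLASSES = {
    astatic.DP_COUNT: DPCountSpec,
    astatic.DP_MEAN: DPMeanSpec,
    astatic.DP_SUM: DPSumSpec,
}
# e.g. spec_class = _STAT_SPEC_CLASSES.get(statistic)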