def test_split_can_use_non_integer_indices(self):
    """Check that `split` works on a frame indexed by string labels.

    A ten-row frame whose index labels are the strings '0'..'9' is split with
    `cross_validation=3`. Each yielded (train, test) pair is compared, ignoring
    dtypes, with the hard-coded frames expected for that fold.
    """

    def _labelled_frame(values):
        # Every row is labelled with the string form of its own value.
        return pd.DataFrame(
            data={'variable': values},
            index=[str(value) for value in values])

    expected_trains = [
        _labelled_frame([4, 5, 6, 7, 8, 9]),
        _labelled_frame([0, 1, 2, 3, 7, 8, 9]),
        _labelled_frame([0, 1, 2, 3, 4, 5, 6]),
    ]
    expected_tests = [
        _labelled_frame([0, 1, 2, 3]),
        _labelled_frame([4, 5, 6]),
        _labelled_frame([7, 8, 9]),
    ]
    inference_data = data_preparation.InferenceData(
        _labelled_frame([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

    folds = inference_data.split(cross_validation=3)
    for fold, wanted_train, wanted_test in zip(
            folds, expected_trains, expected_tests):
        train_part, test_part = fold
        pd.testing.assert_frame_equal(
            train_part.data, wanted_train, check_dtype=False)
        pd.testing.assert_frame_equal(
            test_part.data, wanted_test, check_dtype=False)
def test_zscored_input_raises_warning(self):
    """Check that `address_low_variance` warns on z-scored input.

    Every column of the frame is z-scored (NaNs replaced by 0) before being
    wrapped in `InferenceData`; calling `address_low_variance()` with default
    arguments must then emit a warning.
    """
    column_names = ['variable_0', 'variable_1', 'variable_2', 'variable_3']
    rows = [
        [0.0, 1.0, 0.0, 10.0],
        [-0.5, 1.0, 0.0, 10.0],
        [0.1, 1.0, 0.0, 5.00],
        [0.2, 0.0, 0.0, 0.00],
    ]
    raw_frame = pd.DataFrame(data=rows, columns=column_names)
    zscored_frame = raw_frame.apply(stats.zscore).fillna(0)

    inference_data = data_preparation.InferenceData(zscored_frame)

    with self.assertWarns(Warning):
        inference_data.address_low_variance()
def test_minmaxscaling_with_invalid_threshold_raises_warning(self):
    """Check that min-max scaling with `threshold=.5` emits a warning.

    The test name treats 0.5 as an invalid threshold when `minmax_scaling` is
    enabled; the only behaviour asserted here is that a `Warning` is emitted.
    """
    column_names = ['variable_0', 'variable_1', 'variable_2', 'variable_3']
    rows = [
        [0.0, 1.0, 0.0, 10.0],
        [-0.5, 1.0, 0.0, 10.0],
        [0.1, 1.0, 0.0, 5.00],
        [0.2, 0.0, 0.0, 0.00],
    ]
    frame = pd.DataFrame(data=rows, columns=column_names)
    inference_data = data_preparation.InferenceData(frame)

    with self.assertWarns(Warning):
        inference_data.address_low_variance(
            minmax_scaling=True, threshold=0.5)
def test_impute_missing_values_replaced_with_mean(self):
    """Check the frame returned by `impute_missing_values(strategy='mean')`.

    The input is the `self._missing_data` fixture (defined outside this
    method); the result must equal the hard-coded two-column frame below.
    """
    wanted_rows = [
        [0.4000, 0.0000],
        [0.6000, 0.0000],
        [0.4000, 3.0000],
        [0.2000, 1.0000],
    ]
    inference_data = data_preparation.InferenceData(self._missing_data)
    expected_result = pd.DataFrame(
        data=wanted_rows, columns=['first', 'second'])

    imputed = inference_data.impute_missing_values(strategy='mean')

    pd.testing.assert_frame_equal(imputed, expected_result)
def test_encode_categorical_covariate_dummy_variable_2(self):
    """Check dummy encoding of 'variable_2' with and without `drop_first`.

    Two scenarios run on freshly built, identical input frames:
      1. Default arguments: one indicator column per category (a, b, c).
      2. `drop_first=True`: the indicator for category 'a' is omitted.
    In both cases the indicator columns are expected after the remaining
    original columns.
    """
    input_columns = ['control', 'variable_1', 'variable_2', 'outcome']

    def _fresh_inference_data():
        # Build a new frame per scenario so the two runs share no state.
        frame = pd.DataFrame(
            data=[
                [0.0, 1.0, 'a', 10.0],
                [0.0, 1.0, 'b', 10.0],
                [1.0, 1.0, 'c', 5.00],
                [1.0, 0.0, 'a', 0.00],
            ],
            columns=input_columns)
        return frame

    # Scenario 1: keep every category.
    full_source = _fresh_inference_data()
    full_expected = pd.DataFrame(
        data=[
            [0.0, 1.0, 10.0, 1, 0, 0],
            [0.0, 1.0, 10.0, 0, 1, 0],
            [1.0, 1.0, 5.00, 0, 0, 1],
            [1.0, 0.0, 0.00, 1, 0, 0],
        ],
        columns=[
            'control', 'variable_1', 'outcome',
            'variable_2_a', 'variable_2_b', 'variable_2_c',
        ])
    full_inference = data_preparation.InferenceData(
        full_source, target_column='outcome')
    full_result = full_inference.encode_categorical_covariates(
        columns=['variable_2'])
    pd.testing.assert_frame_equal(full_result, full_expected)

    # Scenario 2: drop the first category.
    reduced_source = _fresh_inference_data()
    reduced_expected = pd.DataFrame(
        data=[
            [0.0, 1.0, 10.0, 0, 0],
            [0.0, 1.0, 10.0, 1, 0],
            [1.0, 1.0, 5.00, 0, 1],
            [1.0, 0.0, 0.00, 0, 0],
        ],
        columns=[
            'control', 'variable_1', 'outcome',
            'variable_2_b', 'variable_2_c',
        ])
    reduced_inference = data_preparation.InferenceData(
        reduced_source, target_column='outcome')
    reduced_result = reduced_inference.encode_categorical_covariates(
        columns=['variable_2'], drop_first=True)
    pd.testing.assert_frame_equal(reduced_result, reduced_expected)
def test_address_collinearity_with_vif_removes_column(self):
    """Check that sequential VIF on iris drops 'petal length (cm)'.

    The four iris features plus the target form the input frame. With
    `sequential=True, interactive=False, drop=True` the result must equal the
    input without the 'petal length (cm)' column.
    """
    iris = datasets.load_iris()
    all_columns = iris['feature_names'] + ['target']
    values = np.c_[iris['data'], iris['target']]
    iris_frame = pd.DataFrame(data=values, columns=all_columns)
    expected_result = iris_frame.drop(columns='petal length (cm)')

    inference_data = data_preparation.InferenceData(
        iris_frame, target_column='target')
    result = inference_data.address_collinearity_with_vif(
        sequential=True,
        interactive=False,
        drop=True)

    pd.testing.assert_frame_equal(result, expected_result)
def test_vif_raises_error_on_ill_conditioned_correlation_matrix(self):
    """Check that VIF raises `SingularDataError` on this four-row frame.

    Automatic handling of singular data is switched off, so the error is
    expected to reach the caller.
    """
    column_names = [
        'control', 'variable_1', 'variable_2', 'variable_3', 'outcome',
    ]
    rows = [
        [1.0, 2.0, 3.0, 4.0, 1.0],
        [0.0, 2.0, 0.0, 1.0, 1.0],
        [1.0, 1.0, 2.0, 5.0, 1.0],
        [0.0, 2.0, 3.0, 0.0, 1.0],
    ]
    ill_conditioned_frame = pd.DataFrame(data=rows, columns=column_names)
    inference_data = data_preparation.InferenceData(
        ill_conditioned_frame, target_column='outcome')

    with self.assertRaises(data_preparation.SingularDataError):
        inference_data.address_collinearity_with_vif(
            handle_singular_data_errors_automatically=False)
def test_fixed_effect_raise_exception_on_categorical_covariate(self):
    """Check that fixed-effect control rejects a string-valued covariate.

    'variable_1' holds strings and is not one of the control columns, so
    `control_with_fixed_effect` must raise `CategoricalCovariateError`.
    """
    row_labels = ['group1', 'group2', 'group3', 'group3']
    column_names = ['control_1', 'control_2', 'variable_1', 'variable_2']
    rows = [
        ['0', 0.0, '1', 3.0],
        ['1', 0.0, '2', 2.0],
        ['1', 1.0, '3', 2.0],
        ['1', 1.0, '4', 1.0],
    ]
    frame = pd.DataFrame(data=rows, columns=column_names, index=row_labels)
    inference_data = data_preparation.InferenceData(frame)

    with self.assertRaises(data_preparation.CategoricalCovariateError):
        inference_data.control_with_fixed_effect(
            strategy='quick',
            control_columns=['control_1', 'control_2'],
            min_frequency=1)
def test_descretize(self, equal_sized_bins, numeric, expected_result):
    """Check `discretize_numeric_covariate` against a supplied expectation.

    Args:
        equal_sized_bins: Forwarded as the `equal_sized_bins` argument.
        numeric: Forwarded as the `numeric` argument.
        expected_result: Frame the call must return (dtypes are not compared).
    """
    values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20]
    frame = pd.DataFrame(data=values, columns=['variable'])
    inference_data = data_preparation.InferenceData(frame)

    discretized = inference_data.discretize_numeric_covariate(
        'variable',
        equal_sized_bins=equal_sized_bins,
        bins=5,
        numeric=numeric)

    pd.testing.assert_frame_equal(
        discretized, expected_result, check_dtype=False)
def test_address_low_variance_removes_column(self):
    """Check that `address_low_variance(drop=True)` drops 'variable_1'.

    'variable_1' is constant (all zeros) in the input; the expected result is
    the input without that column.
    """
    source_frame = pd.DataFrame(
        data=[
            [0.0, 1.0, 0.0, 10.0],
            [0.0, 1.0, 0.0, 10.0],
            [1.0, 1.0, 0.0, 5.00],
            [1.0, 0.0, 0.0, 0.00],
        ],
        columns=['control', 'variable', 'variable_1', 'outcome'])
    expected_result = pd.DataFrame(
        data=[
            [0.0, 1.0, 10.0],
            [0.0, 1.0, 10.0],
            [1.0, 1.0, 5.00],
            [1.0, 0.0, 0.00],
        ],
        columns=['control', 'variable', 'outcome'])

    inference_data = data_preparation.InferenceData(
        source_frame, target_column='outcome')
    result = inference_data.address_low_variance(drop=True)

    pd.testing.assert_frame_equal(result, expected_result)
def test_minmaxscaling_drops_appropriate_variables(self, scaling):
    """Check which columns survive low-variance filtering after scaling.

    The base frame is multiplied by `scaling`. With `threshold=.15`,
    `drop=True` and `minmax_scaling=True`, only 'variable_1' and 'outcome' of
    the scaled frame are expected to remain.

    Args:
        scaling: Factor applied to every value of the base frame.
    """
    base_frame = pd.DataFrame(
        data=[
            [0.0, 1.0, 0.0, 10.0],
            [-0.5, 1.0, 0.0, 10.0],
            [0.1, 1.0, 0.0, 5.00],
            [0.2, 0.0, 0.0, 0.00],
        ],
        columns=['variable_0', 'variable_1', 'variable_2', 'outcome'])
    scaled_frame = base_frame * scaling
    expected_result = scaled_frame[['variable_1', 'outcome']]

    inference_data = data_preparation.InferenceData(scaled_frame)
    result = inference_data.address_low_variance(
        threshold=0.15, drop=True, minmax_scaling=True)

    pd.testing.assert_frame_equal(result, expected_result)
def test_split_without_groups_yields_expected_folds(
        self, cross_validation, expected_trains, expected_tests):
    """Check the folds produced by `split` when no groups are given.

    Args:
        cross_validation: Forwarded to `split` as `cross_validation`.
        expected_trains: Expected training frames, one per fold, in order.
        expected_tests: Expected test frames, one per fold, in order.
    """
    frame = pd.DataFrame({'variable': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})
    inference_data = data_preparation.InferenceData(frame)

    folds = inference_data.split(cross_validation=cross_validation)
    for fold, wanted_train, wanted_test in zip(
            folds, expected_trains, expected_tests):
        train_part, test_part = fold
        pd.testing.assert_frame_equal(
            train_part.data, wanted_train, check_dtype=False)
        pd.testing.assert_frame_equal(
            test_part.data, wanted_test, check_dtype=False)
def _prepare_data_and_target(ready_for_modelling=True):
    """Build a deterministic 20x5 regression fixture wrapped in InferenceData.

    The target is a linear combination of the five feature columns, using
    coefficients that decay exponentially and alternate in sign, plus a small
    fixed noise term (scaled by 0.01).

    Args:
        ready_for_modelling: When true, the private flags
            `_has_control_factors`, `_checked_low_variance` and
            `_checked_collinearity` are set to True on the returned object.

    Returns:
        An `InferenceData` built from the features and a 'target' column, with
        'target' passed as the second constructor argument.
    """
    features = np.array([
        [0.496714150, -0.13826430, 0.647688540, 1.523029860, -0.23415337],
        [-0.23413696, 1.579212820, 0.767434730, -0.46947439, 0.542560040],
        [-0.46341769, -0.46572975, 0.241962270, -1.91328024, -1.72491783],
        [-0.56228753, -1.01283112, 0.314247330, -0.90802408, -1.41230370],
        [1.465648770, -0.22577630, 0.067528200, -1.42474819, -0.54438272],
        [0.110922590, -1.15099358, 0.375698020, -0.60063869, -0.29169375],
        [-0.60170661, 1.852278180, -0.01349722, -1.05771093, 0.822544910],
        [-1.22084365, 0.208863600, -1.95967012, -1.32818605, 0.196861240],
        [0.738466580, 0.171368280, -0.11564828, -0.30110370, -1.47852199],
        [-0.71984421, -0.46063877, 1.057122230, 0.343618290, -1.76304016],
        [0.324083970, -0.38508228, -0.67692200, 0.611676290, 1.030999520],
        [0.931280120, -0.83921752, -0.30921238, 0.331263430, 0.975545130],
        [-0.47917424, -0.18565898, -1.10633497, -1.19620662, 0.812525820],
        [1.356240030, -0.07201012, 1.003532900, 0.361636030, -0.64511975],
        [0.361395610, 1.538036570, -0.03582604, 1.564643660, -2.61974510],
        [0.821902500, 0.087047070, -0.29900735, 0.091760780, -1.98756891],
        [-0.21967189, 0.357112570, 1.477894040, -0.51827022, -0.80849360],
        [-0.50175704, 0.915402120, 0.328751110, -0.52976020, 0.513267430],
        [0.097077550, 0.968644990, -0.70205309, -0.32766215, -0.39210815],
        [-1.46351495, 0.296120280, 0.261055270, 0.005113460, -0.23458713],
    ])

    # Coefficients shrink with the column position and flip sign each step.
    positions = np.arange(features.shape[1])
    signs = (-1) ** positions
    decay = np.exp(-positions / 10)
    coefficients = signs * decay
    # Zero out everything from position 10 on; with five columns this slice
    # is empty, so no coefficient is actually changed.
    coefficients[10:] = 0
    target = np.dot(features, coefficients)

    # Fixed perturbation so the fixture is reproducible.
    noise = np.array([
        0.496714150, -0.13826430, 0.64768854, 1.523029860, -0.23415337,
        -0.23413696, 1.579212820, 0.76743473, -0.46947439, 0.542560040,
        -0.46341769, -0.46572975, 0.24196227, -1.91328024, -1.72491783,
        -0.56228753, -1.01283112, 0.31424733, -0.90802408, -1.41230370,
    ])
    target += 0.01 * noise

    frame = pd.DataFrame(features)
    frame['target'] = target
    inference_data = data_preparation.InferenceData(frame, 'target')

    if not ready_for_modelling:
        return inference_data

    # Mark the preparation steps as already done.
    inference_data._has_control_factors = True
    inference_data._checked_low_variance = True
    inference_data._checked_collinearity = True
    return inference_data
def test_vif_noise_injection_catches_perfect_correlation(self):
    """Check VIF on iris data with an exact duplicate of one feature.

    'perfectly_correlated_column' is a copy of 'petal length (cm)'. With
    automatic handling of singular data enabled, `vif_method='quick'` and
    `vif_threshold=50.0`, both the original and the copy are expected to be
    dropped.
    """
    iris = datasets.load_iris()
    all_columns = iris['feature_names'] + ['target']
    iris_frame = pd.DataFrame(
        data=np.c_[iris['data'], iris['target']], columns=all_columns)
    duplicated_feature = iris_frame['petal length (cm)']
    iris_frame['perfectly_correlated_column'] = duplicated_feature
    expected_result = iris_frame.drop(
        columns=['petal length (cm)', 'perfectly_correlated_column'])

    inference_data = data_preparation.InferenceData(
        iris_frame, target_column='target')
    result = inference_data.address_collinearity_with_vif(
        vif_method='quick',
        drop=True,
        handle_singular_data_errors_automatically=True,
        vif_threshold=50.0)

    pd.testing.assert_frame_equal(result, expected_result)
def test_vif_noise_injection_fails_correctly_when_too_few_samples(self):
    """Check the error raised when automatic noise injection cannot help.

    The frame has three rows and five columns. With automatic handling of
    singular data enabled, VIF must raise `SingularDataError` whose text
    matches the pattern below.
    """
    column_names = [
        'control', 'variable_1', 'variable_2', 'variable_3', 'outcome',
    ]
    rows = [
        [1.0, 2.0, 3.0, 4.0, 1.0],
        [0.0, 2.0, 0.0, 1.0, 1.0],
        [1.0, 1.0, 2.0, 5.0, 1.0],
    ]
    too_few_samples_frame = pd.DataFrame(data=rows, columns=column_names)
    inference_data = data_preparation.InferenceData(
        too_few_samples_frame, target_column='outcome')

    # Used as a regular expression by assertRaisesRegex.
    expected_regex = (
        'Automatic attempt to resolve SingularDataError by '
        'injecting artifical noise to the data has failed. This '
        'probably means the dataset has too many features relative '
        'to the number of samples.')

    with self.assertRaisesRegex(
            data_preparation.SingularDataError, expected_regex):
        inference_data.address_collinearity_with_vif(
            handle_singular_data_errors_automatically=True)
def test_split_with_groups_yields_expected_folds_with_non_overlaping_groups(
        self, cross_validation, groups, expected_trains, expected_tests):
    """Check grouped folds and that no group spans both train and test.

    Args:
        cross_validation: Forwarded to `split` as `cross_validation`.
        groups: Group label per row, forwarded to `split`. It is indexed with
            a list of row positions, so an array-like that supports list
            indexing is assumed (e.g. a NumPy array; confirm with the
            parameter source).
        expected_trains: Expected training frames, one per fold, in order.
        expected_tests: Expected test frames, one per fold, in order.
    """
    frame = pd.DataFrame({'variable': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]})
    inference_data = data_preparation.InferenceData(frame)

    folds = inference_data.split(
        cross_validation=cross_validation, groups=groups)
    for fold, wanted_train, wanted_test in zip(
            folds, expected_trains, expected_tests):
        train_part, test_part = fold
        train_rows = train_part.data.index.tolist()
        test_rows = test_part.data.index.tolist()
        train_groups = set(groups[train_rows])
        test_groups = set(groups[test_rows])

        pd.testing.assert_frame_equal(
            train_part.data, wanted_train, check_dtype=False)
        pd.testing.assert_frame_equal(
            test_part.data, wanted_test, check_dtype=False)
        # A group must never contribute rows to both sides of a fold.
        self.assertEmpty(train_groups & test_groups)
def test_fixed_effect_demeaning_subtract_mean_in_groups(self):
    """Check the frame returned by `control_with_fixed_effect`.

    The call uses the 'quick' strategy with 'control_1' and 'control_2' as
    control columns. The expected frame carries the hard-coded values for
    'variable_1' and 'variable_2' and has both control columns appended to
    the index.
    """
    control_columns = ['control_1', 'control_2']
    source_frame = pd.DataFrame(
        data=[
            ['0', 0.0, 1, 3.0],
            ['1', 0.0, 2, 2.0],
            ['1', 1.0, 3, 2.0],
            ['1', 1.0, 4, 1.0],
        ],
        columns=['control_1', 'control_2', 'variable_1', 'variable_2'],
        index=['group1', 'group2', 'group3', 'group3'])

    expected_flat = pd.DataFrame(
        data=[
            ['0', 0.0, 2.5, 2.0],
            ['1', 0.0, 2.5, 2.0],
            ['1', 1.0, 2.0, 2.5],
            ['1', 1.0, 3.0, 1.5],
        ],
        columns=source_frame.columns,
        index=source_frame.index)
    expected_result = expected_flat.set_index(
        ['control_1', 'control_2'], append=True)

    inference_data = data_preparation.InferenceData(source_frame)
    result = inference_data.control_with_fixed_effect(
        strategy='quick',
        control_columns=control_columns,
        min_frequency=1)

    pd.testing.assert_frame_equal(result, expected_result)
def test_address_collinearity_with_vif_interactive(self, user_inputs,
                                                   expected_dropped,
                                                   sequential):
    """Check interactive VIF using scripted user answers.

    `data_preparation._input_mock` is patched so each call returns the next
    entry of `user_inputs`, in order.

    Args:
        user_inputs: Answers returned by the patched input function, in the
            order they should be consumed.
        expected_dropped: Column label(s) expected to be missing from the
            result.
        sequential: Forwarded to `address_collinearity_with_vif`.
    """
    dataframe = pd.DataFrame(
        data=[
            [1.1, 2.1, 3.1, 4.1, 0],
            [1.0, 2.0, 3.0, 4.0, 0],
            [1.0, 2.0, 3.0, 4.0, 0],
            [1.0, 2.0, 3.0, 4.0, 1],
        ],
        columns=['1', '2', '3', '4', 'target'])
    inference_data = data_preparation.InferenceData(
        dataframe, target_column='target')

    # Stored reversed so that pop() hands the answers out in original order.
    pending_inputs = list(user_inputs)
    pending_inputs.reverse()

    def _next_input(x):
        return pending_inputs.pop()

    # The print mock is patched only to keep Colab/Notebook output out of
    # the test logs.
    with mock.patch.object(data_preparation, '_input_mock') as input_mock, \
            mock.patch.object(data_preparation, '_print_mock'):
        input_mock.side_effect = _next_input
        result = inference_data.address_collinearity_with_vif(
            sequential=sequential, interactive=True, drop=True)

    pd.testing.assert_frame_equal(
        result, dataframe.drop(expected_dropped, axis=1))
def test_vif_error_has_correct_message(self):
    """Check that the `SingularDataError` raised by VIF has the expected text.

    Automatic handling of singular data is switched off so the error reaches
    the caller. The raised exception is captured and its string form is
    checked for the expected message.

    Note: the `msg` keyword of `assertRaises` only customises the failure
    message shown when no exception is raised; it never inspects the
    exception text. The exception is therefore captured and compared
    explicitly.
    """
    ill_conditioned_correlation_matrix_df = pd.DataFrame(
        data=[
            [1.0, 2.0, 3.0, 4.0, 1.0],
            [0.0, 2.0, 0.0, 1.0, 1.0],
            [1.0, 1.0, 2.0, 5.0, 1.0],
            [0.0, 2.0, 3.0, 0.0, 1.0],
        ],
        columns=[
            'control', 'variable_1', 'variable_2', 'variable_3', 'outcome'
        ])
    inference_data = data_preparation.InferenceData(
        ill_conditioned_correlation_matrix_df, target_column='outcome')

    expected_message = (
        'Inference Data has a singular or nearly singular correlation matrix. '
        'This could be caused by extremely correlated or collinear columns. '
        'The three pairs of columns with the highest absolute correlation '
        'coefficients are: (control,variable_3): 0.970, (variable_1,variable_3)'
        ': -0.700, (control,variable_1): -0.577. This could also be caused by '
        'columns with extremiely low variance. Recommend running the '
        'address_low_variance() method before VIF. Alternatively, consider '
        'running address_collinearity_with_vif() with '
        'use_correlation_matrix_inversion=False to avoid this error.')

    with self.assertRaises(data_preparation.SingularDataError) as context:
        inference_data.address_collinearity_with_vif(
            handle_singular_data_errors_automatically=False)

    # Compare the text explicitly; assertRaises alone only checks the type.
    self.assertIn(expected_message, str(context.exception))
def test_invalid_target_column_raise_exception(self):
    """Check that an unknown `target_column` raises `KeyError`.

    The `self._missing_data` fixture (defined outside this method) is passed
    as `initial_data` together with the target column name 'non_ci_sono'.
    """
    unknown_column = 'non_ci_sono'

    with self.assertRaises(KeyError):
        data_preparation.InferenceData(
            initial_data=self._missing_data,
            target_column=unknown_column)
def test_check_data_raises_exception_on_missing_data(self):
    """Check that `data_check(raise_on_error=True)` raises on missing data.

    The input is the `self._missing_data` fixture (defined outside this
    method); the expected exception is `MissingValueError`.
    """
    data_with_gaps = data_preparation.InferenceData(self._missing_data)

    with self.assertRaises(data_preparation.MissingValueError):
        data_with_gaps.data_check(raise_on_error=True)
def test_missing_value_emits_warning_twice(self):
    """Check that `MissingValueWarning` is emitted on repeated construction.

    `InferenceData` is built twice from the same `self._missing_data`
    fixture, and each construction must emit the warning on its own.
    """
    for _ in range(2):
        with self.assertWarns(data_preparation.MissingValueWarning):
            data_preparation.InferenceData(self._missing_data)
def test_vif_method_fails_correctly_with_unknown_value(self):
    """Check that an unrecognised `vif_method` raises `ValueError`.

    The input is the `self._missing_data` fixture (defined outside this
    method); the method name passed is 'incorrect_value'.
    """
    inference_data = data_preparation.InferenceData(self._missing_data)
    unsupported_method = 'incorrect_value'

    with self.assertRaises(ValueError):
        inference_data.address_collinearity_with_vif(
            vif_method=unsupported_method)