def test_discretize_features_one_feature():
    """Check discretize_features() when a single feature is requested.

    Only ``test_feature_1`` is discretized, with ``n_bins=3``. The expected
    result is a 5x3 array holding one indicator column per bin: the values
    1, 3 and 5 are expected in the first, second and third bin respectively.
    """
    feature_values = [1, 1, 3, 5, 5]
    class_labels = [1, 1, 1, 0, 0]
    # Both feature columns carry the same value in every record.
    records = [
        {
            'test_feature_1': value,
            'test_feature_2': value,
            'test_class': label,
        }
        for value, label in zip(feature_values, class_labels)
    ]
    frame = pd.DataFrame(records)

    wrangler = DataWrangling()
    actual = wrangler.discretize_features(data_frame=frame,
                                          features=['test_feature_1'],
                                          n_bins=3)

    # Row i of the expected matrix is the identity row of the bin that
    # record i should fall into.
    expected_bin_per_record = [0, 0, 1, 2, 2]
    expected = np.eye(3)[expected_bin_per_record]
    assert np.array_equal(actual, expected)
def test_fill_nul_vals():
    """Check fill_nul_vals() on a small frame containing zero entries.

    The expected frame has every zero in the two feature columns replaced
    (1.5 and 5.0 for class 1, 1.0 for class 0), which is consistent with
    filling from the non-zero values of the same feature within the same
    class. Non-zero entries and the class column are expected unchanged.
    """
    column_names = ('test_feature_1', 'test_feature_2', 'test_class')

    input_rows = [
        (0, 0, 1),
        (1, 5, 1),
        (2, 0, 1),
        (1, 3, 0),
        (0, 2, 0),
    ]
    input_frame = pd.DataFrame(
        [dict(zip(column_names, row)) for row in input_rows])

    wrangler = DataWrangling()
    actual = wrangler.fill_nul_vals(
        data_frame=input_frame,
        class_column='test_class',
        features=['test_feature_1', 'test_feature_2'])

    # Filled-in cells are written as floats; the mix of ints and floats
    # makes both feature columns float-typed in the expected frame.
    expected_rows = [
        (1.5, 5.0, 1),
        (1, 5, 1),
        (2, 5.0, 1),
        (1, 3, 0),
        (1.0, 2, 0),
    ]
    expected = pd.DataFrame(
        [dict(zip(column_names, row)) for row in expected_rows])

    assert actual.equals(expected)
def test_split_train_test():
    """Check the structure of the value returned by split_train_test().

    Only the shape of the result is verified: it must contain the keys
    ``x_train``, ``y_train``, ``x_test`` and ``y_test``, and every value it
    holds must be a pandas Series or DataFrame.
    """
    feature_values = [1, 1, 3, 5, 5]
    class_labels = [1, 1, 1, 0, 0]
    frame = pd.DataFrame([
        {
            'test_feature_1': value,
            'test_feature_2': value,
            'test_class': label,
        }
        for value, label in zip(feature_values, class_labels)
    ])
    feature_names = ['test_feature_1', 'test_feature_2']
    label_column = 'test_class'

    wrangler = DataWrangling()
    result = wrangler.split_train_test(
        x_data=frame[feature_names].to_numpy(),
        y_labels=frame[label_column].to_numpy(),
        features=feature_names,
        test_size=0.33)

    required_keys = ['x_train', 'y_train', 'x_test', 'y_test']
    assert all(key in result for key in required_keys)

    pandas_types = (pd.Series, pd.DataFrame)
    assert all(isinstance(split, pandas_types) for split in result.values())
def test_sklearn_prediction_score():
    """Test that get_sklearn_prediction_score() returns a float in [0, 1].

    The module-level fixtures (``data``, ``features``, ``no_zero_features``,
    ``class_col``, ``n_bins`` and ``test_size``) are run through
    preprocess_data(), and the resulting splits are handed to
    get_sklearn_prediction_score().
    """
    # The fixtures are only read here, so no ``global`` declaration is needed.
    obj = DataWrangling()
    splits = preprocess_data(obj=obj,
                             data=data,
                             features=features,
                             no_zero_features=no_zero_features,
                             class_col=class_col,
                             n_bins=n_bins,
                             test_size=test_size)

    prediction_score = get_sklearn_prediction_score(splits)

    assert isinstance(prediction_score, float)
    assert 0.0 <= prediction_score <= 1.0
def test_preprocess_data():
    """Test the structure and dimensions of preprocess_data()'s result.

    The module-level fixtures (``data``, ``features``, ``no_zero_features``,
    ``class_col``, ``n_bins`` and ``test_size``) are preprocessed. The result
    must contain the four split keys, hold only pandas Series/DataFrame
    values, and have 4 training rows, 2 test rows and 6 training columns.
    """
    # The fixtures are only read here, so no ``global`` declaration is needed.
    obj = DataWrangling()
    return_val = preprocess_data(obj=obj,
                                 data=data,
                                 features=features,
                                 no_zero_features=no_zero_features,
                                 class_col=class_col,
                                 n_bins=n_bins,
                                 test_size=test_size)

    expected_keys = ['x_train', 'y_train', 'x_test', 'y_test']
    assert all(key in return_val for key in expected_keys)
    assert all(
        isinstance(val, (pd.Series, pd.DataFrame))
        for val in return_val.values())

    # Row counts of the two splits, then the width of the training set.
    assert return_val['x_train'].shape[0] == 4
    assert return_val['x_test'].shape[0] == 2
    assert return_val['x_train'].shape[1] == 6
def get_data(n_bins: int) -> dict:
    """Load the test data set, preprocess it and return train/test splits.

    The feature columns and ``Response`` are cast to float64 and passed
    through ``fill_nul_vals``; every feature is then discretized into
    ``n_bins`` bins. The discretized columns are named ``<feature><bin>``
    with bins numbered from 1. The result is split with ``test_size=0.25``.

    Args:
        n_bins (int): Number of bins to discretize features into.

    Returns:
        dict: Dictionary containing test and training sets and labels.
    """
    label_column = 'Response'
    feature_names = [
        'Pglucose', 'Preg', 'Dbp', 'BMI', 'Dpfunc', 'Age', 'Insulin', 'Skin'
    ]

    # The labels attached to the discretized frame come from this raw read
    # of the CSV, not from the float-cast frame used for preprocessing.
    raw_frame = pd.read_csv(TEST_DATA_PATH)

    wrangler = DataWrangling(data_path=TEST_DATA_PATH)
    wrangler.load_data()

    selected_columns = feature_names + [label_column]
    float_frame = wrangler.data[selected_columns].astype(np.float64)
    filled_frame = wrangler.fill_nul_vals(data_frame=float_frame,
                                          class_column=label_column,
                                          features=feature_names)

    # One output column per (feature, bin) pair, feature-major order.
    binned_names = [
        name + str(bin_number)
        for name in feature_names
        for bin_number in range(1, n_bins + 1)
    ]

    binned_array = wrangler.discretize_features(data_frame=filled_frame,
                                                features=feature_names,
                                                n_bins=n_bins)
    binned_frame = pd.DataFrame(binned_array, columns=binned_names)
    binned_frame[label_column] = raw_frame[label_column]

    return wrangler.split_train_test(
        x_data=binned_frame[binned_names].to_numpy(),
        y_labels=binned_frame[label_column].to_numpy(),
        features=binned_names,
        test_size=0.25)
def test_load_data():
    """Check that load_data() exposes the same columns as the CSV file.

    The columns of ``obj.data`` after loading are compared element-wise
    with the columns pandas reads directly from the same path.
    """
    csv_path = 'src/data/data.csv'

    wrangler = DataWrangling(data_path=csv_path)
    wrangler.load_data()

    loaded_columns = wrangler.data.columns
    reference_columns = pd.read_csv(csv_path).columns
    assert all(loaded_columns == reference_columns)