def test_cleancategorical_replacemissingnewcategory_noparams(self):
    """Call replace_missing_new_category with no arguments on a frame whose last row is all NaN.

    The assertion expects the missing cells to come back as -1 in the
    numeric columns and "Unknown" in the string column.
    """
    frame = pd.DataFrame(
        [
            [1.0, "Green", 2],
            [1.0, "Other", 1],
            [np.nan, np.nan, np.nan],
        ],
        columns=["col1", "col2", "col3"],
    )
    cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)

    cleaner.replace_missing_new_category()

    expected_rows = [[1, "Green", 2], [1, "Other", 1], [-1, "Unknown", -1]]
    actual_rows = cleaner.x_train.values.tolist()
    self.assertListEqual(actual_rows, expected_rows)
def test_cleancategorical_replacemissingnewcategory_dict(self):
    """Call replace_missing_new_category with a per-column mapping of fill values.

    The assertion expects every missing cell to be replaced by the value
    mapped to its column in ``col_mapping``.
    """
    frame = pd.DataFrame(
        [
            [1, "Green", 2],
            [1, np.nan, 1],
            [np.nan, np.nan, np.nan],
        ],
        columns=["col1", "col2", "col3"],
    )
    fill_by_column = {"col1": 2, "col2": "Blue", "col3": 4}
    cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)

    cleaner.replace_missing_new_category(col_mapping=fill_by_column)

    expected_rows = [
        [1.0, "Green", 2.0],
        [1.0, "Blue", 1.0],
        [2.0, "Blue", 4.0],
    ]
    actual_rows = cleaner.x_train.values.tolist()
    self.assertListEqual(actual_rows, expected_rows)
def test_cleancategorical_replacemissingnewcategory_list_constantnotnone(self):
    """Call replace_missing_new_category with a column list and an explicit new_category.

    Only "col1" and "col3" are listed, so the assertion expects their
    missing cells to become 0 while the missing cell in "col2" stays None.
    """
    source_rows = np.array(
        [(1, "Green", 2), (1, "Other", 1), (None, None, None)]
    )
    frame = pd.DataFrame(source_rows, columns=["col1", "col2", "col3"])
    cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)

    cleaner.replace_missing_new_category(
        list_of_cols=["col1", "col3"], new_category=0
    )

    actual_rows = cleaner.x_train.values.tolist()
    # The expected rows go through the same np.array round-trip as the input.
    expected_rows = np.array(
        [(1, "Green", 2), (1, "Other", 1), (0, None, 0)]
    ).tolist()
    self.assertListEqual(actual_rows, expected_rows)
def test_cleancategorical_replacemissingnewcategory_list_constantisnone(self):
    """Call replace_missing_new_category with a column list and no new_category.

    Only "col1" and "col2" are listed, so the assertion expects them to be
    filled with -1 and "Unknown" respectively, while "col3" keeps its NaN.
    """
    frame = pd.DataFrame(
        [
            [1.0, "Green", 2],
            [1.0, "Other", 1],
            [np.nan, None, np.nan],
        ],
        columns=["col1", "col2", "col3"],
    )
    cleaner = Clean(x_train=frame, test_split_percentage=0.5, split=False)

    cleaner.replace_missing_new_category(list_of_cols=["col1", "col2"])

    # The list assertion compares with ==, and NaN never equals NaN, so any
    # NaN left in the frame is swapped for a placeholder string first.
    placeholder_filled = cleaner.x_train.fillna("NaN was here")
    cleaner._data_properties.x_train = placeholder_filled

    expected_rows = [
        [1, "Green", 2.0],
        [1, "Other", 1.0],
        [-1, "Unknown", "NaN was here"],
    ]
    actual_rows = cleaner.x_train.values.tolist()
    self.assertListEqual(actual_rows, expected_rows)
def test_report_cleaning_new_category(self):
    """Check that the report file mentions every column after replace_missing_new_category.

    A ``Clean`` object is created with ``report_name="test"``, the
    replacement is run with default arguments, and the file at
    ``clean._data_properties.report.filename`` is read back. Each column
    name must appear somewhere in that file's text. The report file is
    deleted even when reading it fails, so a failing run leaves nothing
    behind.
    """
    missing_data = [
        [1.0, "Green", 2],
        [1.0, "Other", 1],
        [np.nan, np.nan, np.nan],
    ]
    columns = ["col1", "col2", "col3"]
    data = pd.DataFrame(missing_data, columns=columns)

    clean = Clean(
        x_train=data,
        test_split_percentage=0.5,
        split=False,
        report_name="test",
    )
    # Only the report written as a side effect matters here; the return
    # value is deliberately ignored.
    clean.replace_missing_new_category()

    report_path = clean._data_properties.report.filename
    try:
        with open(report_path) as f:
            content = f.read()
    finally:
        # Clean up before asserting so a failed check cannot leak the file.
        os.remove(report_path)

    # One assertion per column so a failure names the missing column.
    for column in columns:
        self.assertIn(column, content)