def test_non_accepted_cat():
    """Check that fill_missing rejects ``cat_imp="mean"``.

    The call is expected to raise ``AssertionError``; the test fails if the
    call returns normally instead.
    """
    try:
        fill_missing(X_train, X_test, col_dict,
                     num_imp="median", cat_imp="mean")
    except AssertionError:
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing should raise AssertionError for cat_imp='mean'")
def test_bad_col_dict():
    """Check that fill_missing rejects a list passed as ``column_dict``.

    The call is expected to raise ``AssertionError``; the test fails if the
    call returns normally instead.
    """
    try:
        fill_missing(X_train, X_test,
                     column_dict=["cat1", "num1"],
                     num_imp="median", cat_imp="mode")
    except AssertionError:
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing should raise AssertionError when column_dict "
            "is a list")
def test_bad_dict_keys():
    """Check that fill_missing rejects a ``column_dict`` with a wrong key.

    The dictionary uses the key ``'numerical'`` where the other tests in this
    file use ``'numeric'``. The call is expected to raise ``AssertionError``;
    the test fails if the call returns normally instead.
    """
    # NOTE(review): cat_imp="mean" is the same value test_non_accepted_cat
    # expects to be rejected, so the AssertionError may come from that
    # argument rather than from the bad key - confirm, and consider
    # cat_imp="mode" to isolate the key check.
    try:
        fill_missing(
            X_train,
            X_test,
            # insert wrong column key
            column_dict={
                'numerical': ['num1'],
                'categorical': ['cat1']
            },
            num_imp="median",
            cat_imp="mean")
    except AssertionError:
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing should raise AssertionError for this call "
            "(column_dict uses the key 'numerical')")
def test_no_name_columns():
    """Check that fill_missing rejects columns that are absent from the data.

    Both frames are built from plain lists, so neither has a column called
    ``'num2'`` or ``'cat1'``, yet ``column_dict`` lists those names. The call
    is expected to raise ``AssertionError``; the test fails if the call
    returns normally instead.
    """
    # NOTE(review): cat_imp="mean" is the same value test_non_accepted_cat
    # expects to be rejected, so the AssertionError may come from that
    # argument rather than from the missing columns - confirm, and consider
    # cat_imp="mode" to isolate the column check.
    try:
        fill_missing(
            X_train=pd.DataFrame([1.5, 2.5, 3.5, None, 4.5]),
            X_test=pd.DataFrame([1, None, 3, 1, 3]),
            # insert column which is not in the df
            column_dict={
                'numeric': ['num2'],
                'categorical': ['cat1']
            },
            num_imp="median",
            cat_imp="mean")
    except AssertionError:
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing should raise AssertionError for this call "
            "(column_dict names columns missing from the data frames)")
def test_bad_test_set():
    """Check that fill_missing rejects an ``X_test`` that is not a DataFrame.

    ``X_test`` is passed as a NumPy array while ``X_train`` is a valid
    DataFrame. The call is expected to raise ``AssertionError``; the test
    fails if the call returns normally instead.
    """
    try:
        fill_missing(
            X_train=pd.DataFrame({
                'cat1': [1, None, 3, 1, 3],
                'num1': [1.5, 2.5, None, 2.0, 2.0]
            }),
            X_test=np.array(1),
            column_dict={
                'numeric': ['num1'],
                'categorical': ['cat1']
            },
            num_imp="median",
            cat_imp="mode")
    except AssertionError:
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing should raise AssertionError when X_test is "
            "not a DataFrame")
def run_pylaundry():
    """
    Chains categorize, fill_missing, transform_columns and select_features
    on the X_train / X_test / y_train names taken from the enclosing scope

    Arguments
    --------
    NA

    Returns
    ------
    The result of select_features called with n_features=2 on the
    transformed training data
    """
    # step 1 - work out the column categories from the training frame
    column_types = categorize(df=X_train)

    # step 2 - impute missing values (mean for numeric, mode for categorical)
    imputed = fill_missing(X_train, X_test, column_types,
                           num_imp="mean", cat_imp="mode")

    # step 3 - transform the imputed train and test frames
    transformed = transform_columns(imputed['X_train'],
                                    imputed['X_test'],
                                    column_types)

    # step 4 - feature selection on the transformed training frame
    return select_features(transformed['X_train'], y_train, n_features=2)
def test_diff_columns():
    """Check that fill_missing rejects train/test frames whose columns differ.

    ``X_train`` has columns ``cat1`` and ``num2`` while ``X_test`` has
    ``cat1`` and ``num1``. The call is expected to raise ``AssertionError``;
    the test fails if the call returns normally instead.
    """
    # NOTE(review): cat_imp="mean" is the same value test_non_accepted_cat
    # expects to be rejected, so the AssertionError may come from that
    # argument rather than from the column mismatch - confirm, and consider
    # cat_imp="mode" to isolate the column check.
    try:
        fill_missing(
            X_train=pd.DataFrame({
                'cat1': [1, 2, None, 1, 1],
                'num2': [1.5, 2.5, 3.5, None, 4.5]
            }),
            X_test=pd.DataFrame({
                'cat1': [1, None, 3, 1, 3],
                'num1': [1.5, 2.5, None, 2.0, 2.0]
            }),
            column_dict={
                'numeric': ['num1'],
                'categorical': ['cat1']
            },
            num_imp="median",
            cat_imp="mean")
    except AssertionError:
        pass
    else:
        # Without this branch the test would pass even when nothing is raised.
        raise AssertionError(
            "fill_missing should raise AssertionError for this call "
            "(X_train and X_test have different columns)")
def test_mode_cat():
    """Mode imputation should fill ``cat1`` with 1 in both train and test."""
    imputed = fill_missing(X_train, X_test, col_dict,
                           num_imp="median", cat_imp="mode")
    train_df, test_df = imputed['X_train'], imputed['X_test']

    # The same imputed value (1) is expected in the train and the test set.
    assert train_df["cat1"][2] == 1, (
        "Imputed mode value should be 1 in train set")
    assert test_df["cat1"][1] == 1, (
        "Imputed mode value should be 1 in test set")
def test_mean_num():
    """Mean imputation should fill ``num1`` with 3 in both train and test."""
    imputed = fill_missing(X_train, X_test, col_dict,
                           num_imp="mean", cat_imp="mode")
    train_df, test_df = imputed['X_train'], imputed['X_test']

    # The same imputed value (3) is expected in the train and the test set.
    assert train_df["num1"][3] == 3, (
        "Imputed mean value should be 3.0 in train set")
    assert test_df["num1"][2] == 3, (
        "Imputed mean value should be 3.0 in test set")
def test_output_type():
    """fill_missing should return two entries, each a pandas DataFrame."""
    result = fill_missing(X_train, X_test, col_dict,
                          num_imp="mean", cat_imp="mode")
    train_df = result['X_train']
    test_df = result['X_test']

    # Container size first, then the type of each returned frame.
    assert len(result) == 2, (
        "Output of fill_missing() should be two dataframes")
    frame_checks = (
        (train_df, "Training df should be a Pandas DF"),
        (test_df, "Test df should be a Pandas DF"),
    )
    for frame, message in frame_checks:
        assert isinstance(frame, pd.DataFrame), message
def test_non_numeric_columns():
    """Mode imputation should also work on a string-valued ``cat1`` column."""
    train_in = pd.DataFrame({
        'cat1': ['a', 'b', None, 'c', 'a'],
        'num1': [1.5, 2.5, 3.5, None, 4.5]
    })
    test_in = pd.DataFrame({
        'cat1': ['a', 'b', None],
        'num1': [1.5, None, 3.5],
    })
    imputed = fill_missing(X_train=train_in,
                           X_test=test_in,
                           column_dict=col_dict,
                           num_imp="mean",
                           cat_imp="mode")
    train_df, test_df = imputed['X_train'], imputed['X_test']

    # 'a' is the most frequent training value of cat1; the same value is
    # expected in the gap of both the train and the test set.
    assert train_df["cat1"][2] == 'a', (
        "Imputed mode value should be 'a' in train set")
    assert test_df["cat1"][2] == 'a', (
        "Imputed mode value should be 'a' in test set")
def test_median_num():
    """Median imputation should fill ``num1`` with 3 in both train and test."""
    train_in = pd.DataFrame({
        'cat1': [1, 2, None, 1, 1],
        'num1': [1.5, 2.5, 3.5, None, 4.5]
    })
    test_in = pd.DataFrame({
        'cat1': [1, None, 3, 1, 3],
        'num1': [1.5, 2.5, None, 2.0, 2.0]
    })
    columns = {
        'numeric': ['num1'],
        'categorical': ['cat1']
    }
    imputed = fill_missing(X_train=train_in,
                           X_test=test_in,
                           column_dict=columns,
                           num_imp="median",
                           cat_imp="mode")
    train_df, test_df = imputed['X_train'], imputed['X_test']

    # The median of the non-missing training num1 values is 3; the same
    # value is expected in the gap of both the train and the test set.
    assert train_df["num1"][3] == 3, (
        "Imputed median value should be 3.0 in train set")
    assert test_df["num1"][2] == 3, (
        "Imputed median value should be 3.0 in test set")