def test_logistic_regression_wrong_type(self):
    """
    Negative test

    data: plain Python list (unsupported type)

    Verifies that logistic_regression() rejects data that is not passed
    in a supported container by raising a TypeError.
    """
    # 1. Arrange
    unsupported_input = [2, 4, np.nan, 1]
    # 2. Act & 3. Assert
    self.assertRaises(TypeError, logistic_regression, unsupported_input)
def test_logistic_regression_wrong_dependent(self):
    """
    Negative test

    data: Correct data frame (df_breast_cancer)
    dependent: 'z' (not a column of df_breast_cancer)

    Verifies that a ValueError is raised when the dependent variable
    names a column that is not present in the data.
    """
    # 1. Arrange
    breast_cancer = generate_df_breast_cancer()
    known_predictors = ['thickness', 'uniformity']
    # 2. Act & 3. Assert
    self.assertRaises(
        ValueError, logistic_regression, breast_cancer, 'z',
        known_predictors)
def test_logistic_regression_wrong_predictor(self):
    """
    Negative test

    data: Correct data frame (df_breast_cancer)
    predictors: ['thickness', 'z'] ('z' is not a column of
    df_breast_cancer)

    Verifies that a ValueError is raised when one of the columns given
    as predictor variables is not present in the data.
    """
    # 1. Arrange
    breast_cancer = generate_df_breast_cancer()
    predictors_with_unknown = ['thickness', 'z']
    # 2. Act & 3. Assert
    self.assertRaises(
        ValueError, logistic_regression, breast_cancer, 'class',
        predictors_with_unknown)
def test_logistic_regression_wrong_regressions(self):
    """
    Negative test

    data: Correct data frame (df_breast_cancer)
    regressions: 'z' (not a valid value)

    Verifies that a ValueError is raised when the regressions parameter
    receives a value that is not valid.
    """
    # 1. Arrange
    breast_cancer = generate_df_breast_cancer()
    known_predictors = ['thickness', 'uniformity']
    # 2. Act & 3. Assert
    self.assertRaises(
        ValueError, logistic_regression, breast_cancer, 'class',
        known_predictors, regressions='z')
def test_logistic_regression_inplace(self):
    """
    Positive test

    data: Correct data frame (df_breast_cancer)

    df_breast_cancer starts out with 15 NA values, of which
    logistic_regression() is expected to impute 7.

    Verifies that, after an in-place call, the data frame that was
    passed in holds 8 NA values.
    """
    # 1. Arrange
    breast_cancer = generate_df_breast_cancer()
    # 2. Act
    logistic_regression(
        breast_cancer, 'class', ['thickness', 'uniformity'], inplace=True)
    # 3. Assert
    remaining_na = breast_cancer.isna().sum().sum()
    self.assertEqual(remaining_na, 8)
def mice_one_imputation(data):
    """Auxiliary function that runs a single MICE pass over the data.

    Every column that contains missing values is first given a
    provisional fill and is then re-imputed with a regression. The
    columns are processed in a randomly shuffled order. The input is
    left untouched; all work happens on a copy.

    :param data: The data on which to perform the imputation.
    :type data: pandas.DataFrame
    :return: A copy of the data with one MICE imputation performed.
    :rtype: pandas.DataFrame
    """
    # Remember where the gaps were before anything gets filled in:
    na_mask = pd.isna(data)
    # Work on a copy so the caller's data frame is never modified:
    res = data.copy()
    # Collect the columns that need imputing and randomise their order:
    pending = [name for name in data.columns if na_mask[name].any()]
    shuffle(pending)
    # Classify each column once, always looking at the original data:
    numeric_flag = {
        name: is_numeric_dtype(data[name]) for name in data.columns}
    # Provisional fill: mean for numeric columns, random sample otherwise:
    for target in pending:
        if numeric_flag[target]:
            mean_substitution(res, columns=[target], inplace=True)
        else:
            random_sample_imputation(res, columns=[target], inplace=True)
    # Numeric columns serve as predictors for the linear regressions.
    # NOTE(review): this list can contain the target column itself;
    # confirm that linear_regression() tolerates that.
    numerics = [name for name in data.columns if numeric_flag[name]]
    # Re-open the original gaps of each column and impute them by
    # regression (linear for numeric columns, logistic otherwise):
    for target in pending:
        gaps = na_mask[target]
        if numeric_flag[target]:
            res.loc[gaps, target] = np.nan
            linear_regression(res, target, predictors=numerics, inplace=True)
        else:
            res.loc[gaps, target] = None
            logistic_regression(res, target, inplace=True)
    return res
def test_logistic_regression_returning(self):
    """
    Positive test

    data: Correct data frame (df_breast_cancer)

    df_breast_cancer starts out with 15 NA values, of which
    logistic_regression() is expected to impute 7.

    Verifies that the original data frame still holds its 15 NA values
    and that the returned data frame holds 8.
    """
    # 1. Arrange
    original = generate_df_breast_cancer()
    # 2. Act
    imputed = logistic_regression(
        original, 'class', ['thickness', 'uniformity'])
    # 3. Assert
    na_in_original = original.isna().sum().sum()
    self.assertEqual(na_in_original, 15)
    na_in_imputed = imputed.isna().sum().sum()
    self.assertEqual(na_in_imputed, 8)