def test_knn_n_neighbors():
    """KNNImputer must honor ``n_neighbors`` when averaging donor values.

    Checks two settings on the same 7x2 matrix:
      * n_neighbors=1 -> each missing cell copies its single nearest donor.
      * n_neighbors=6 -> each missing cell averages all six other rows, and
        because only six donors exist, n_neighbors=7 must give the same result.
    """
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, np.nan],
        [7, 7],
        [np.nan, 8],
        [14, 13]
    ])
    statistics_mean = np.nanmean(X, axis=0)

    # Test with 1 neighbor: each imputed value equals the nearest row's value.
    X_imputed_1NN = np.array([
        [0, 0],
        [4, 2],
        [4, 3],
        [5, 3],
        [7, 7],
        [7, 8],
        [14, 13]
    ])
    n_neighbors = 1
    imputer = KNNImputer(n_neighbors=n_neighbors)
    assert_array_equal(imputer.fit_transform(X), X_imputed_1NN)
    assert_array_equal(imputer.statistics_, statistics_mean)

    # Test with 6 neighbors: imputed value is the mean over all other rows.
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, np.nan],
        [7, 7],
        [np.nan, 8],
        [14, 13]
    ])
    X_imputed_6NN = np.array([
        [0, 0],
        [6, 2],
        [4, 3],
        [5, 5.5],
        [7, 7],
        [6, 8],
        [14, 13]
    ])
    n_neighbors = 6
    # Use the variable (not a hard-coded 6) so the +1 comparison below stays
    # consistent if the test's neighbor count is ever changed.
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer_plus1 = KNNImputer(n_neighbors=n_neighbors + 1)
    assert_array_equal(imputer.fit_transform(X), X_imputed_6NN)
    assert_array_equal(imputer.statistics_, statistics_mean)
    # Only 6 donors exist, so asking for 7 neighbors must be equivalent.
    assert_array_equal(imputer.fit_transform(X),
                       imputer_plus1.fit(X).transform(X))
class KNN_impute:
    """KNN imputation preceded by per-label median pre-filling.

    Before fitting the underlying ``KNNImputer``, missing entries (encoded as
    ``missing_val_rep``, not NaN) are replaced in place with the median of
    their column *within the same class label*, computed on rows kept by the
    external ``remove_rows`` helper.

    Parameters
    ----------
    missing_val_rep : float, default 0.0
        Placeholder value that marks a missing entry in ``X``.
    k : int, default 10
        Number of neighbors used by the underlying ``KNNImputer``.
    copy : bool, default False
        Passed through to ``KNNImputer``; False imputes in place.
    """

    def __init__(self, missing_val_rep=0.0, k=10, copy=False):
        self.missing_val_rep = missing_val_rep
        # max-missing thresholds of 1.0 disable row/column dropping so the
        # imputer never discards data.  (Was written as the odd literal 01.0.)
        self.imputer = KNNImputer(missing_val_rep, k, copy=copy,
                                  col_max_missing=1.0, row_max_missing=1.0)

    def add_medians(self, X, y):
        """Replace ``missing_val_rep`` in ``X`` (in place) with per-label medians.

        ``X`` temporarily gains a 'labels' column which is dropped again
        before returning; ``y`` must align with ``X``'s rows.
        """
        X['labels'] = y
        # Medians are computed per label on the rows remove_rows() keeps.
        label_meds = remove_rows(X).groupby(by='labels').median()
        for l in tqdm(label_meds.index):
            X[X['labels'] == l] = X[X['labels'] == l].replace(
                self.missing_val_rep, label_meds.loc[l, :].to_dict())
        X.drop(columns=['labels'], inplace=True)

    def fit(self, X, y):
        """Pre-fill ``X`` with label medians (mutates ``X``!) and fit the imputer.

        Returns ``self`` to allow sklearn-style chaining.
        """
        self.add_medians(X, y)
        print('INSIDE IMPUTER: Beginning the fit')
        self.imputer.fit(X)
        print('INSIDE IMPUTER: Completed the fit')
        return self

    def transform(self, X):
        """Return ``X`` with remaining missing values imputed by KNN."""
        return self.imputer.transform(X)
class StandardImputation(BaseEstimator, DataPreprocessorMixin):
    """Standard imputation method for static data.

    Reference 1: https://pypi.org/project/missingpy/
    Reference 2: https://s3.amazonaws.com/assets.datacamp.com/production/course_17404/slides/chapter4.pdf

    Attributes:
        - imputation_model_name: 'mice', 'missforest', 'knn'
        - data_type: 'static'
    """

    def __init__(self, imputation_model_name, data_type):
        # Only allow for certain options
        assert data_type == 'static'
        assert imputation_model_name in ['mice', 'missforest', 'knn']

        self.imputation_model_name = imputation_model_name
        self.data_type = data_type
        # Lazily constructed in fit(); transform() asserts it exists.
        self.imputation_model = None

    def fit(self, dataset):
        """Train standard imputation model.

        Args:
            - dataset: incomplete dataset

        Returns:
            - self, to allow sklearn-style chaining.
        """
        if dataset.static_feature is not None:
            # MICE
            if self.imputation_model_name == 'mice':
                self.imputation_model = IterativeImputer()
            # MissForest
            elif self.imputation_model_name == 'missforest':
                self.imputation_model = MissForest()
            # KNN
            elif self.imputation_model_name == 'knn':
                self.imputation_model = KNNImputer()
            self.imputation_model.fit(dataset.static_feature)
        return self

    def transform(self, dataset):
        """Return imputed dataset by standard imputation.

        Args:
            - dataset: incomplete dataset

        Returns:
            - dataset: imputed dataset by standard imputation.
        """
        assert self.imputation_model is not None

        if dataset.static_feature is not None:
            # Standard imputation
            data_imputed = self.imputation_model.transform(
                dataset.static_feature)
            # Rounding keeps originally-discrete columns discrete.
            dataset.static_feature = rounding(dataset.static_feature,
                                              data_imputed)
        return dataset

    def fit_transform(self, dataset):
        """Fit and transform. Return imputed data.

        Args:
            - dataset: incomplete dataset
        """
        self.fit(dataset)
        return self.transform(dataset)
def test_knn_imputation_default():
    """Exercise KNNImputer with its default parameters on four scenarios."""
    # Scenario 1: an ordinary imputable matrix.
    data = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, np.nan],
        [3, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
        [6, np.nan, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    col_means = np.nanmean(data, axis=0)
    expected = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    knn = KNNImputer()
    assert_array_equal(knn.fit_transform(data), expected)
    assert_array_equal(knn.statistics_, col_means)

    # Scenario 2: a row whose missing fraction exceeds row_max_missing
    # falls back to the column means.
    data = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, np.nan],
        [3, 2, 3, np.nan],
        [np.nan, 4, 5, 5],
        [6, np.nan, 6, 7],
        [8, 8, 8, 8],
        [19, 19, 19, 19],
        [np.nan, np.nan, np.nan, 19],
    ])
    col_means = np.nanmean(data, axis=0)
    r7c0, r7c1, r7c2, _ = col_means
    expected = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [19, 19, 19, 19],
        [r7c0, r7c1, r7c2, 19],
    ])
    knn = KNNImputer()
    assert_array_almost_equal(knn.fit_transform(data), expected, decimal=6)
    assert_array_almost_equal(knn.statistics_, col_means, decimal=6)

    # Scenario 3: every near neighbor is also missing the feature, so the
    # value comes from the remaining complete donors.
    data = np.array([
        [1, 0, 0, np.nan],
        [2, 1, 2, np.nan],
        [3, 2, 3, np.nan],
        [4, 4, 5, np.nan],
        [6, 7, 6, np.nan],
        [8, 8, 8, np.nan],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])
    col_means = np.nanmean(data, axis=0)
    expected = np.array([
        [1, 0, 0, 21],
        [2, 1, 2, 21],
        [3, 2, 3, 21],
        [4, 4, 5, 21],
        [6, 7, 6, 21],
        [8, 8, 8, 21],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])
    knn = KNNImputer()
    assert_array_equal(knn.fit_transform(data), expected)
    assert_array_equal(knn.statistics_, col_means)

    # Scenario 4: fit() and transform() receive different matrices.
    data = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 16]
    ])
    col_means = np.nanmean(data, axis=0)
    other = np.array([
        [1, 0],
        [3, 2],
        [4, np.nan]
    ])
    other_expected = np.array([
        [1, 0],
        [3, 2],
        [4, 4.8]
    ])
    knn = KNNImputer()
    assert_array_equal(knn.fit(data).transform(other), other_expected)
    assert_array_equal(knn.statistics_, col_means)
class Imputer(object):
    """Module for feature imputation."""

    def __init__(self, missing_values='nan', strategy='mean',
                 n_neighbors=5):
        """Imputation of feature values using either sklearn or missingpy.

        Parameters
        ----------
        missing_values : number, string, np.nan (default) or None
            The placeholder for the missing values. All occurrences of
            `missing_values` will be imputed.

        strategy : string, optional (default="mean")
            The imputation strategy.

            Supported using sklearn:
            - If "mean", then replace missing values using the mean along
              each column. Can only be used with numeric data.
            - If "median", then replace missing values using the median
              along each column. Can only be used with numeric data.
            - If "most_frequent", then replace missing using the most
              frequent value along each column. Can be used with strings or
              numeric data.
            - If "constant", then replace missing values with fill_value.
              Can be used with strings or numeric data.

            Supported using missingpy:
            - If 'knn', then use a nearest neighbor search. Can be used with
              strings or numeric data.

        n_neighbors : int, optional (default = 5)
            Number of neighboring samples to use for imputation if method
            is knn.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported options. (Previously
            an unknown strategy left ``self.Imputer`` unset, producing a
            confusing AttributeError only when fit() was later called.)
        """
        # Set parameters to objects
        self.missing_values = missing_values
        self.strategy = strategy
        self.n_neighbors = n_neighbors

        # Depending on the imputation strategy, use a specific toolbox.
        if strategy in ['mean', 'median', 'most_frequent', 'constant']:
            self.Imputer =\
                SimpleImputer(missing_values=self.missing_values,
                              strategy=self.strategy)
        elif strategy == 'knn':
            if missing_values == 'nan':
                # Slightly different API for missingpy
                self.missing_values = 'NaN'
            self.Imputer = KNNImputer(missing_values=self.missing_values,
                                      n_neighbors=self.n_neighbors)
        else:
            raise ValueError(
                f"Unknown imputation strategy '{strategy}'; expected one of "
                "'mean', 'median', 'most_frequent', 'constant', 'knn'.")

    def fit(self, X, y=None):
        """Fit the wrapped imputer on X; y is ignored. Returns self."""
        self.Imputer.fit(X, y)
        return self

    def transform(self, X):
        """Return X with missing values imputed by the fitted imputer."""
        return self.Imputer.transform(X)