def test_knn_imputation_zero_p2():
    """Check that zero-coded and NaN-coded missing values impute alike.

    The same matrix is written twice, once with ``0`` and once with
    ``np.nan`` marking the missing cells. Both imputers use two neighbors
    and uniform weights.
    """
    zero_coded = np.array([
        [1, 0, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 0],
        [6, 6, 0, 6, 6],
    ])
    nan_coded = np.array([
        [1, np.nan, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, np.nan],
        [6, 6, np.nan, 6, 6],
    ])
    expected = np.array([
        [1, 2.5, 1, 1, 1.],
        [2, 2, 2, 2, 2],
        [3, 3, 3, 3, 1.5],
        [6, 6, 2.5, 6, 6],
    ])
    # Column means over the observed (non-NaN) entries only.
    expected_statistics = np.nanmean(nan_coded, axis=0)

    shared_params = dict(n_neighbors=2, weights="uniform")
    zero_imputer = KNNImputer(missing_values=0, **shared_params)
    nan_imputer = KNNImputer(missing_values="NaN", **shared_params)

    assert_array_equal(zero_imputer.fit_transform(zero_coded), expected)
    assert_array_equal(zero_imputer.statistics_, expected_statistics)
    assert_array_equal(zero_imputer.fit_transform(zero_coded),
                       nan_imputer.fit_transform(nan_coded))
def test_weight_uniform():
    """Check that "uniform" weights and a callable returning None agree."""
    X = np.array([
        [0, 0],
        [np.nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])
    expected = np.array([
        [0, 0],
        [5, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])

    def no_weight(dist=None):
        # A callable that supplies no weights at all.
        return None

    # Both weight specifications must produce the same unweighted result.
    for weights in ("uniform", no_weight):
        imputer = KNNImputer(weights=weights)
        assert_array_equal(imputer.fit_transform(X), expected)
def impute_values(df: pd.DataFrame, method: str = 'mean', **kwargs):
    """
    Impute missing values in DataFrame (np.nan or None).
    ------------------------
    Args:
        * df: pd.DataFrame of (samples x features)
        * method: string for what method of imputation to use
            ** 'mean': mean imputation
            ** 'knn': K-NN imputation (see missingpy.KNNImputer)
            ** 'rf': random forest imputation (see missingpy.MissForest)
        * kwargs: forwarded to the imputer constructor for 'knn' and 'rf';
            ignored for 'mean'

    Returns:
        * pd.DataFrame: imputed values (samples x features)

    Raises:
        * AssertionError: if ``method`` is not one of the supported names
    """
    if method not in ('mean', 'knn', 'rf'):
        # Raised explicitly (instead of via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        raise AssertionError('{} not yet implemented.'.format(method))

    if method == 'mean':
        return df.fillna(df.mean(0))

    # 'knn' and 'rf' share the same fit/transform/re-wrap steps.
    imputer_cls = KNNImputer if method == 'knn' else MissForest
    X_impute = imputer_cls(**kwargs).fit_transform(df.values)
    return pd.DataFrame(X_impute, index=df.index, columns=df.columns)
def test_knn_n_neighbors():
    """Check imputation results for 1 and 6 neighbors on the same data."""

    def incomplete_matrix():
        # A fresh copy for each scenario, as in the original test.
        return np.array([
            [0, 0],
            [np.nan, 2],
            [4, 3],
            [5, np.nan],
            [7, 7],
            [np.nan, 8],
            [14, 13]
        ])

    X = incomplete_matrix()
    expected_statistics = np.nanmean(X, axis=0)

    # Scenario 1: a single neighbor.
    expected_1nn = np.array([
        [0, 0],
        [4, 2],
        [4, 3],
        [5, 3],
        [7, 7],
        [7, 8],
        [14, 13]
    ])
    one_nn = KNNImputer(n_neighbors=1)
    assert_array_equal(one_nn.fit_transform(X), expected_1nn)
    assert_array_equal(one_nn.statistics_, expected_statistics)

    # Scenario 2: six neighbors; asking for seven must give the same answer.
    X = incomplete_matrix()
    expected_6nn = np.array([
        [0, 0],
        [6, 2],
        [4, 3],
        [5, 5.5],
        [7, 7],
        [6, 8],
        [14, 13]
    ])
    n_neighbors = 6
    six_nn = KNNImputer(n_neighbors=6)
    seven_nn = KNNImputer(n_neighbors=n_neighbors + 1)
    assert_array_equal(six_nn.fit_transform(X), expected_6nn)
    assert_array_equal(six_nn.statistics_, expected_statistics)
    assert_array_equal(six_nn.fit_transform(X),
                       seven_nn.fit(X).transform(X))
def test_callable_metric():
    """Check that a user-supplied callable can be used as the metric."""

    def custom_callable(x, y, missing_values="NaN", squared=False):
        # l1 norm of the difference, computed on NaN-masked inputs.
        masked_x = np.ma.array(x, mask=np.isnan(x))
        masked_y = np.ma.array(y, mask=np.isnan(y))
        return np.nansum(np.abs(masked_x - masked_y))

    X = np.array([
        [4, 3, 3, np.nan],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [np.nan, 9, 11, 10.]
    ])
    expected = np.array([
        [4, 3, 3, 9],
        [6, 9, 6, 9],
        [4, 8, 6, 9],
        [5, 9, 11, 10.]
    ])

    imputer = KNNImputer(n_neighbors=2, metric=custom_callable)
    assert_array_equal(imputer.fit_transform(X), expected)
def test_complete_features():
    """Check default imputation against hand-computed donor means."""
    X = np.array([
        [0, np.nan, 0, np.nan],
        [1, 1, 1, np.nan],
        [2, 2, np.nan, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [np.nan, 7, 7, 7]
    ])

    # Start from the input and overwrite each missing cell with the mean of
    # the donor slice used by the original expectations.
    expected = X.copy()
    expected[0, 1] = np.mean(X[1:6, 1])
    expected[0, 3] = expected[1, 3] = np.mean(X[2:-1, -1])
    expected[2, 2] = np.nanmean(X[:6, 2])
    expected[7, 0] = np.mean(X[2:-1, 0])

    imputer = KNNImputer()
    assert_array_almost_equal(imputer.fit_transform(X), expected)
def impute_times(final, times_open, times_closed, columns,
                 imputation_method="mean"):
    """
    Impute open work items times with different methods.

    A value of ``0`` in ``times_open`` is treated as missing. ``times_open``
    is modified in place and also returned. Any other ``imputation_method``
    leaves ``times_open`` untouched.

    :param final: Complete preprocessed dataframe
    :param times_open: Dataframe of work items that are not closed
    :param times_closed: Dataframe of work items that are closed
    :param columns: Columns to impute
    :param imputation_method: Choose between 'mean', 'KNN', 'forest'
    :return: Dataframe of open work items with imputed values
    """
    import warnings

    if imputation_method == "mean":
        for col in columns:
            mean = times_closed[col].mean()
            is_missing = times_open[col] == 0
            # Assign the column back instead of a chained in-place mask,
            # which is not guaranteed to write through to the frame.
            times_open[col] = times_open[col].mask(is_missing, mean)

    elif imputation_method in ("KNN", "forest"):
        if imputation_method == "KNN":
            imputer = KNNImputer(missing_values=0, col_max_missing=0.9)
        else:
            imputer = MissForest(missing_values=0)
        # More tolerant imputer, tried only for a column the primary rejects.
        lenient_knn = KNNImputer(missing_values=0, col_max_missing=0.99)

        for col in columns:
            column_frame = pd.DataFrame(final[col])
            imputed = None
            for candidate in (imputer, lenient_knn):
                try:
                    imputed = candidate.fit_transform(column_frame)[:, 0]
                except ValueError:
                    continue
                break
            if imputed is None:
                # Best effort: leave the column as it is, but say so.
                warnings.warn(
                    "Could not impute column {!r}; values left unchanged."
                    .format(col))
                continue

            # Series.mask aligns the replacements on the index, so only the
            # zero entries of the open items are replaced.
            other = pd.Series(imputed, index=final.index)
            is_missing = times_open[col] == 0
            times_open[col] = times_open[col].mask(is_missing, other)

    return times_open
def pre_processing(data_route):
    """Load a CSV and fill missing numeric values with KNN imputation.

    The "cluster", "date" and "country" columns are set aside. Every other
    column is made non-negative, infinities are turned into NaN, and the
    gaps are filled with ``KNNImputer`` using its default settings.

    :param data_route: path (or buffer) passed to ``pd.read_csv``
    :return: DataFrame with the same columns, in the same order, as the CSV
    """
    data_frame = pd.read_csv(data_route)
    id_columns = ["cluster", "date", "country"]

    # Numeric part only: absolute values, with +/-inf treated as missing.
    numeric = data_frame.drop(id_columns, axis=1).abs()
    numeric = numeric.replace([np.inf, -np.inf], np.nan)

    imputer = KNNImputer()
    imputed = imputer.fit_transform(numeric)

    # Keep the real labels and index rather than relabelling by position,
    # which mislabels data unless the CSV starts with country, cluster, date.
    # NOTE(review): assumes fit_transform returns one column per input
    # column - confirm against the imputer in use.
    processed = pd.DataFrame(imputed, index=numeric.index,
                             columns=numeric.columns)
    for name in id_columns:
        processed[name] = data_frame[name]

    return processed[list(data_frame.columns)]
def imputate_using_knn(dataset, k):
    """Impute ``dataset`` with a uniform-weight KNN imputer of ``k`` neighbors.

    The result carries the column labels of ``dataset``; the row index is
    not passed on to the new frame.
    """
    imputer = KNNImputer(n_neighbors=k, weights="uniform")
    # NOTE(review): assumes fit_transform returns a plain array - confirm.
    return pd.DataFrame(imputer.fit_transform(dataset),
                        columns=dataset.columns)
def knn_impute(data, n_neighbors=3):
    """Impute NaN entries of ``data`` with a distance-weighted KNN imputer.

    The result carries the column labels of ``data``; the row index is not
    passed on to the new frame.
    """
    knn = KNNImputer(n_neighbors=n_neighbors,
                     missing_values=np.nan,
                     weights='distance')
    filled = knn.fit_transform(data)
    # NOTE(review): assumes fit_transform returns a plain array - confirm.
    return pd.DataFrame(filled, columns=data.columns)
def outlier_treatment(train_data_frame):
    """Blank out outlier rows and refill them with KNN imputation.

    Works on the columns from "expenses" through "volume". A row counts as
    an outlier when any of those columns has an absolute z-score of 3 or
    more; all numeric values of such a row are replaced by NaN before
    imputation with ``KNNImputer`` defaults.

    :param train_data_frame: DataFrame holding "country", "cluster", "date"
        and the numeric columns "expenses".."volume"
    :return: DataFrame with the id columns and the treated numeric columns,
        in the column order of ``train_data_frame``
    """
    numeric = train_data_frame.loc[:, "expenses":"volume"]
    cleaned = numeric.copy()

    is_outlier_row = ~(np.abs(stats.zscore(cleaned)) < 3).all(axis=1)
    cleaned[is_outlier_row] = np.nan

    imputer = KNNImputer()
    result = imputer.fit_transform(cleaned)

    # Keep index and labels so the id columns below align row by row even
    # when the input does not use a default 0..n-1 index.
    treated = pd.DataFrame(result, index=cleaned.index,
                           columns=cleaned.columns)
    for name in ("date", "cluster", "country"):
        treated.insert(0, column=name, value=train_data_frame[name])

    ordered = [col for col in train_data_frame.columns
               if col in treated.columns]
    return treated[ordered]
def test_knn_imputation_shape():
    """Check that imputation preserves the input shape.

    Runs both weight modes with 1 to 5 neighbors on a random matrix that has
    a single missing entry.
    """
    n_rows = 10
    n_cols = 2
    expected_shape = (n_rows, n_cols)

    X = np.random.rand(n_rows, n_cols)
    X[0, 0] = np.nan

    for weights in ['uniform', 'distance']:
        for n_neighbors in range(1, 6):
            imputer = KNNImputer(n_neighbors=n_neighbors, weights=weights)
            assert_equal(imputer.fit_transform(X).shape, expected_shape)
def do_impute(self, matrix_to_impute):
    """Impute ``matrix_to_impute`` with the configured impute mode.

    The matrix is transposed before imputation and transposed back
    afterwards. The input is also dumped to 'test_cur_matrix_missing.csv'
    in the current working directory before imputing.

    :param matrix_to_impute: 2-D array containing missing values
    :return: imputed matrix in the orientation of the input
    :raises ValueError: if ``self.parameters.impute_mode`` is neither the
        random-forest nor the KNN mode constant
    """
    constants = self.get_parameter_set().constants
    np.savetxt('test_cur_matrix_missing.csv', matrix_to_impute)

    impute_mode = self.parameters.impute_mode
    if impute_mode == constants.v_unsupervised_parameters_impute_mode_randomforest:
        imputed_transposed = self.rfimpute.miss_forest_imputation(
            np.transpose(matrix_to_impute))
    elif impute_mode == constants.v_unsupervised_parameters_impute_mode_knn:
        imputer = KNNImputer(n_neighbors=2,
                             row_max_missing=1,
                             col_max_missing=1)
        imputed_transposed = imputer.fit_transform(
            np.transpose(matrix_to_impute))
    else:
        # Fail with a clear message instead of an UnboundLocalError at the
        # return statement.
        raise ValueError(
            "Unsupported impute mode: {!r}".format(impute_mode))

    return np.transpose(imputed_transposed)
def test_complete_features_weighted():
    """Check distance-weighted imputation against manual weighted means."""
    X = np.array([
        [0, 0, 0, np.nan],
        [1, 1, 1, np.nan],
        [2, 2, np.nan, 2],
        [3, 3, 3, 3],
        [4, 4, 4, 4],
        [5, 5, 5, 5],
        [6, 6, 6, 6],
        [np.nan, 7, 7, 7]
    ])
    dist = pairwise_distances(X, metric="masked_euclidean", squared=False)

    def weighted_mean(row, donors, col):
        # Inverse-distance weights from `row` to each donor row.
        donor_weights = 1.0 / dist[row, donors]
        return np.average(X[donors, col], weights=donor_weights)

    expected = X.copy()
    expected[0, 3] = weighted_mean(0, slice(2, -1), -1)
    expected[1, 3] = weighted_mean(1, slice(2, -1), -1)
    expected[2, 2] = weighted_mean(2, [0, 1, 3, 4, 5], 2)
    expected[7, 0] = weighted_mean(7, slice(2, 7), 0)

    imputer = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), expected)
def impute_missing_for_dataframe(dataframe, target='job_performance'):
    """Impute every column except ``target`` with a 5-neighbor KNN imputer.

    The imputer function should be used on a dataframe that has already
    been numerically encoded. The returned frame holds the imputed feature
    columns followed by the untouched ``target`` column.
    """
    from missingpy import KNNImputer  #, MissForest

    is_feature = dataframe.columns != target
    features = dataframe.loc[:, is_feature].values
    target_values = dataframe[target].values

    imputer = KNNImputer(n_neighbors=5,
                         weights="uniform",
                         metric="masked_euclidean",
                         row_max_missing=0.8,
                         col_max_missing=0.8,
                         copy=True)
    imputed_dataframe = pd.DataFrame(imputer.fit_transform(features),
                                     columns=dataframe.columns[is_feature])
    imputed_dataframe[target] = pd.Series(target_values)
    return imputed_dataframe
def Impute_Data_KNN(X_train, y_train, X_test, y_test, vals_mask, cols, data,
                    var, min_vals, max_vals):
    """Impute features and labels jointly with a 5-neighbor KNN imputer.

    The label vector is appended to the features as an extra column, the
    imputer is fitted on the training block and applied to the test block.
    Labels are returned both as int16 values and thresholded at ``>= 5``.
    Feature columns marked 'cat' in ``var`` are rounded and clipped to
    ``[min_vals[j], max_vals[j]]``; all others are rounded to one decimal.
    ``vals_mask`` and ``cols`` are not used.

    :return: (X_train_imp, y_train_imp, X_test_imp, y_test_imp,
              y_train_imp_orig, y_test_imp_orig)
    """
    n_features = data.shape[1]

    train_block = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    test_block = np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1)

    knn = KNNImputer(n_neighbors=5)
    train_done = knn.fit_transform(train_block)
    test_done = knn.transform(test_block)

    # Split features from the label column again.
    X_train_imp = train_done[:, 0:n_features]
    X_test_imp = test_done[:, 0:n_features]
    train_labels = train_done[:, n_features]
    test_labels = test_done[:, n_features]

    y_train_imp_orig = np.array(train_labels, dtype="int16")
    y_train_imp = np.array(train_labels >= 5, dtype="int16")
    y_test_imp = np.array(test_labels >= 5, dtype="int16")
    y_test_imp_orig = np.array(test_labels, dtype="int16")

    for j in range(X_train_imp.shape[1]):
        is_categorical = var.iloc[j]['type'] == 'cat'
        for block in (X_train_imp, X_test_imp):
            if is_categorical:
                block[:, j] = np.clip(np.round(block[:, j]),
                                      min_vals[j], max_vals[j])
            else:
                block[:, j] = np.round(block[:, j], decimals=1)

    return (X_train_imp, y_train_imp, X_test_imp, y_test_imp,
            y_train_imp_orig, y_test_imp_orig)
def create_2d_velocity_field(self, radii, v_rot, n_interp_r=150,
                             n_interp_theta=150):
    '''
    uses tilted ring model parameters to calculate velocity field
    using eqn 1-3 of 1709.02049 and v_rot from mass model

    it is easier to loop through polar coordinates and then map the v_los
    to the nearest x,y point

    radii / v_rot: sample points of the rotation curve, interpolated with
        interp1d
    n_interp_r / n_interp_theta: number of radial and angular samples

    returns 2d velocity field array
    '''
    v_field = np.full((self.image_ydim, self.image_xdim), np.nan)

    v_rot_interp = interp1d(radii, v_rot)
    radii_interp = np.linspace(np.min(radii), np.max(radii), n_interp_r)
    # The angular grid does not depend on the radius, so build it once.
    thetas = np.linspace(0, 2. * np.pi, n_interp_theta)

    for r in radii_interp:
        v = v_rot_interp(r)
        for theta in thetas:
            x, y, v_los = self._calc_v_los_at_r_theta(v, r, theta)
            inside = (0 < x < self.image_xdim - 1
                      and 0 < y < self.image_ydim - 1)
            if not inside:
                continue
            arr_x, arr_y = int(np.round(x, 0)), int(np.round(y, 0))
            try:
                v_field[arr_y, arr_x] = v_los
            except Exception:
                # Best effort as before, but no longer swallowing
                # KeyboardInterrupt / SystemExit like a bare except does.
                print(arr_x, arr_y, v_los)

    near_neighbors_mask = create_blurred_mask(v_field)

    # Cells outside the mask are set to 0; NaN cells inside the mask are
    # the ones left for the imputer to fill.
    imputer = KNNImputer(n_neighbors=3, weights="distance")
    v_field = imputer.fit_transform(
        np.where(near_neighbors_mask == 1, v_field, 0.))
    v_field[v_field == 0] = np.nan

    # rotate to match the fits data field
    v_field = np.rot90(v_field, 3)
    return v_field
def clean_dragon(save=False):
    """Load the Dragon descriptor table, scale it and impute missing values.

    Reads "cids-smiles-dragon.txt" from ``DATA_DIR`` indexed by "CID", drops
    the first remaining column, keeps only columns with fewer than 500
    missing values, standard-scales them and fills the gaps with a KNN
    imputer. With ``save=True`` the result is also written to
    "cids-smiles-dragon-scaled-imputed.txt" in ``DATA_DIR``.

    :return: the scaled and imputed DataFrame
    """
    raw = pd.read_csv(os.path.join(DATA_DIR, "cids-smiles-dragon.txt"))
    features = raw.set_index("CID").iloc[:, 1:]  # Drop SMILES column

    # Keep only columns with fewer than 500 missing entries.
    kept = features.columns[features.isnull().sum() < 500]
    features = features[kept]

    # Scale to mean 0, variance 1
    scaler = StandardScaler()
    scaled = pd.DataFrame(scaler.fit_transform(features.astype("float")),
                          index=features.index,
                          columns=features.columns)

    # Impute missing values
    # NOTE(review): `k` is passed as written in the original; confirm this
    # matches the constructor of the KNNImputer imported by this module.
    imputer = KNNImputer(k=5)
    df = pd.DataFrame(imputer.fit_transform(scaled.values),
                      index=scaled.index,
                      columns=scaled.columns)

    # Optionally save to disk
    if save:
        df.to_csv(os.path.join(DATA_DIR,
                               "cids-smiles-dragon-scaled-imputed.txt"))
    return df
Mask[index1, index2] = 0 Missing = Image.fromarray(rgbArray) plt.imshow(Missing) plt.show() return out SelectedImage = showImagesRandomImages( 3) #select and image randomly from MNSIT dataset missingPercentage = 0.2 # missing rate percentage missingImage = generateMissingFig( SelectedImage, missingPercentage) #inserting missing values to the original image imputer = KNNImputer(n_neighbors=2, weights="uniform") imputed_by_KNN = imputer.fit_transform(missingImage) KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN) #plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1) #plt.show() imputer = MissForest() MissForest_imputed = imputer.fit_transform(missingImage) MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed) #plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1) #plt.show() imputer = IterativeImputer() MICE_imputed = imputer.fit_transform(missingImage) MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed) #plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1) #plt.show()
) # taking the target values in one dataframe from all dataset dt = data[data.columns[len(data.columns) - 1]] target = dt # the data without target values data = data[data.columns[:64]] # standard scalar(z-score) used for feature selection scaler = StandardScaler() scaled_df = scaler.fit_transform(data) # for knn imputation imputer = KNNImputer(n_neighbors=5, weights="distance") imputed_data = imputer.fit_transform(scaled_df) # used stratified k fold cross validation skf = StratifiedKFold(n_splits=k_fold) # SVM print("SVM") svm_acc = 0 svm_spe = 0 svm_sen = 0 # k fold cross validation loop for train, test in skf.split(imputed_data, target): # divide dataser in training and testing
def predict_age(self, exprdata, genelength=None, chronage=None):
    """Calculate RNA age.

    This function calculates RNA age based on pre-trained predictors.
    Neither ``exprdata`` nor ``chronage`` is modified by this method itself.

    :param exprdata: a pandas DataFrame which contains gene expression data
        with each row represents a gene and each column represents a
        sample. Use the argument "exprtype" to specify raw count or FPKM.
        The index of "exprdata" should be gene ids and columns names of
        "exprdata" should be sample ids.
    :param genelength: a pandas Series, DataFrame, numpy array, or list
        which contains gene length in bp. The size of genelength should be
        equal to the number of rows in exprdata. This argument is optional.
        If using exprtype="FPKM", genelength argument is ignored. If using
        exprtype="count", the raw count will be converted to FPKM. If
        genelength is provided, the function will convert raw count to FPKM
        according to the user-supplied gene length. Otherwise, gene length
        is obtained from the internal database.
    :param chronage: a pandas DataFrame which contains the chronological
        age of each sample. This argument is optional. If provided, it
        should be a DataFrame with 1st column sample id and 2nd column
        chronological age. The sample order in chronage doesn't have to be
        in the same order as in exprdata. However, the samples in chronage
        and exprdata should be the same. If some samples' chronological age
        are not available, users are expected to set the chronological age
        in chronage to NaN. If chronage contains more than 2 columns, only
        the first 2 columns will be considered. If this argument is not
        provided, the age acceleration residual will not be calculated.
        See package tutorial for the definition of age acceleration
        residual.
    :return: a pandas DataFrame contains RNA age.
    :raises AssertionError: if an input fails one of the checks below.
    """
    # check input:
    assert isinstance(exprdata, pd.DataFrame), \
        "exprdata should be a pandas DataFrame."
    # Series.map per column replaces DataFrame.applymap, which is
    # deprecated in recent pandas releases.
    assert exprdata.apply(lambda col: col.map(np.isreal)).all().all(), \
        "Only numeric values are allowed in the exprdata DataFrame."
    assert list(exprdata.index) != list(range(exprdata.shape[0])), \
        "The index of exprdata should be gene ids."
    assert list(exprdata.columns) != list(range(exprdata.shape[1])), \
        "The column names of exprdata should be sample ids."
    assert not exprdata.index.duplicated().any(), \
        "Duplicated gene names found in exprdata."
    assert (exprdata >= 0).all().all(), \
        "Gene expression data cannot contain negative value(s)."

    if chronage is not None:
        assert isinstance(chronage, pd.DataFrame), \
            "chronage should be a pandas DataFrame."
        if chronage.shape[1] > 2:
            print("More than 2 columns are provided in chronage. "
                  "Only the first 2 columns will be used.")
        # Select the 2nd column by position, whatever its label is.
        assert chronage.iloc[:, 1].map(np.isreal).all(), \
            "The 2nd column in chronage should be chronological age."
        # `not`, not `~`: `~` on a Python bool gives -1 or -2, both truthy,
        # so the previous form of this check could never fail.
        assert not chronage.iloc[:, 0].duplicated().any(), \
            "chronage contains duplicated sample ids."
        assert set(chronage.iloc[:, 0].astype(str)) == \
            set(exprdata.columns), \
            "Samples in chronage and exprdata should be the same."
        assert not (chronage.iloc[:, 1] < 0).any(), \
            "Chronological age contains negative value(s)."

    if self._exprtype == "count":
        exprdata = self._count2FPKM(exprdata, genelength)

    if self.idtype != "symbol":
        mg = mygene.MyGeneInfo()
        genes = list(exprdata.index)
        temp = mg.querymany(genes,
                            scopes=self.idtype,
                            fields='symbol',
                            species='human',
                            returnall=True,
                            as_dataframe=True)["out"]
        temp = temp.loc[~temp["symbol"].isna(), "symbol"]
        temp = temp[~temp.index.duplicated(keep="first")]
        temp = temp.drop_duplicates(keep=False)
        # reindex yields NaN for ids without a symbol; plain label indexing
        # raises KeyError for missing labels in current pandas.
        genesymbol = temp.reindex(exprdata.index).fillna("unknown")
        # Relabel a copy so the caller's DataFrame keeps its gene ids.
        exprdata = exprdata.copy()
        exprdata.index = genesymbol

    location = os.path.dirname(os.path.realpath(__file__))
    coef_folder = "all" if self.stype == "all" else "Caucasian"
    tempPath = os.path.join(
        location, "internal_data", coef_folder,
        "coef_{}_{}.csv".format(self._tissue, self._signature))
    sig_internal = pd.read_csv(tempPath, index_col=0)

    # The first row of the coefficient table is used as the intercept
    # below; the remaining rows are the signature genes.
    genes_required = sig_internal.index[1:]
    sig_in_expr = genes_required.isin(exprdata.index)

    # full NA row
    if np.sum(~sig_in_expr) != 0:
        print("{:.2f}% genes in the gene signature are not included in "
              "the supplied gene expression.".format(
                  np.sum(~sig_in_expr) / len(genes_required) * 100))
        # impute the gene expression in the log scale
        tempmat = pd.DataFrame(columns=exprdata.columns,
                               index=genes_required[~sig_in_expr])
        exprdata_withNA = pd.concat([exprdata, tempmat], axis=0)
        exprdata_log = np.log2(exprdata_withNA.apply(pd.to_numeric) + 1)
        ind1 = exprdata_log.isna().all(axis=1)
        ind2 = ~exprdata_log.isna().any(axis=1)
        exprdata_log.loc[(ind1 | ind2), :] = \
            exprdata_log.loc[(ind1 | ind2), :].fillna(exprdata_log.mean())
    else:
        exprdata_log = np.log2(exprdata.apply(pd.to_numeric) + 1)

    # check partial NA
    if not exprdata_log.notnull().all().all():
        # impute the gene expression in the log scale
        imputer = KNNImputer(n_neighbors=min(10, exprdata_log.shape[1]),
                             row_max_missing=1,
                             col_max_missing=1)
        X_imputed = imputer.fit_transform(exprdata_log.transpose())
        exprdata_log_impute = pd.DataFrame(X_imputed).transpose()
        exprdata_log_impute.index = exprdata_log.index
        exprdata_sub = exprdata_log_impute.loc[genes_required, :]
    else:
        exprdata_sub = exprdata_log.loc[genes_required, :]

    # Per sample: weighted sum of the signature genes plus the intercept.
    RNAAge = exprdata_sub.apply(lambda x: np.sum(
        x.multiply(sig_internal.iloc[1:, 0])) + sig_internal.iloc[0, 0])
    res = pd.DataFrame(index=exprdata.columns)
    res["RNAAge"] = list(RNAAge)

    if chronage is not None:
        # Work on a copy so the caller's DataFrame is left untouched. Ids
        # are cast to str, the same form the input check compared against
        # the sample names.
        chronage = chronage.copy()
        chronage.index = chronage.iloc[:, 0].astype(str)
        res["ChronAge"] = chronage.loc[res.index].iloc[:, 1]
        # if sample size is too small, age acceleration residual
        # cannot be calculated
        if res.dropna().shape[0] > 30:
            Y = res["RNAAge"]
            X = res["ChronAge"]
            X = sm.add_constant(X)
            # Samples whose chronological age is NaN (allowed by the
            # contract above) are left out of the fit; their residual
            # stays NaN after index alignment.
            model = sm.OLS(Y, X, missing="drop").fit()
            res["AgeAccelResid"] = model.resid
    return res
# In[6]:

# One-hot encode the categorical columns (Gender and Embarked), then drop
# the 'Sex_female' indicator column.
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder

ohe = ce.OneHotEncoder(handle_unknown='ignore', use_cat_names=True)
train = ohe.fit_transform(train)
train = train.drop(columns=['Sex_female'])

# In[7]:

# Impute with KNN - Age
from missingpy import KNNImputer

imputer = KNNImputer()
X_imputed = imputer.fit_transform(train)
# NOTE(review): assumes fit_transform returns a plain array - confirm.
train_knn = pd.DataFrame(X_imputed, columns=train.columns)

# In[8]:

# Round the imputed ages down to whole years.
age_floor = [math.floor(age) for age in train_knn.Age]
train_knn['Age'] = age_floor

# ## model 1: Random Forest using train_knn (imputation of Age with knn)

# In[9]:
def test_knn_imputation_default():
    """Check imputation with default parameters in four scenarios."""
    nan = np.nan

    # Scenario 1: an imputable matrix.
    X = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, nan],
        [3, 2, 3, nan],
        [nan, 4, 5, 5],
        [6, nan, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    expected = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [16, 15, 18, 19],
    ])
    column_means = np.nanmean(X, axis=0)
    imputer = KNNImputer()
    assert_array_equal(imputer.fit_transform(X), expected)
    assert_array_equal(imputer.statistics_, column_means)

    # Scenario 2: the last row has % missing > row_max_missing and is
    # expected to receive the column means.
    X = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, nan],
        [3, 2, 3, nan],
        [nan, 4, 5, 5],
        [6, nan, 6, 7],
        [8, 8, 8, 8],
        [19, 19, 19, 19],
        [nan, nan, nan, 19],
    ])
    column_means = np.nanmean(X, axis=0)
    mean_c0, mean_c1, mean_c2, _ = column_means
    expected = np.array([
        [1, 0, 0, 1],
        [2, 1, 2, 8],
        [3, 2, 3, 8],
        [4, 4, 5, 5],
        [6, 3, 6, 7],
        [8, 8, 8, 8],
        [19, 19, 19, 19],
        [mean_c0, mean_c1, mean_c2, 19],
    ])
    imputer = KNNImputer()
    assert_array_almost_equal(imputer.fit_transform(X), expected, decimal=6)
    assert_array_almost_equal(imputer.statistics_, column_means, decimal=6)

    # Scenario 3: all neighboring donors also miss the feature.
    X = np.array([
        [1, 0, 0, nan],
        [2, 1, 2, nan],
        [3, 2, 3, nan],
        [4, 4, 5, nan],
        [6, 7, 6, nan],
        [8, 8, 8, nan],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])
    expected = np.array([
        [1, 0, 0, 21],
        [2, 1, 2, 21],
        [3, 2, 3, 21],
        [4, 4, 5, 21],
        [6, 7, 6, 21],
        [8, 8, 8, 21],
        [20, 20, 20, 20],
        [22, 22, 22, 22]
    ])
    column_means = np.nanmean(X, axis=0)
    imputer = KNNImputer()
    assert_array_equal(imputer.fit_transform(X), expected)
    assert_array_equal(imputer.statistics_, column_means)

    # Scenario 4: data passed to fit() and transform() differ.
    X = np.array([
        [0, 0],
        [nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 16]
    ])
    Y = np.array([
        [1, 0],
        [3, 2],
        [4, nan]
    ])
    expected_Y = np.array([
        [1, 0],
        [3, 2],
        [4, 4.8]
    ])
    column_means = np.nanmean(X, axis=0)
    imputer = KNNImputer()
    assert_array_equal(imputer.fit(X).transform(Y), expected_Y)
    assert_array_equal(imputer.statistics_, column_means)
abs(reDataBool.loc[idx[0], idx[1]] ^ dataBool.loc[idx[0], idx[1]]) for idx in randIdxBool ]) #------------------------------------------------------------------------------ #-------------------------Nearest Neighbor Imputation-------------------------- #------------------------------------------------------------------------------ n_neighbors = 5 nan = np.nan #numerical data imputerNum = KNNImputer(missing_values=nan, n_neighbors=n_neighbors, weights="distance") impDataNum = imputerNum.fit_transform(dataNumVal) impDataNum = pd.DataFrame(impDataNum, columns=dataNumVal.columns) # residuals sum of squared errors for the imputed missing values: rssImpNum = sum([ (impDataNum.loc[idx[0], idx[1]] - dataNumNorm.loc[idx[0], idx[1]])**2 for idx in randIdxNum ]) # scale back to normal impDataMax = impDataNum.max(axis=0) impDataMin = impDataNum.min(axis=0) impScDataNum = (impDataNum - impDataMin) * (dataNumMax - dataNumMin) / ( impDataMax - impDataMin) + dataNumMin # insert imputed data in missing values
def test_weight_distance():
    """Check "distance" weighting against manually weighted averages."""
    nan = np.nan

    # Scenario 1: one missing cell, default number of neighbors.
    X = np.array([
        [0, 0],
        [nan, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])
    dist_matrix = pairwise_distances(X, metric="masked_euclidean")
    # Positions 1..5 of row 1's distance ordering, as in the original test.
    donor_rows = np.argsort(dist_matrix[1])[1:6]
    donor_weights = 1 / dist_matrix[1, donor_rows]
    imputed = (np.dot(X[donor_rows, 0], donor_weights)
               / np.sum(donor_weights))

    # Expected result with the hard-coded value ...
    expected_manual = np.array([
        [0, 0],
        [3.850394, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])
    # ... and with the value derived from the distance matrix.
    expected_computed = np.array([
        [0, 0],
        [imputed, 2],
        [4, 3],
        [5, 6],
        [7, 7],
        [9, 8],
        [11, 10]
    ])
    imputer = KNNImputer(weights="distance")
    for expected in (expected_manual, expected_computed):
        assert_array_almost_equal(imputer.fit_transform(X), expected,
                                  decimal=6)

    # Scenario 2: weights = "distance" and n_neighbors=2.
    X = np.array([
        [nan, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])
    expected = np.array([
        [2.3828, 0, 0],
        [2, 1, 2],
        [3, 2, 3],
        [4, 5, 5],
    ])
    column_means = np.nanmean(X, axis=0)
    imputer = KNNImputer(n_neighbors=2, weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), expected, decimal=4)
    assert_array_equal(imputer.statistics_, column_means)

    # Scenario 3: varying missingness patterns.
    X = np.array([
        [1, 0, 0, 1],
        [0, nan, 1, nan],
        [1, 1, 1, nan],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])
    column_means = np.nanmean(X, axis=0)

    dist = masked_euclidean_distances(X)
    col1_donors = [0, 2, 3, 4, 5]
    col3_donors = [0, 3, 4, 5, 6]

    # Inverse-distance weights of the donor rows.
    r1c1_weights = 1 / dist[1, col1_donors]
    r1c3_weights = 1 / dist[1, col3_donors]
    r2c3_weights = 1 / dist[2, col3_donors]

    # Donor values, with invalid entries masked out.
    col1_values = np.ma.masked_invalid(X[col1_donors, 1]).copy()
    col3_values = np.ma.masked_invalid(X[col3_donors, 3]).copy()

    r1c1_imp = np.ma.average(col1_values, weights=r1c1_weights)
    r1c3_imp = np.ma.average(col3_values, weights=r1c3_weights)
    r2c3_imp = np.ma.average(col3_values, weights=r2c3_weights)
    print(r1c1_imp, r1c3_imp, r2c3_imp)

    expected = np.array([
        [1, 0, 0, 1],
        [0, r1c1_imp, 1, r1c3_imp],
        [1, 1, 1, r2c3_imp],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [1, 0, 1, 1],
        [10, 10, 10, 10],
    ])
    imputer = KNNImputer(weights="distance")
    assert_array_almost_equal(imputer.fit_transform(X), expected, decimal=6)
    assert_array_equal(imputer.statistics_, column_means)
from missingpy import KNNImputer
import pandas as pd

# Load the data set that still contains missing values.
df = pd.read_csv("원본_nan.csv")
# print(df)

# Earlier configuration, kept for reference (original note: best performance):
# imputer = KNNImputer(n_neighbors=2,weights="uniform")

# Two-neighbor, uniform-weight imputer with the row and column missing-value
# thresholds both set to 0.9.
imputer = KNNImputer(n_neighbors=2,
                     weights="uniform",
                     col_max_missing=0.9,
                     row_max_missing=0.9)
X_imputed = imputer.fit_transform(df)
print(X_imputed)

# Optional export of the imputed values, currently disabled:
# df2=pd.DataFrame(X_imputed, columns=['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome'])
# print(df2['Insulin'])
# df2.to_csv('test_imputed.csv', index=False, encoding='cp949', mode='w')

# Library home page: https://github.com/epsilon-machine/missingpy
#%% print(df['Embarked'].value_counts()) print(df['Sex'].value_counts()) #%% #droping nulls from embarked alone df.dropna(subset=['Embarked'], axis=0, inplace=True) #%% print(df.shape) #%% from missingpy import KNNImputer imputer = KNNImputer(n_neighbors=2, weights="uniform") X_imputed = imputer.fit_transform(df) df['Age_imputed'] = X_imputed[:, 3].round() print(df[df['Age'].isnull()]) #%% df.drop(['Age'], axis=1, inplace=True) sns.boxplot(df['Age_imputed']) plt.show() #%% print(df['Age_imputed'].describe()) #%% #age_imputed to bins def func3(x): if x < 21: return 0
def imputeKNN(data, **kwargs):
    """Impute ``data`` with a ``KNNImputer`` built from ``kwargs``.

    The result is a DataFrame carrying the index and column labels of
    ``data``.
    """
    filled = KNNImputer(**kwargs).fit_transform(data)
    return pd.DataFrame(filled, index=data.index, columns=data.columns)
def imputeMatrix(dataM):
    """Return ``dataM`` imputed by a 10-neighbor ``KNNImputer``."""
    return KNNImputer(n_neighbors=10).fit_transform(dataM)
df[basement_details] = df[basement_details].fillna('NoBsmt') print( 'As the number of missing data for the following variables are low, we will just be dropping the observations that have missing data for these variables.' ) df = df.dropna(how='any', subset=['MasVnrType', 'MasVnrArea', 'Electrical']) print( 'For the variable LotFrontage, we we will be using K-Nearest Neighbours to impute the missing data.' ) from missingpy import KNNImputer imputer = KNNImputer(n_neighbors=5, weights='distance', metric='masked_euclidean') df.LotFrontage = imputer.fit_transform( np.array(df.drop('FireplaceQu', axis=1).LotFrontage).reshape(-1, 1)) df.FireplaceQu = df.FireplaceQu.fillna('NoFireplc') print('Checking for any more columns with missing data ...') missing_pct(df) ### Checking Data Types ### print('Changing numeric data to categorical ...') df = df.replace({ 'MSSubClass': { 20: "SC20", 30: "SC30", 40: "SC40", 45: "SC45", 50: "SC50",