def fit(self, X, y=None):
    """Set up the SoftImpute solver(s) used by this transformer.

    Nothing is fitted here; the solvers are only constructed and stored on
    ``self.imputer``:

    * ``y`` given: a dict with one entry per unique label in ``y``.
    * ``self.groupby`` set: a dict with one entry per unique value of that
      column of ``X``.
    * neither: a single ``SoftImpute`` instance.

    Each dict entry maps the class to ``{"impute": solver, "mask": selector}``
    where ``selector`` is the element-wise comparison against that class.

    :param X: input table; must be a ``pandas.DataFrame``.
    :param y: optional labels, compared element-wise with each class.
    :return: ``self``.
    :raises AssertionError: if ``X`` is not a DataFrame, or if both ``y`` and
        ``self.groupby`` are supplied.
    """
    assert isinstance(X, pd.DataFrame)
    has_labels = y is not None
    has_groupby = self.groupby is not None
    self.imputer = []

    # Simple case first: one solver for the whole table.
    if not (has_labels or has_groupby):
        self.imputer = SoftImpute(max_iters=self.max_iters, **self.kwargs)
        logger.info("""Building Soft Imputation Transformer""")
        return self

    # Labels and a group-by column are mutually exclusive.
    assert not (has_groupby and has_labels)
    if has_groupby:
        classes = X[self.groupby].unique()
        selector_source = X[self.groupby]
    else:
        classes = np.unique(y)
        selector_source = y

    per_class = {}
    for cls in classes:
        per_class[cls] = {
            "impute": SoftImpute(max_iters=self.max_iters, **self.kwargs),
            "mask": selector_source == cls,
        }
    self.imputer = per_class

    logger.info(
        """Building Soft Imputation Transformers for {} classes""".format(
            len(classes)))
    return self
def test_soft_impute_with_low_rank_random_matrix():
    """Default SoftImpute should keep the missing-entry MAE below 0.1.

    Uses the module-level fixtures ``XY``, ``XY_incomplete`` and
    ``missing_mask`` and the helper ``reconstruction_error``.
    """
    completed = SoftImpute().fit_transform(XY_incomplete)
    _unused, mae_on_missing = reconstruction_error(
        XY, completed, missing_mask, name="SoftImpute")
    assert mae_on_missing < 0.1, "Error too high!"
def __init__(self):
    """Create the wrapper around a default-configured ``SoftImpute`` solver.

    Takes no parameters; the solver is stored on ``self._imputer``.
    """
    solver = SoftImpute()
    self._imputer = solver
def softimpute_used(X, X_incomplete, missing_mask, count_miss):
    """Complete ``X_incomplete`` with SoftImpute (no BiScaler step).

    ``X``, ``missing_mask`` and ``count_miss`` are accepted but not used by
    the current body.

    :param X_incomplete: matrix to complete.
    :return: whatever ``SoftImpute.complete`` returns for ``X_incomplete``.
    """
    solver = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    return solver.complete(X_incomplete)
def fun(lambd_val):
    """Return the MSE on held-out entries for one shrinkage value.

    Reads ``X``, ``X_incomplete``, ``missing_mask``, ``max_rank``, ``verbose``,
    ``max_iters`` and ``convergence_threshold`` from the enclosing scope.

    :param lambd_val: value passed to SoftImpute as ``shrinkage_value``.
    :return: mean squared difference between the completed matrix and ``X``
        over ``missing_mask``.
    """
    solver_options = dict(
        shrinkage_value=lambd_val,
        init_fill_method='mean',
        max_rank=max_rank,
        verbose=verbose,
        max_iters=max_iters,
        convergence_threshold=convergence_threshold,
    )
    # Work on a copy so the shared incomplete matrix is never handed out.
    completed = SoftImpute(**solver_options).fit_transform(
        X_incomplete.copy())
    residual = completed[missing_mask] - X[missing_mask]
    return (residual ** 2).mean()
def softimpute_used_for_cv(X, X_incomplete, missing_mask, count_miss,
                           defined_missing_percent, limit1, limit2,
                           percentile):
    """Complete ``X_incomplete`` with SoftImpute and score it per magnitude band.

    The true values at ``missing_mask`` are split into three bands and an RMSE
    is computed for each:

    * key ``10``: ``-percentile[10] < v < percentile[10]``
    * key ``5``:  ``percentile[10] < |v| < percentile[5]``
    * key ``2``:  ``percentile[5] < |v| < percentile[2]``

    All comparisons are strict, so a value exactly on a boundary belongs to no
    band. ``count_miss``, ``defined_missing_percent``, ``limit1`` and
    ``limit2`` are accepted but not used by the current body.

    :return: ``(X_filled, rmse_percentile)`` where ``rmse_percentile`` is a
        ``defaultdict(float)`` keyed by 10, 5 and 2.
    :raises ZeroDivisionError: if a band contains no values.
    """
    solver = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    X_filled_softimpute_no_biscale = solver.complete(X_incomplete)

    truth = X[missing_mask]
    estimate = X_filled_softimpute_no_biscale[missing_mask]

    # (result key, membership test on the true value), in evaluation order.
    bands = (
        (10, lambda v: v < percentile[10] and v > percentile[10] * (-1)),
        (5, lambda v: abs(v) < percentile[5] and abs(v) > percentile[10]),
        (2, lambda v: abs(v) < percentile[2] and abs(v) > percentile[5]),
    )

    rmse_percentile = defaultdict(float)
    for key, in_band in bands:
        band_truth = []
        band_estimate = []
        for actual, predicted in zip(truth, estimate):
            if in_band(actual):
                band_truth.append(actual)
                band_estimate.append(predicted)
        truth_arr = np.asarray(band_truth)
        estimate_arr = np.asarray(band_estimate)
        squared_error = float(((estimate_arr - truth_arr) ** 2).sum())
        rmse_percentile[key] = np.sqrt(squared_error / len(estimate_arr))

    return (X_filled_softimpute_no_biscale, rmse_percentile)
def Initialize_X_incomplete(X_incomplete, test_filename, train_filename):
    """Complete ``X_incomplete`` with SoftImpute and describe its missing entries.

    :param X_incomplete: 2-D numeric array in which missing entries are NaN.
    :param test_filename: unused; kept so existing callers keep working.
    :param train_filename: unused; kept so existing callers keep working.
    :return: ``(X, missing_mask, count_miss)`` where ``X`` is the completed
        matrix, ``missing_mask`` is a boolean array that is True where
        ``X_incomplete`` is NaN, and ``count_miss`` is the number of True
        entries as a Python ``int``.
    """
    softImpute = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    X = softImpute.complete(X_incomplete)

    # Vectorised replacement for the former element-by-element double loop.
    # The mask is still taken after completion, as before.
    missing_mask = np.asarray(np.isnan(X_incomplete), dtype=bool)
    count_miss = int(missing_mask.sum())
    return (X, missing_mask, count_miss)
def forward(ctx, input):
    """Randomly mask each image, reconstruct it with SoftImpute, then normalise.

    Each image of the batch is mapped to [-1, 1], a random subset of its
    entries is hidden (set to NaN), SoftImpute fills them back in, and the
    result is clipped to [-1, 1] and mapped back to [0, 1]. The whole batch is
    finally shifted by the module-level ``mean``, divided by ``std`` and moved
    to ``device``.

    Reads the module-level ``args``, ``globe_train``, ``mask_train_cnt``,
    ``mean``, ``std`` and ``device``.

    :param ctx: unused by this body.
    :param input: tensor whose ``size()`` unpacks to (batch, c, h, w).
    :return: float tensor on ``device``, created from an array shaped like
        ``input``.
    """
    batch_num, c, h, w = input.size()
    output = torch.zeros_like(input).cpu().numpy()

    for i in range(batch_num):
        img = (input[i] * 2 - 1).cpu().numpy()

        # Probability that an entry stays observed: scheduled while training,
        # drawn uniformly from [startp, endp] otherwise.
        if globe_train:
            keep_prob = (args.startp + mask_train_cnt *
                         (args.endp - args.startp) / args.mask_num)
        else:
            keep_prob = random.uniform(args.startp, args.endp)

        if args.me_channel == 'concat':
            # Lay the three channels side by side: shape (h, 3 * w).
            img = np.concatenate((np.concatenate(
                (img[0], img[1]), axis=1), img[2]), axis=1)
            mask = np.random.binomial(
                1, keep_prob, h * w * c).reshape(h, w * c).astype(float)
            mask[mask < 1] = np.nan
            W = SoftImpute(verbose=False).fit_transform(mask * img)
            est_matrix = (np.clip(W, -1, 1) + 1) / 2
            for channel in range(c):
                # Each channel occupies w columns of the concatenated matrix.
                # The previous code stepped by h here, which is only correct
                # for square images.
                output[i, channel] = est_matrix[:, channel * w:
                                                (channel + 1) * w]
        else:
            # One mask shared by all channels, each completed independently.
            mask = np.random.binomial(
                1, keep_prob, h * w).reshape(h, w).astype(float)
            mask[mask < 1] = np.nan
            for channel in range(c):
                mask_img = img[channel] * mask
                W = SoftImpute(verbose=False).fit_transform(mask_img)
                output[i, channel] = (np.clip(W, -1, 1) + 1) / 2

    output = output - mean
    output /= std
    output = torch.from_numpy(output).float().to(device)
    return output
def preprocessingData_pialadunia2018(dataset):
    """Impute, round and split the 2018 data set into features and target.

    Columns from position 2 onward are completed with SoftImpute and written
    back into ``dataset`` (the caller's frame is modified). The listed
    statistic columns are then rounded to 0 decimals.

    :param dataset: DataFrame containing a ``'Hasil'`` column.
    :return: ``(features, target)`` where ``features`` is the rounded frame
        without ``'Hasil'`` and ``target`` is ``dataset['Hasil']``.
    """
    rounded_columns = (
        'HTRF', 'THFP', 'PHPAR', 'PHSPAR', 'PHPSAR', 'PHDAR', 'PHTSA',
        'PHDBAR', 'PHAAR', 'ATRF', 'TAFP', 'PAPAR', 'PASPAR', 'PAPSAR',
        'PADAR', 'PATSA', 'PADBAR', 'PAAAR',
    )

    # Solve missing values in place.
    to_complete = dataset.iloc[:, 2:]
    dataset.iloc[:, 2:] = SoftImpute().complete(to_complete)

    rounded = dataset.round(dict.fromkeys(rounded_columns, 0))
    dataset_independent = rounded.drop('Hasil', axis=1)
    dataset_dependent = dataset['Hasil']
    return dataset_independent, dataset_dependent
def preprocessingData(dataset):
    """Impute, round and split the data set; label-encode the target.

    Columns from position 5 onward are completed with SoftImpute and written
    back into ``dataset`` (the caller's frame is modified). The listed
    statistic columns are then rounded to 0 decimals. The target is taken
    from the column at position 4 and passed through a ``LabelEncoder``.

    :param dataset: DataFrame containing a ``'Hasil'`` column.
    :return: ``(features, target)`` where ``features`` is the rounded frame
        without ``'Hasil'`` and ``target`` is a one-column DataFrame named
        ``'Hasil'`` holding the encoded labels.
    """
    rounded_columns = (
        'HTRF', 'THFP', 'PHPAR', 'PHSPAR', 'PHPSAR', 'PHDAR', 'PHTSA',
        'PHDBAR', 'PHAAR', 'ATRF', 'TAFP', 'PAPAR', 'PASPAR', 'PAPSAR',
        'PADAR', 'PATSA', 'PADBAR', 'PAAAR',
    )

    # Solve missing values in place.
    to_complete = dataset.iloc[:, 5:]
    dataset.iloc[:, 5:] = SoftImpute().complete(to_complete)

    rounded = dataset.round(dict.fromkeys(rounded_columns, 0))
    dataset_independent = rounded.drop('Hasil', axis=1)

    # Label-encode the target column.
    raw_target = dataset.iloc[:, [4]].values
    encoded_target = LabelEncoder().fit_transform(raw_target)
    dataset_dependent_baru = pd.DataFrame(encoded_target, columns=['Hasil'])
    return dataset_independent, dataset_dependent_baru
def cmd(in_mat_file, dims, suffix, i_loo, j_loo, loo_output, loo_only,
        verbose, seed):
    """Read M_partial from IN_MAT_FILE and complete the matrix using soft-impute method."""
    # Only leave-one-out mode (both i_loo and j_loo > -1) is implemented.
    # Otherwise the matrix is still completed and NotImplementedError is
    # raised afterwards. `suffix` and `verbose` are not used by this body.
    M = io.loadmat(in_mat_file)['M_partial']
    rank = dims

    leave_one_out = i_loo > -1 and j_loo > -1
    if leave_one_out:
        # Remember the held-out entry, then blank it so it gets imputed.
        held_out = M[i_loo, j_loo]
        M[i_loo, j_loo] = 0

    # Unpacking into two names requires M to be 2-D.
    _n_comments, _n_voters = M.shape

    # Zeros mark unobserved entries; the solver expects NaN instead.
    M[M == 0] = np.nan
    M_complete = SoftImpute(max_rank=dims).complete(M)

    if not leave_one_out:
        raise NotImplementedError('Use randomized_svd here.')

    file_tmpl = f'{in_mat_file}.r{rank}.s{seed}.i{i_loo}.j{j_loo}.soft-impute.out'
    if not loo_only:
        io.savemat(file_tmpl + '.mat', {'Mhat': M_complete})

    if loo_output is not None:
        op_loo_file = loo_output
    else:
        op_loo_file = file_tmpl + '.loo'

    loo_pred = M_complete[i_loo, j_loo]
    with open(op_loo_file, 'wt') as f:
        f.write('{}, {}'.format(held_out, loo_pred))

    print('Done at', datetime.now())
def baseline_inpute(X_incomplete, method='mean', level=0):
    """Fill ``X_incomplete`` with one of several baseline imputers.

    ``level`` indexes a three-element list of hyper-parameter choices for the
    selected method (it is ignored for ``'mean'``):

    * ``'knn'``: ``k`` in ``[3, 10, 50]``
    * ``'svd'``: rank in ``[ceil((d-1)/10), ceil((d-1)/5), d-1]`` with
      ``d = X_incomplete.shape[1]``
    * ``'mice'``: ``max_iter`` in ``[3, 10, 50]``
    * ``'spectral'``: SoftImpute ``shrinkage_value`` in ``[0.5, None, 3]``

    :param X_incomplete: matrix with missing entries.
    :param method: one of ``'mean'``, ``'knn'``, ``'svd'``, ``'mice'``,
        ``'spectral'``.
    :param level: index into the per-method choice list.
    :return: result of the chosen imputer's ``fit_transform``.
    :raises NotImplementedError: for any other ``method``.
    """
    if method == 'mean':
        return SimpleFill().fit_transform(X_incomplete)

    if method == 'knn':
        neighbours = [3, 10, 50][level]
        return KNN(k=neighbours, verbose=False).fit_transform(X_incomplete)

    if method == 'svd':
        rank_choices = [
            np.ceil((X_incomplete.shape[1] - 1) / 10),
            np.ceil((X_incomplete.shape[1] - 1) / 5),
            X_incomplete.shape[1] - 1,
        ]
        chosen_rank = int(rank_choices[level])
        return IterativeSVD(rank=chosen_rank,
                            verbose=False).fit_transform(X_incomplete)

    if method == 'mice':
        iterations = [3, 10, 50][level]
        return IterativeImputer(
            max_iter=iterations).fit_transform(X_incomplete)

    if method == 'spectral':
        # The default sparsity level is relative to the maximum singular
        # value; the choices here are heuristic.
        shrinkage = [0.5, None, 3][level]
        return SoftImpute(
            shrinkage_value=shrinkage).fit_transform(X_incomplete)

    raise NotImplementedError
def fancyimpute_matrix_completion(function,
                                  gram_drop,
                                  seqs=None,
                                  sigma=None,
                                  triangular=None,
                                  num_process=4,
                                  drop_flag_matrix=None):
    """Merge GAK-computed entries into ``gram_drop`` and complete the rest.

    ``gram_drop`` is modified in place: its diagonal is set to 1 and every
    entry for which ``gak.gram_gak`` produced a non-NaN value is copied in.
    The remaining NaNs are filled by the imputer named by ``function``.

    :param function: ``"SoftImpute"``, ``"KNN"`` or ``"IterativeSVD"``.
    :param gram_drop: 2-D matrix with NaN for unknown entries.
    :param seqs, sigma, triangular, num_process, drop_flag_matrix: forwarded
        to ``gak.gram_gak``.
    :return: the completed matrix.
    :raises AssertionError: if a GAK-computed entry is not NaN in
        ``gram_drop``.

    An unsupported ``function`` prints a message and calls ``exit(-1)``.
    """
    gak_gram = gak.gram_gak(seqs,
                            sigma=sigma,
                            triangular=triangular,
                            num_process=num_process,
                            drop_flag_matrix=drop_flag_matrix)

    for row in range(len(gram_drop)):
        gram_drop[row, row] = 1
        for col in range(len(gram_drop[0])):
            if not np.isnan(gak_gram[row, col]):
                # A GAK value may only land on a slot that is still unknown.
                assert np.isnan(gram_drop[row, col])
                gram_drop[row, col] = gak_gram[row, col]

    if function == "SoftImpute":
        solver = SoftImpute()
    elif function == "KNN":
        solver = KNN()
    elif function == "IterativeSVD":
        solver = IterativeSVD()
    else:
        print("unsupported fancyimpute functin")
        exit(-1)
    return solver.complete(gram_drop)
def fit(self, trainset):
    """Build the user-by-item rating matrix and complete it with SoftImpute.

    Unrated cells are NaN before completion. The completed matrix is stored
    on ``self.predictions``.

    :param trainset: object exposing ``n_users``, ``n_items`` and
        ``all_ratings()`` yielding ``(user, item, rating)`` triples.
    :return: ``self``.
    """
    AlgoBase.fit(self, trainset)

    # Start from an all-NaN matrix and drop in the known ratings.
    ratings = np.zeros((trainset.n_users, trainset.n_items))
    ratings[:] = np.nan
    for user, item, rating in trainset.all_ratings():
        ratings[user, item] = rating

    solver = SoftImpute(
        shrinkage_value=self.lmbda,
        max_iters=self.max_iter,
        max_rank=self.max_rank,
        min_value=self.min_value,
        max_value=self.max_value,
        verbose=self.verbose,
    )
    self.predictions = solver.fit_transform(ratings)
    return self
def impute(data):
    """Fill the NaNs of ``data`` in place using SoftImpute.

    Rows and columns that are entirely NaN are excluded from the SoftImpute
    call. Afterwards, all-NaN columns are 0 in the kept rows, and any NaN that
    is still present (the all-NaN rows) is set to 0.

    :param data: 2-D float array; modified in place.
    :return: ``None``.
    """
    nan_cells = np.isnan(data)
    keep_rows = ~np.all(nan_cells, axis=1)
    keep_cols = ~np.all(nan_cells, axis=0)

    usable = data[keep_rows][:, keep_cols]
    completed = SoftImpute().fit_transform(usable)

    # Re-insert the dropped columns (as zeros) before writing the rows back.
    widened = np.zeros([usable.shape[0], data.shape[1]])
    widened[:, keep_cols] = completed
    data[keep_rows, :] = widened

    data[np.isnan(data)] = 0
def filtering(food_list, food_a, food_b):
    """Append two new rating rows to the stored ratings and impute all gaps.

    Loads ``./resource/meal_problem/final_rating_data.csv``, drops its first
    column, appends one row built from ``food_a`` and one from ``food_b``
    (via ``build_new_row``), and completes the float columns with SoftImpute.

    :param food_list: forwarded to ``build_new_row``.
    :param food_a: forwarded to ``build_new_row`` for the first new row.
    :param food_b: forwarded to ``build_new_row`` for the second new row.
    :return: DataFrame of completed values carrying the original column labels.
    """
    df = pd.read_csv('./resource/meal_problem/final_rating_data.csv')
    df = df.iloc[:, 1:]
    # NOTE(review): DataFrame.append was removed in pandas 2.0. Migrating to
    # pd.concat needs the return type of build_new_row, which is not visible
    # here -- confirm and migrate.
    df = df.append(build_new_row(food_list, food_a), ignore_index=True)
    df = df.append(build_new_row(food_list, food_b), ignore_index=True)
    # `np.float` was only an alias for the builtin `float` and no longer
    # exists in NumPy >= 1.24; the builtin selects the same columns.
    df_numeric = df.select_dtypes(include=[float]).to_numpy()
    df_new = pd.DataFrame(SoftImpute().fit_transform(df_numeric))
    # Assumes every column of df is float, otherwise the lengths differ.
    df_new.columns = df.columns
    return df_new
def impute(self, trained_model, input):
    """
    Impute the given table with BiScaler normalisation followed by SoftImpute.

    The table is normalised with a freshly fitted ``BiScaler``, completed with
    a default ``SoftImpute`` and mapped back through the scaler's
    ``inverse_transform``.

    :param trained_model: trained model returned by the train function - not used here
    :param input: input table which needs to be imputed
    :return: the imputed table in the original scale
    """
    scaler = BiScaler()
    normalized = scaler.fit_transform(input)
    completed_normalized = SoftImpute().fit_transform(normalized)
    return scaler.inverse_transform(completed_normalized)
def softimp(img, maskp):
    """Preprocess an image by random masking and Soft-Impute reconstruction.

    Pixel values are scaled from [0, 255] to [-1, 1] before matrix estimation
    and scaled back afterwards; the estimate is clipped to [-1, 1] in between.
    [Mazumder, R. et al. Spectral regularization algorithms for learning large
    incomplete matrices. 2010.]

    With ``args.me_channel == 'concat'`` the first three channels are laid
    side by side and completed as one matrix; otherwise one mask is shared and
    each channel is completed on its own.

    :param img: original image, shape (h, w, c)
    :param maskp: observation probability of each entry in mask matrix
    :return: preprocessed image as a float array of shape (h, w, c)
    """
    h, w, c = img.shape
    scaled = img.astype('float64') * 2 / 255 - 1
    outputs = np.zeros((h, w, c))

    if args.me_channel == 'concat':
        planes = scaled.transpose(2, 0, 1)
        wide = np.concatenate((planes[0], planes[1], planes[2]), axis=1)
        mask = np.random.binomial(1, maskp,
                                  h * w * c).reshape(h, w * c).astype(float)
        mask[mask < 1] = np.nan
        estimate = SoftImpute(verbose=False).fit_transform(mask * wide)
        est_matrix = (np.clip(estimate, -1, 1) + 1) * 255 / 2
        for channel in range(c):
            first_col = channel * w
            outputs[:, :, channel] = est_matrix[:, first_col:first_col + w]
        return outputs

    mask = np.random.binomial(1, maskp, h * w).reshape(h, w).astype(float)
    mask[mask < 1] = np.nan
    for channel in range(c):
        estimate = SoftImpute(verbose=False).fit_transform(
            scaled[:, :, channel] * mask)
        outputs[:, :, channel] = (np.clip(estimate, -1, 1) + 1) * 255 / 2
    return outputs
def fancy_predict(train,
                  test_data_points,
                  max_rank=8,
                  shrinkage_value=0.02,
                  max_iters=50):
    '''
    Generates predictions for test data points using FancyImpute's
    dense implementation of SoftImpute.

    The training matrix is bi-scaled with ``fancy_biscale``, its zeros are
    treated as missing, and the predictions are mapped back with
    ``fancy_remove_biscale``.

    :param train: training matrix; zeros mark unobserved entries.
    :param test_data_points: pair ``(rows, cols)`` of index sequences.
    :param max_rank, shrinkage_value, max_iters: forwarded to SoftImpute.
    :return: whatever ``fancy_remove_biscale`` returns for the list of
        ``(prediction, row, col)`` triples.
    '''
    scaled, rowscale, colscale, rowcenter, colcenter = fancy_biscale(train)
    scaled[scaled == 0] = np.nan

    solver = SoftImpute(shrinkage_value=shrinkage_value,
                        max_rank=max_rank,
                        max_iters=max_iters,
                        init_fill_method='zero',
                        verbose=False)
    completed = solver.complete(scaled)

    rows, cols = test_data_points[0], test_data_points[1]
    triples = [(completed[r, c], r, c) for r, c in zip(rows, cols)]
    return fancy_remove_biscale(triples, rowscale, colscale, rowcenter,
                                colcenter)
def construct_low_rank_imputer(method, k):
    """Build an imputer for the requested method.

    :param method: ``"SoftImpute"`` (``k`` is the maximum rank), ``"KNN"``
        (``k`` is the neighbour count) or ``'II'`` (IterativeImputer with
        ``min_value=0``; ``k`` is ignored).
    :param k: rank / neighbour count, see ``method``.
    :return: the constructed, unfitted imputer.
    :raises NotImplementedError: for any other ``method``.
    """
    if method == "SoftImpute":
        return SoftImpute(max_rank=k, verbose=False)
    if method == "KNN":
        return KNN(k=k, verbose=False)
    if method == 'II':
        return IterativeImputer(min_value=0)
    # Raising a bare string is itself a TypeError in Python 3; raise a real
    # exception so callers see the intended message.
    raise NotImplementedError("Not implemented")
def test_estimators(X, y, dum_enc, classification=True):
    """Score a fixed set of imputation algorithms on ``(X, y)``.

    Every imputer is evaluated with ``imputation_score_classification`` when
    ``classification`` is truthy and with ``imputation_score_regression``
    otherwise; the wall-clock time of each evaluation is recorded.

    :param X: feature matrix (its ``shape`` bounds the IterativeSVD rank).
    :param y: target passed to the scoring function.
    :param dum_enc: forwarded to ``create_mode_mean_imputer``.
    :param classification: selects the scoring function.
    :return: ``(imp_scores, times, imputer_dict)``, three dicts keyed by
        imputer name.
    """
    mode_mean_imputer = create_mode_mean_imputer(X, dum_enc)

    # (name, estimator object, inductive) for every algorithm under test.
    impute_estimators = [
        ("ModeMeanImputer", mode_mean_imputer, True),
        ("KNNImputer", KNNImputer(), True),
        ("Iter_BayesianRidge",
         IterativeImputer(estimator=BayesianRidge(), random_state=0), True),
        ("Iter_DecisionTree",
         IterativeImputer(estimator=DecisionTreeRegressor(
             max_features='sqrt', random_state=0), random_state=0), True),
        ("Iter_RF",
         IterativeImputer(estimator=RandomForestRegressor(
             n_estimators=100, random_state=0), random_state=0), True),
        ("Iter_ExtraTrees",
         IterativeImputer(estimator=ExtraTreesRegressor(
             n_estimators=100, random_state=0), random_state=0), True),
        ("Iter_KNRegr",
         IterativeImputer(estimator=KNeighborsRegressor(n_neighbors=15),
                          random_state=0), True),
        ("Iter_SVD",
         IterativeSVD(rank=min(min(X.shape) - 1, 10), verbose=False), False),
        ("SoftImpute", SoftImpute(verbose=False), False),
    ]

    if classification:
        score = imputation_score_classification
    else:
        score = imputation_score_regression

    imp_scores = {}
    times = {}
    for name, estimator, inductive in impute_estimators:
        started = time.time()
        imp_scores[name] = score(X, y, name, estimator, inductive)
        times[name] = time.time() - started

    imputer_dict = {name: estimator
                    for name, estimator, _ in impute_estimators}
    return imp_scores, times, imputer_dict
def fi_complete(self, X, method='mf', **params):
    """Complete ``X`` with the chosen imputer and store it on ``self.X_filled``.

    :param X: matrix with missing entries.
    :param method: ``'mf'`` (MatrixFactorization, needs ``params['rank']``),
        ``'knn'`` (KNN, needs ``params['k']``) or ``'soft'`` (SoftImpute with
        defaults). Any other value leaves ``self.X_filled`` untouched.
    :param params: per-method settings, see ``method``.
    :return: ``None``.
    """
    if method == 'mf':
        solver = MatrixFactorization(params['rank'])
    elif method == 'knn':
        # Nearest rows that have the feature fill each row's missing values.
        solver = KNN(params['k'])
    elif method == 'soft':
        # Singular-value thresholding instead of solving the nuclear-norm
        # objective directly.
        solver = SoftImpute()
    else:
        return
    self.X_filled = solver.complete(X)
def fancyImputeAttempts(data, dataframe):
    """Impute ``data`` with KNN, SoftImpute and IterativeSVD and report each.

    Every completed matrix is handed to ``doiteration`` together with
    ``dataframe``, preceded by a printed heading.

    :param data: array-like convertible to a float array; NaN marks missing.
    :param dataframe: forwarded unchanged to ``doiteration``.
    :return: ``None``.
    """
    # `np.float` was an alias of the builtin and is gone in NumPy >= 1.24.
    data = np.array(data, float)

    # All three imputers run before anything is reported, as before.
    filled_knn = KNN(k=3, verbose=False).complete(data)
    filled_softimpute = SoftImpute(verbose=False).complete(data)
    filled_svd = IterativeSVD(verbose=False).complete(data)

    # print() with a single argument gives the same output on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("\nKNN computations\n")
    doiteration(filled_knn, dataframe)
    print("\n SOFTIMPUTE computations\n")
    doiteration(filled_softimpute, dataframe)
    print("\n SVD computations\n")
    doiteration(filled_svd, dataframe)
def isvt(self):
    """
    Complete the rating matrix with SoftImpute from the fancyimpute library.

    fancyimpute expects unobserved entries to be NaN, so the zeros of a copy
    of ``self.sparse`` are converted before the solver runs (at most 100
    iterations).

    :return: ``(rating, filled)`` - the NaN-marked copy and the completed
        matrix.
    """
    rating = self.sparse.copy()
    unobserved = rating == 0
    rating[unobserved] = np.nan

    solver = SoftImpute(max_iters=100)
    filled = solver.fit_transform(rating)
    return (rating, filled)
def __init__(self, method, **kwargs):
    """Select and construct the underlying imputer.

    :param method: ``"SoftImpute"``, ``"KNN"`` or ``"Naive"`` (SimpleFill).
        ``'II'`` is recognised but rejected as untested.
    :param kwargs: forwarded to the SoftImpute / KNN constructor; ignored for
        ``"Naive"``.
    :raises NotImplementedError: for ``'II'`` and for any unknown method.
        ``self.clf`` is ``None`` and ``self.method`` is set when this happens.
    """
    self.clf = None
    self.method = method
    if method == "SoftImpute":
        self.clf = SoftImpute(**kwargs)
    elif method == "KNN":
        self.clf = KNN(**kwargs)
    elif method == "Naive":
        self.clf = SimpleFill()
    elif method == 'II':
        # An IterativeImputer(min_value=0) assignment used to follow the
        # raise here and could never run; it is dropped until the path is
        # tested.
        raise NotImplementedError('NOT TESTED')
    else:
        # Raising a bare string is a TypeError in Python 3, hence a real
        # exception carrying the same message.
        raise NotImplementedError("Not Implemented method")
def netPred(self, method='mf', dim=100, alpha=0.1):
    '''
    Compute a dense prediction matrix from ``self.mat`` into ``self.pred``.

    supported methods: mf, cf, mnmf, fancy_nnm, fancy_soft

    :param method: ``'mf'`` (NMF), ``'cf'`` (implicit ALS), ``'mnmf'``, or any
        name containing ``'fancy'`` plus ``'nnm'`` (NuclearNormMinimization)
        or ``'soft'`` (SoftImpute). Other values leave ``self.pred`` untouched.
    :param dim: number of components / factors for mf, cf and mnmf.
    :param alpha: regularisation strength for mf, cf and mnmf.
    :return: ``None``.
    '''
    if method == 'mf':
        model = NMF(n_components=dim, alpha=alpha, l1_ratio=0.2)
        W = model.fit_transform(self.mat)
        H = model.components_
        self.pred = np.matmul(W, H)
    elif method == 'cf':
        model = implicit.als.AlternatingLeastSquares(factors=dim,
                                                     regularization=alpha)
        model.fit(self.mat)
        self.pred = np.matmul(model.item_factors, model.user_factors.T)
    elif method == 'mnmf':
        self.pred = mnmf(self.mat, dim, alpha)
    elif 'fancy' in method:
        # `np.float` was an alias of the builtin and is gone in
        # NumPy >= 1.24; the builtin yields the same float64 array.
        X = self.mat.toarray().astype(float)
        # The solvers treat NaN, not 0, as "unobserved".
        X[X == 0] = np.nan
        if 'nnm' in method:
            self.pred = NuclearNormMinimization(
                error_tolerance=0.01).complete(X)
        elif 'soft' in method:
            self.pred = SoftImpute().complete(X)
def train_models(self, minibatch_size=32):
    """Fit ``self.tmodel`` on the stored experience memory.

    The memory is saved to ``memoryBW.npy``. When ``self.partial_obs_rate`` is
    positive, ``self.make_mem_partial_obs`` is applied to the array
    (presumably blanking entries in place -- confirm), the result is saved,
    its NaN proportion is printed, and the NaNs are filled with SoftImpute
    before training.

    :param minibatch_size: upper bound on the batch size given to ``fit``; it
        is capped at the number of memory rows.
    """
    memory_arr = np.array(self.memory)
    file = "memoryBW.npy"
    np.save(file, memory_arr)
    if self.partial_obs_rate > 0:
        self.make_mem_partial_obs(memory_arr)
        file1 = "memoryBWcorrupted.npy"
        np.save(file1, memory_arr)
        print("Memory size:")
        print(memory_arr.size)
        print("Proportion of missing values:")
        print(np.isnan(memory_arr).sum() / memory_arr.size)
        #memory_train = np.array([exp for exp in memory_arr if not np.isnan(exp[-self.state_size - 1:-1]).any()])
        #imputer = Imputer()
        #memory_final = imputer.fit_transform(memory_train)
        # Fill the missing entries and keep a copy of the imputed memory.
        memory_final = SoftImpute().complete(memory_arr)
        file2 = "memoryBWimputedSoft.npy"
        np.save(file2, memory_final)
    else:
        # Nothing was blanked out, so train on the memory as stored.
        memory_final = memory_arr
    if self.useRNN:
        batch_size = len(memory_final)
        minibatch_size = min(minibatch_size, batch_size)
        # Inputs and targets come from the project helper in the RNN case.
        t_x, t_y = self.setup_batch_for_RNN(memory_final)
        self.tmodel.fit(t_x,
                        t_y,
                        batch_size=minibatch_size,
                        epochs=self.net_train_epochs,
                        validation_split=0.1,
                        callbacks=self.Ttensorboard,
                        verbose=1)
    else:
        batch_size = len(memory_arr)
        minibatch_size = min(minibatch_size, batch_size)
        # batch = random.sample(list(memory_final), minibatch_size)
        # batch = np.array(batch)
        # batch = memory_arr
        # Inputs: the first state_size + action_size columns.
        t_x = memory_final[:, :self.state_size + self.action_size]
        # Targets: the state_size columns just before the last column.
        t_y = memory_final[:, -self.state_size - 1:-1]
        self.tmodel.fit(t_x,
                        t_y,
                        batch_size=minibatch_size,
                        epochs=self.net_train_epochs,
                        validation_split=0.1,
                        callbacks=self.Ttensorboard,
                        verbose=1)
    # NOTE(review): the triple quote below opens a string literal that is not
    # closed within the visible source; left untouched.
    '''
def get_imputer(imputer_name, **add_params):
    """Construct an imputer (or the BiScaler) by case-insensitive name.

    :param imputer_name: ``'knn'``, ``'nnm'``, ``'soft'``, ``'iterative'`` or
        ``'biscaler'``.
    :param add_params: keyword arguments forwarded to the constructor.
    :return: the new object, or ``None`` (after printing a hint) when the
        name is not recognised.
    """
    key = imputer_name.lower()
    if key == 'knn':
        return KNN(**add_params)
    if key == 'nnm':
        return NuclearNormMinimization(**add_params)
    if key == 'soft':
        return SoftImpute(**add_params)
    if key == 'iterative':
        return IterativeImputer(**add_params)
    if key == 'biscaler':
        return BiScaler(**add_params)
    print('Choose one of predefined imputers')
    return None
def fill_row(data):
    """Complete a row via BiScaler + SoftImpute after splitting it in two.

    NaNs in row 0 are first replaced, in the caller's array, by the matching
    ``averages["avg_<column>"]`` value. An all-NaN row is then appended and
    the odd-numbered columns of rows 0 and 1 are swapped. The stacked matrix
    is normalised and completed, and row 1 is removed from the result.

    :param data: 2-D float array; row 0 is modified in place.
    :return: the completed matrix without its row 1.
    """
    n_cols = data.shape[1]
    for col in range(n_cols):
        if np.isnan(data[0, col]):
            data[0, col] = averages["avg_" + str(col)]

    blank_row = np.nan * np.zeros((1, n_cols))
    stacked = np.concatenate((data, blank_row))

    # Exchange rows 0 and 1 on every odd column; the right-hand side is a
    # copy, so the swap does not read its own writes.
    stacked[[0, 1], 1::2] = stacked[[1, 0], 1::2]

    normalized = BiScaler(verbose=False).fit_transform(stacked)
    completed = SoftImpute(verbose=False).fit_transform(normalized)
    return np.delete(completed, 1, 0)
def clean_input(data):
    """Normalise a single row, or complete a multi-row matrix.

    Columns that are entirely NaN first get ``averages["avg_<column>"]``
    written into their row 0 (in the caller's array).

    * One row: return ``data`` divided by its norm, or ``data`` itself when
      the norm is 0.
    * Otherwise: return the BiScaler-normalised matrix completed by
      SoftImpute.

    :param data: 2-D float array; may be modified in place.
    :return: the normalised or completed matrix.
    """
    for col in range(data.shape[1]):
        if np.isnan(data[:, col]).all():
            data[0, col] = averages["avg_" + str(col)]

    if data.shape[0] == 1:
        magnitude = np.linalg.norm(data)
        return data if magnitude == 0 else data / magnitude

    normalized = BiScaler(verbose=False).fit_transform(data)
    return SoftImpute(verbose=False).fit_transform(normalized)
# Hide the entries selected by missing_mask so the imputers have to
# reconstruct them.
X_incomplete[missing_mask] = np.nan

# Baseline: SimpleFill configured with "mean".
meanFill = SimpleFill("mean")
X_filled_mean = meanFill.complete(X_incomplete)

# Use the 3 nearest rows which have a feature to fill in each row's
# missing features.
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)

# Matrix completion using convex optimization to find a low-rank solution
# that still matches the observed values. Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding.
softImpute = SoftImpute()

# Simultaneously normalizes the rows and columns of the observed data;
# sometimes useful for low-rank imputation methods.
biscaler = BiScaler()

# Rescale both rows and columns to have zero mean and unit variance.
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

# Complete the normalized matrix, then map the result back to the original
# scale.
X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)

# For comparison: the same solver applied without the BiScaler step.
X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete)

# Mean squared error of the mean-fill baseline on the hidden entries only.
meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean()
print("meanFill MSE: %f" % meanfill_mse)