def __init__(self, images_dict, percent_missing=0.25, saved_image_stride=25,
             dirname="face_images", scale_rows=False, center_rows=False):
    """
    Prepare complete, incomplete and normalized image matrices.

    images_dict maps a label to an image array. Images are stacked in
    sorted-label order, pixels are removed by `remove_pixels`, and the
    flattened incomplete matrix is normalized with a `BiScaler`. The
    original and incomplete images are then written out via `save_images`.

    percent_missing     -- fraction of the pixel count used to size the
                           square handed to `remove_pixels`
    saved_image_stride  -- every stride-th image is saved to disk
    dirname             -- directory the images are saved under
    scale_rows, center_rows -- forwarded to `BiScaler`
    """
    # Plain bookkeeping attributes first; none of them depends on the data.
    self.images_dict = images_dict
    self.dirname = dirname
    self.saved_images = defaultdict(dict)
    self.mse_dict = {}
    self.mae_dict = {}

    # Stack images in label order so that row i belongs to labels[i].
    self.labels = sorted(images_dict.keys())
    stacked = np.array([images_dict[label] for label in self.labels])
    self.images_array = stacked.astype("float32")

    self.image_shape = self.images_array[0].shape
    self.width, self.height = self.image_shape[:2]
    # An image counts as color only when it has exactly three channels.
    self.color = len(self.image_shape) == 3 and self.image_shape[2] == 3
    if self.color:
        self.images_array = color_balance(self.images_array)

    self.n_images = len(self.images_array)
    self.n_pixels = self.width * self.height
    channels = 3 if self.color else 1
    self.n_features = self.n_pixels * channels
    print(
        "[ResultsTable] # images = %d, color=%s # features = %d, shape = %s"
        % (self.n_images, self.color, self.n_features, self.image_shape))

    # One row per image, one column per pixel/channel value.
    self.flattened_array_shape = (self.n_images, self.n_features)
    self.flattened_images = self.images_array.reshape(
        self.flattened_array_shape)

    # Side length of a square whose area is about percent_missing of the image.
    n_missing_pixels = int(self.n_pixels * percent_missing)
    missing_square_size = int(np.sqrt(n_missing_pixels))
    print(
        "[ResultsTable] n_missing_pixels = %d, missing_square_size = %d"
        % (n_missing_pixels, missing_square_size))

    self.incomplete_images = remove_pixels(
        self.images_array, missing_square_size=missing_square_size)
    print("[ResultsTable] Incomplete images shape = %s"
          % (self.incomplete_images.shape, ))
    self.flattened_incomplete_images = self.incomplete_images.reshape(
        self.flattened_array_shape)
    # Missing entries are the NaNs of the incomplete matrix.
    self.missing_mask = np.isnan(self.flattened_incomplete_images)

    lowest = self.images_array.min()
    highest = self.images_array.max()
    self.normalizer = BiScaler(
        scale_rows=scale_rows,
        center_rows=center_rows,
        min_value=lowest,
        max_value=highest)
    self.incomplete_normalized = self.normalizer.fit_transform(
        self.flattened_incomplete_images)

    self.saved_image_indices = list(
        range(0, self.n_images, saved_image_stride))

    # Reference images every later entry is compared against.
    for batch, tag in ((self.images_array, "original"),
                       (self.incomplete_images, "incomplete")):
        self.save_images(batch, tag, flattened=False)
def complex_imputation(df, method='mice', neighbors=3):
    """
    Impute the missing entries of a dataframe with a fancyimpute solver.

    Inputs:
    df -- dataframe of incomplete data
    method -- method of imputation
        - 'knn': Imputes using K Nearest Neighbors of completed rows
        - 'soft_impute': Imputes using iterative soft thresholding of SVD
          decompositions (run on BiScaler-normalized data, then mapped back
          to the original scale)
        - 'mice': Imputes using Multiple Imputation by Chained Equations
        - 'nuclear_nm': Imputation using Exact Matrix Completion via Convex
          Optimization
        - 'matrix_factorization': Imputes by factorization of matrix in
          low-rank U and V with L1 sparsity on U elements and L2 sparsity
          on V elements
        - 'iterative_svd': Imputes based on iterative low-rank SVD
          decomposition
    neighbors -- parameter for KNN imputation

    Output:
    Whatever `fill_values(df, X_complete)` returns for the completed matrix.

    Raises:
    ValueError -- if `method` is not one of the names listed above.
    """
    # Solvers are wrapped in lambdas so that only the requested one is built.
    solver_factories = {
        'knn': lambda: KNN(neighbors),
        'soft_impute': lambda: SoftImpute(),
        'mice': lambda: MICE(),
        'nuclear_nm': lambda: NuclearNormMinimization(),
        'matrix_factorization': lambda: MatrixFactorization(),
        'iterative_svd': lambda: IterativeSVD(),
    }
    if method not in solver_factories:
        # Previously an unknown name fell through and silently returned None.
        raise ValueError(
            "Unknown imputation method %r; expected one of %s"
            % (method, sorted(solver_factories)))

    # Create matrix of features
    X_incomplete = df.values
    solver = solver_factories[method]()

    if method == 'soft_impute':
        # The inverse transform must come from the SAME scaler instance that
        # was fitted on the data; a fresh BiScaler() has no learned
        # means/scales to undo.
        scaler = BiScaler()
        X_incomplete_normalized = scaler.fit_transform(X_incomplete)
        X_complete_normalized = solver.complete(X_incomplete_normalized)
        X_complete = scaler.inverse_transform(X_complete_normalized)
    else:
        X_complete = solver.complete(X_incomplete)

    return fill_values(df, X_complete)
def __init__(
        self,
        images_dict,
        percent_missing=0.25,
        saved_image_stride=25,
        dirname="face_images",
        scale_rows=False,
        center_rows=False):
    """
    Set up the experiment state from a {label: image} dictionary.

    The images are stacked (sorted by label) into a float32 array, a copy
    with removed pixels is produced by `remove_pixels`, both are flattened
    to one row per image, and the incomplete matrix is normalized with a
    `BiScaler`. Finally the "original" and "incomplete" images are saved.
    """
    # ---- complete images -------------------------------------------------
    labels = sorted(images_dict.keys())
    images = np.array([images_dict[name] for name in labels]).astype("float32")
    shape = images[0].shape
    width, height = shape[:2]
    is_color = (len(shape) == 3) and (shape[2] == 3)
    if is_color:
        images = color_balance(images)

    n_images = len(images)
    n_pixels = width * height
    n_features = n_pixels * (3 if is_color else 1)

    self.images_dict = images_dict
    self.labels = labels
    self.images_array = images
    self.image_shape = shape
    self.width, self.height = width, height
    self.color = is_color
    self.n_pixels = n_pixels
    self.n_features = n_features
    self.n_images = n_images
    print("[ResultsTable] # images = %d, color=%s # features = %d, shape = %s" % (
        n_images, is_color, n_features, shape))

    flat_shape = (n_images, n_features)
    self.flattened_array_shape = flat_shape
    self.flattened_images = images.reshape(flat_shape)

    # ---- incomplete images -----------------------------------------------
    # The removed region is a square covering roughly percent_missing of
    # the pixels.
    n_missing_pixels = int(n_pixels * percent_missing)
    missing_square_size = int(np.sqrt(n_missing_pixels))
    print("[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" % (
        n_missing_pixels, missing_square_size))

    incomplete = remove_pixels(images, missing_square_size=missing_square_size)
    self.incomplete_images = incomplete
    print("[ResultsTable] Incomplete images shape = %s" % (incomplete.shape,))

    flat_incomplete = incomplete.reshape(flat_shape)
    self.flattened_incomplete_images = flat_incomplete
    self.missing_mask = np.isnan(flat_incomplete)

    # ---- normalization ---------------------------------------------------
    self.normalizer = BiScaler(
        scale_rows=scale_rows,
        center_rows=center_rows,
        min_value=images.min(),
        max_value=images.max())
    self.incomplete_normalized = self.normalizer.fit_transform(flat_incomplete)

    # ---- bookkeeping for saved images and error metrics --------------------
    self.saved_image_indices = list(range(0, n_images, saved_image_stride))
    self.saved_images = defaultdict(dict)
    self.dirname = dirname
    self.mse_dict = {}
    self.mae_dict = {}

    self.save_images(images, "original", flattened=False)
    self.save_images(incomplete, "incomplete", flattened=False)
def impute(self, trained_model, input):
    """
    Fill the missing entries of a table with SoftImpute.

    The table is normalized with a BiScaler, completed with SoftImpute and
    then mapped back through the same scaler.

    :param trained_model: trained model returned by train function -
        accepted for interface compatibility, not used here
    :param input: input table which needs to be imputed
    :return: imputed table, as returned by the scaler's inverse transform
    """
    solver = SoftImpute()
    scaler = BiScaler()

    normalized = scaler.fit_transform(input)
    completed_normalized = solver.fit_transform(normalized)
    return scaler.inverse_transform(completed_normalized)
def get_imputer(imputer_name, **add_params):
    """
    Build an imputer from its short, case-insensitive name.

    Recognised names are 'knn', 'nnm', 'soft', 'iterative' and 'biscaler';
    `add_params` are forwarded to the chosen constructor. For any other
    name a hint is printed and None is returned.
    """
    # Lambdas defer both the name lookup and the construction until a
    # matching key has actually been found.
    builders = {
        'knn': lambda: KNN(**add_params),
        'nnm': lambda: NuclearNormMinimization(**add_params),
        'soft': lambda: SoftImpute(**add_params),
        'iterative': lambda: IterativeImputer(**add_params),
        'biscaler': lambda: BiScaler(**add_params),
    }
    imputer_name = imputer_name.lower()
    build = builders.get(imputer_name)
    if build is None:
        print('Choose one of predefined imputers')
        return None
    return build()
def clean_input(data):
    """
    Make `data` free of missing values.

    Columns that are entirely NaN get the module-level average
    (`averages["avg_<column index>"]`) written into their first row, in
    place. A single-row matrix is then returned divided by its norm (or
    unchanged when the norm is zero); anything larger is BiScaler-normalized
    and completed with SoftImpute.
    """
    for col in range(data.shape[1]):
        # Seed fully-empty columns with one known value.
        if np.isnan(data[:, col]).all():
            data[0, col] = averages["avg_" + str(col)]

    if data.shape[0] != 1:
        normalized = BiScaler(verbose=False).fit_transform(data)
        return SoftImpute(verbose=False).fit_transform(normalized)

    # Single row: plain norm scaling, guarding against division by zero.
    magnitude = np.linalg.norm(data)
    if magnitude == 0:
        return data
    return data / magnitude
def fill_row(data):
    """
    Complete a matrix after padding it with one extra row.

    NaNs in the first row are replaced in place by the module-level
    averages (`averages["avg_<column index>"]`). An all-NaN row is appended,
    the odd-numbered columns of rows 0 and 1 are exchanged, and the result
    is BiScaler-normalized and completed with SoftImpute. Row 1 of the
    completed matrix is dropped before returning.
    """
    n_cols = data.shape[1]
    for col in range(n_cols):
        if np.isnan(data[0, col]):
            data[0, col] = averages["avg_" + str(col)]

    # Append one row that consists solely of NaNs.
    padding = np.zeros((1, n_cols))
    padding[:] = np.nan
    stacked = np.concatenate((data, padding))

    # Exchange rows 0 and 1 in every odd-numbered column.
    odd_cols = slice(1, None, 2)
    first_row_odd = stacked[0, odd_cols].copy()
    stacked[0, odd_cols] = stacked[1, odd_cols]
    stacked[1, odd_cols] = first_row_odd

    normalized = BiScaler(verbose=False).fit_transform(stacked)
    completed = SoftImpute(verbose=False).fit_transform(normalized)
    return np.delete(completed, 1, 0)
100.0 * observed_mask.sum() / X.size)) if args.save_incomplete_affinity_matrix: print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix) df = pd.DataFrame(X, columns=allele_list, index=peptide_list) df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide") scores = ScoreSet() kfold = stratified_cross_validation( X=X, observed_mask=observed_mask, n_folds=args.n_folds) for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold): X_fold_reduced = X_fold[ok_mesh] biscaler = BiScaler( scale_rows=args.normalize_rows, center_rows=args.normalize_rows, scale_columns=args.normalize_columns, center_columns=args.normalize_columns) X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced) for (method_name, solver) in sorted(imputation_methods.items()): print("CV fold %d/%d, running %s" % ( fold_idx + 1, args.n_folds, method_name)) X_completed_reduced_scaled = solver.complete(X_fold_reduced) X_completed_reduced = biscaler.inverse_transform( X_completed_reduced_scaled) X_completed = np.zeros_like(X) X_completed[ok_mesh] = X_completed_reduced y_pred = X_completed[test_coords] mae, tau, auc, f1_score = evaluate_predictions(
class ResultsTable(object):
    """
    Runs imputation solvers on images with removed pixels and ranks them.

    The constructor builds the complete and incomplete image matrices,
    `add_entry` runs one solver and records its masked MSE/MAE, and the
    remaining methods save images and print or export the ranking.
    """

    def __init__(self,
                 images_dict,
                 percent_missing=0.25,
                 saved_image_stride=25,
                 dirname="face_images",
                 scale_rows=False,
                 center_rows=False):
        """
        images_dict         -- maps a label to an image array
        percent_missing     -- fraction of the pixel count used to size the
                               square handed to `remove_pixels`
        saved_image_stride  -- every stride-th image is saved to disk
        dirname             -- directory all output is written under
        scale_rows, center_rows -- forwarded to `BiScaler`
        """
        self.images_dict = images_dict
        # Sorted labels fix the row order of every matrix below.
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array(
            [images_dict[k] for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        # Color means exactly three channels.
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2] == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print(
            "[ResultsTable] # images = %d, color=%s # features = %d, shape = %s"
            % (self.n_images, self.color, self.n_features, self.image_shape))

        # One row per image, one column per pixel/channel value.
        self.flattened_array_shape = (self.n_images, self.n_features)
        self.flattened_images = self.images_array.reshape(
            self.flattened_array_shape)

        # Side length of a square covering ~percent_missing of the pixels.
        n_missing_pixels = int(self.n_pixels * percent_missing)
        missing_square_size = int(np.sqrt(n_missing_pixels))
        print(
            "[ResultsTable] n_missing_pixels = %d, missing_square_size = %d"
            % (n_missing_pixels, missing_square_size))

        self.incomplete_images = remove_pixels(
            self.images_array, missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" %
              (self.incomplete_images.shape, ))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        # Missing entries are the NaNs of the incomplete matrix.
        self.missing_mask = np.isnan(self.flattened_incomplete_images)

        self.normalizer = BiScaler(scale_rows=scale_rows,
                                   center_rows=center_rows,
                                   min_value=self.images_array.min(),
                                   max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)

        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}

        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images,
                         "incomplete",
                         flattened=False)

    def ensure_dir(self, dirname):
        """Create `dirname` (a single level) if it does not exist yet."""
        if not exists(dirname):
            print("Creating directory: %s" % dirname)
            mkdir(dirname)

    def save_images(self, images, base_filename, flattened=True):
        """
        Save every image listed in `saved_image_indices` as a PNG.

        Each image goes to <dirname>/<label>/<base_filename>.png and the
        path is recorded in `saved_images[i][base_filename]`. Rows of
        `images` are reshaped to `image_shape` when `flattened` is true.
        NaNs are drawn as 0.
        """
        self.ensure_dir(self.dirname)
        for i in self.saved_image_indices:
            label = self.labels[i].lower().replace(" ", "_")
            # Work on a copy: the pixel fixes below must not leak back.
            image = images[i, :].copy()
            if flattened:
                image = image.reshape(self.image_shape)
            image[np.isnan(image)] = 0
            figure = pylab.gcf()
            axes = pylab.gca()
            extra_kwargs = {}
            # A colormap only applies to single-channel data (imshow ignores
            # it for RGB input), so request gray for the non-color case.
            if not self.color:
                extra_kwargs["cmap"] = "gray"
            assert image.min() >= 0, "Image can't contain negative numbers"
            # Values in [0, 1] are rescaled to the 8-bit range, then clipped.
            if image.max() <= 1:
                image *= 256
            image[image > 255] = 255
            axes.imshow(image.astype("uint8"), **extra_kwargs)
            axes.get_xaxis().set_visible(False)
            axes.get_yaxis().set_visible(False)
            filename = base_filename + ".png"
            subdir = join(self.dirname, label)
            self.ensure_dir(subdir)
            path = join(subdir, filename)
            figure.savefig(path, bbox_inches='tight')
            self.saved_images[i][base_filename] = path

    def add_entry(self, solver, name):
        """
        Run `solver` on the normalized incomplete data and record its errors.

        The completion is mapped back through the fitted normalizer, scored
        with masked MAE/MSE on the missing entries only, and the completed
        images are saved under `name`.
        """
        print("Running %s" % name)
        completed_normalized = solver.fit_transform(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)
        mae = masked_mae(X_true=self.flattened_images,
                         X_pred=completed,
                         mask=self.missing_mask)
        mse = masked_mse(X_true=self.flattened_images,
                         X_pred=completed,
                         mask=self.missing_mask)
        print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
        self.mse_dict[name] = mse
        self.mae_dict[name] = mae
        self.save_images(completed, base_filename=name)

    def sorted_errors(self):
        """
        Generator for (rank, name, MSE, MAE) sorted by increasing MAE
        """
        ranked = sorted(self.mae_dict.items(), key=lambda x: x[1])
        for i, (name, mae) in enumerate(ranked):
            yield (i + 1, name, self.mse_dict[name], mae)

    def print_sorted_errors(self):
        """Print one line per solver, best (lowest MAE) first."""
        for (rank, name, mse, mae) in self.sorted_errors():
            print("%d) %s: MSE=%0.4f MAE=%0.4f" % (rank, name, mse, mae))

    def save_html_table(self, filename="results_table.html"):
        """
        Write the ranking as an HTML table to <dirname>/<filename>.

        Returns the HTML string that was written.
        """
        # Header cells are <th> elements inside a <tr>; nesting <td> inside a
        # bare <th> is not valid table markup.
        parts = ["""
            <table>
            <tr>
                <th>Rank</th>
                <th>Name</th>
                <th>Mean Squared Error</th>
                <th>Mean Absolute Error</th>
            </tr>
        """]
        for (rank, name, mse, mae) in self.sorted_errors():
            parts.append("""
            <tr>
                <td>%d</td>
                <td>%s</td>
                <td>%0.4f</td>
                <td>%0.4f</td>
            </tr>
            """ % (rank, name, mse, mae))
        parts.append("</table>")
        html = "".join(parts)
        self.ensure_dir(self.dirname)
        path = join(self.dirname, filename)
        with open(path, "w") as f:
            f.write(html)
        return html
'random_state': 71, 'silent': -1, 'verbose': -1, 'n_jobs': -1, } fit_params = { 'eval_metric': 'auc', 'early_stopping_rounds': 150, 'verbose': 100 } with timer('impute missing'): df = pd.concat([X_train, X_test], axis=0) df = df.loc[:, df.isnull().sum() != len(df)] cols = [f for f in df.columns if df[f].dtype != 'object'] bi = BiScaler() df[cols] = bi.fit_transform(df[cols].values) df.fillna(-9999, inplace=True) X_train = df[:len(X_train)].copy() X_test = df[len(X_train):].copy() del bi, df, cols gc.collect() with timer('training'): cv_results = [] val_series = y_train.copy() test_df = pd.DataFrame() feat_df = None for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)): X_trn = X_train.iloc[trn_idx].copy() y_trn = y_train[trn_idx]
# Use 3 nearest rows which have a feature to fill in each row's missing features knnImpute = KNN(k=3) X_filled_knn = knnImpute.fit_transform(X_incomplete) # matrix completion using convex optimization to find low-rank solution # that still matches observed values. Slow! X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete) # Instead of solving the nuclear norm objective directly, instead # induce sparsity using singular value thresholding softImpute = SoftImpute() # simultaneously normalizes the rows and columns of your observed data, # sometimes useful for low-rank imputation methods biscaler = BiScaler() # rescale both rows and columns to have zero mean and unit variance X_incomplete_normalized = biscaler.fit_transform(X_incomplete) X_filled_softimpute_normalized = softImpute.fit_transform( X_incomplete_normalized) X_filled_softimpute = biscaler.inverse_transform( X_filled_softimpute_normalized) X_filled_softimpute_no_biscale = softImpute.fit_transform(X_incomplete) meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask])**2).mean() print("meanFill MSE: %f" % meanfill_mse) # print mean squared error for the imputation methods above
# Each method is evaluated best-effort: a failing solver records NaN for this
# missing rate instead of aborting the whole comparison. Only the computation
# sits inside `try`; the bookkeeping runs in `else` so one attempt can never
# append twice.

# --- MICE ---------------------------------------------------------------
try:
    imputedData = mice.MICE().complete(missData)
    score = evaluate.RMSE(originData, imputedData)
except Exception:
    logger.exception("MICE failed at missing rate: %s", i)
    mice_rmse.append(np.nan)
else:
    mice_rmse.append(score)
    logger.info("MICE missing rate:{},RMSE:{}".format(i, score))

# --- EM -----------------------------------------------------------------
try:
    imputedData = EM().complete(missData)
    score = evaluate.RMSE(originData, imputedData)
except Exception:
    logger.exception("EM failed at missing rate: %s", i)
    em_rmse.append(np.nan)
else:
    em_rmse.append(score)
    logger.info("EM missing rate:{},RMSE:{}".format(i, score))

# --- BiScaler + SoftImpute ------------------------------------------------
try:
    biscaler = BiScaler()
    normalizedData = biscaler.fit_transform(missData)
    imputedNormalized = SoftImpute().fit_transform(normalizedData)
    # Map the completion back to the original scale before scoring;
    # otherwise the RMSE compares normalized values against raw data.
    imputedData = biscaler.inverse_transform(imputedNormalized)
    score = evaluate.RMSE(originData, imputedData)
except Exception:
    logger.exception("fi BiScaler failed at missing rate: %s", i)
    fi_bs_rmse.append(np.nan)
else:
    fi_bs_rmse.append(score)
    logger.info("fi BiScaler missing rate:{},RMSE:{}".format(i, score))

# --- SoftImpute on the raw data -------------------------------------------
try:
    imputedData = SoftImpute().fit_transform(missData)
    score = evaluate.RMSE(originData, imputedData)
except Exception:
    logger.exception("fi SoftImpute failed at missing rate: %s", i)
    fi_si_rmse.append(np.nan)
else:
    fi_si_rmse.append(score)
    logger.info("fi SoftImpute missing rate:{},RMSE:{}".format(i, score))
# Seed both the standard-library and the NumPy generators so the run is
# reproducible whichever of the two the downstream code draws from.
random.seed(123)
np.random.seed(123)

# Read in the tab-separated data (first row = header, first column = index)
# and transpose it.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0 (successor: on_bad_lines='skip'); left unchanged so the script keeps
# working on the older pandas it was written for.
data = pd.read_csv(input_file,
                   sep='\t',
                   header=0,
                   index_col=0,
                   error_bad_lines=False)
new_data = data.copy()
transposed = new_data.T

# BiScaler/SoftImpute operate on a plain matrix rather than a DataFrame.
print("SoftImpute...")
# `.values` replaces DataFrame.as_matrix(), which newer pandas releases no
# longer provide; it returns the same ndarray on old and new versions.
transposed_mat = transposed.values
biscaler = BiScaler()

# perform the scaling appropriate for this imputation strategy
transposed_normalized = biscaler.fit_transform(transposed_mat)

# the imputation itself
imputed_softimpute = SoftImpute().fit_transform(transposed_normalized)

# We don't want the transformed values, so undo the scaling with the same
# fitted scaler, then transpose back to the layout of the input file.
inverse_softimpute = biscaler.inverse_transform(imputed_softimpute)
untransposed_softimpute = inverse_softimpute.transpose()

# Back to a DataFrame carrying the original row and column labels,
# ready to be written to file.
softimpute_df = pd.DataFrame(untransposed_softimpute)
softimpute_df.index = data.index
softimpute_df.columns = data.columns.values
# Use 3 nearest rows which have a feature to fill in each row's missing features knnImpute = KNN(k=3) X_filled_knn = knnImpute.complete(X_incomplete) # matrix completion using convex optimization to find low-rank solution # that still matches observed values. Slow! X_filled_nnm = NuclearNormMinimization().complete(X_incomplete) # Instead of solving the nuclear norm objective directly, instead # induce sparsity using singular value thresholding softImpute = SoftImpute() # simultaneously normalizes the rows and columns of your observed data, # sometimes useful for low-rank imputation methods biscaler = BiScaler() # rescale both rows and columns to have zero mean and unit variance X_incomplete_normalized = biscaler.fit_transform(X_incomplete) X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized) X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized) X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete) meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean() print("meanFill MSE: %f" % meanfill_mse) # print mean squared error for the three imputation methods above nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean() print("Nuclear norm minimization MSE: %f" % nnm_mse)
min_improvement=1e-6).fit_transform(X_incomplete) # matrix completion using Mean Fill X_filled_meanfill = SimpleFill(fill_method='mean').fit_transform(X_incomplete) # matrix completion using Median Fill X_filled_medianfill = SimpleFill(fill_method='median').fit_transform(X_incomplete) # matrix completion using Zero Fill X_filled_zerofill = SimpleFill(fill_method='zero').fit_transform(X_incomplete) # matrix completion using Min Fill X_filled_minfill = SimpleFill(fill_method='min').fit_transform(X_incomplete) # matrix completion using Sampled Fill X_filled_randomfill = SimpleFill(fill_method='random').fit_transform(X_incomplete) # Instead of solving the nuclear norm objective directly, instead # induce sparsity using singular value thresholding X_incomplete_normalized = BiScaler().fit_transform(X_incomplete) X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized) # print mean squared error for the imputation methods above mice_mse = ((X_filled_mice[missing_mask] - X[missing_mask]) ** 2).mean() print("MICE MSE: %f" % mice_mse) svd_mse = ((X_filled_svd[missing_mask] - X[missing_mask]) ** 2).mean() print("SVD MSE: %f" % svd_mse) mf_mse = ((X_filled_mf[missing_mask] - X[missing_mask]) ** 2).mean() print("Matrix Factorization MSE: %f" % mf_mse) meanfill_mse = ((X_filled_meanfill[missing_mask] - X[missing_mask]) ** 2).mean() print("MeanImpute MSE: %f" % meanfill_mse)
def main(
        input_path: str = '/project/lindner/air-pollution/level3_data/',
        input_prefix: str = "Data_",
        input_suffix: str = "",
        output_path: str = '/project/lindner/air-pollution/current/2019/data-formatted/houston',
        year_begin: int = 2000,
        year_end: int = 2018,
        fillgps: bool = False,
        naninvalid: bool = False,
        dropnan: bool = False,
        masknan: float = None,
        fillnan: float = None,
        aqsnumerical: bool = False,
        houston: bool = False,
        chunksize: int = 200000):
    """
    Train a small dense network to predict the 'o3' column of one site.

    Two fixed CSV files are loaded, their features are combined and
    imputed with BiScaler + SoftImpute, and a three-hidden-layer dense
    regressor is trained on a 75/25 split. The loss plot and the trained
    model are written under `output_path`.

    Only `output_path` is read by this function; the remaining parameters
    are accepted but unused here and are kept so existing callers keep
    working.
    """
    import os  # function-scope import: only needed to build output paths

    data1 = pd.read_csv(
        "/project/lindner/air-pollution/current/2019/data-formatted/concat_aqs/Transformed_Data_48_201_0695.csv"
    )
    data2 = pd.read_csv(
        "/project/lindner/air-pollution/current/2019/data-formatted/concat_aqs/Transformed_Data_48_201_0416.csv"
    )

    # Goal is to impute Park Place o3 from all other features
    y = data2['o3']

    data1 = data1.add_prefix('MoodyTowers_')
    data2 = data2.drop(['o3'], axis='columns').add_prefix('ParkPlace_')

    # Because of unneeded columns leftover from faulty script: replace the
    # site-id strings with numeric codes.
    data1X = data1.replace('48_201_0695', 0)
    data2X = data2.replace('48_201_0416', 1)

    # NOTE(review): this stacks the two sites row-wise although their column
    # names are disjoint after prefixing, so X presumably ends up with more
    # rows than y - confirm the intended axis.
    X = pd.concat([data1X, data2X], ignore_index=True)
    X = X.dropna(how='all', axis='columns')

    X, y = np.array(X), np.array(y)

    X = BiScaler().fit_transform(X)
    X = SoftImpute().fit_transform(X)
    # NOTE(review): y is one-dimensional here; confirm that BiScaler and
    # SoftImpute accept 1-D input.
    y = BiScaler().fit_transform(y)
    y = SoftImpute().fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=0)

    # Initialising the network (fully-connected layers only)
    regressor = Sequential()

    # Three identical hidden blocks: Dense(25, relu) followed by 20% dropout.
    for _ in range(3):
        regressor.add(
            Dense(25,
                  input_dim=21,
                  activation='relu',
                  kernel_initializer='he_uniform'))
        regressor.add(Dropout(0.2))

    # Adding the output layer
    regressor.add(Dense(units=1))

    # Compiling the network
    regressor.compile(optimizer='adam', loss='mean_squared_error')

    # Fitting to the training set. The network is bound to `regressor`;
    # no `model` name exists in this function.
    history = regressor.fit(X_train,
                            y_train,
                            validation_data=(X_test, y_test),
                            epochs=100,
                            verbose=1)

    # evaluate the model
    train_mse = regressor.evaluate(X_train, y_train, verbose=0)
    test_mse = regressor.evaluate(X_test, y_test, verbose=0)
    print('Train: %.3f, Test: %.3f' % (train_mse, test_mse))

    # plot loss during training
    pyplot.title('Loss / Mean Squared Error')
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='test')
    pyplot.legend()
    # Save before show(): once the window has been shown and closed the
    # current figure may be empty, which would write a blank image.
    # os.path.join keeps the files inside output_path whether or not it
    # ends with a separator.
    pyplot.savefig(os.path.join(output_path, "MSE_of_LSTM_model.png"))
    pyplot.show()

    regressor.save(os.path.join(output_path, "model.h5"))
class ResultsTable(object):
    """
    Runs imputation solvers on images with removed pixels and ranks them.

    The constructor builds the complete and incomplete image matrices,
    `add_entry` runs one solver and records its masked MSE/MAE, and the
    remaining methods save images and print or export the ranking.
    """

    def __init__(
            self,
            images_dict,
            percent_missing=0.25,
            saved_image_stride=25,
            dirname="face_images",
            scale_rows=False,
            center_rows=False):
        """
        images_dict         -- maps a label to an image array
        percent_missing     -- fraction of the pixel count used to size the
                               square handed to `remove_pixels`
        saved_image_stride  -- every stride-th image is saved to disk
        dirname             -- directory all output is written under
        scale_rows, center_rows -- forwarded to `BiScaler`
        """
        self.images_dict = images_dict
        # Sorted labels fix the row order of every matrix below.
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array(
            [images_dict[k] for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        # Color means exactly three channels.
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2] == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print("[ResultsTable] # images = %d, color=%s # features = %d, shape = %s" % (
            self.n_images, self.color, self.n_features, self.image_shape))

        # One row per image, one column per pixel/channel value.
        self.flattened_array_shape = (self.n_images, self.n_features)
        self.flattened_images = self.images_array.reshape(self.flattened_array_shape)

        # Side length of a square covering ~percent_missing of the pixels.
        n_missing_pixels = int(self.n_pixels * percent_missing)
        missing_square_size = int(np.sqrt(n_missing_pixels))
        print("[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" % (
            n_missing_pixels, missing_square_size))

        self.incomplete_images = remove_pixels(
            self.images_array,
            missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" % (
            self.incomplete_images.shape,))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        # Missing entries are the NaNs of the incomplete matrix.
        self.missing_mask = np.isnan(self.flattened_incomplete_images)

        self.normalizer = BiScaler(
            scale_rows=scale_rows,
            center_rows=center_rows,
            min_value=self.images_array.min(),
            max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)

        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}

        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images, "incomplete", flattened=False)

    def ensure_dir(self, dirname):
        """Create `dirname` (a single level) if it does not exist yet."""
        if not exists(dirname):
            print("Creating directory: %s" % dirname)
            mkdir(dirname)

    def save_images(self, images, base_filename, flattened=True):
        """
        Save every image listed in `saved_image_indices` as a PNG.

        Each image goes to <dirname>/<label>/<base_filename>.png and the
        path is recorded in `saved_images[i][base_filename]`. Rows of
        `images` are reshaped to `image_shape` when `flattened` is true.
        NaNs are drawn as 0.
        """
        self.ensure_dir(self.dirname)
        for i in self.saved_image_indices:
            label = self.labels[i].lower().replace(" ", "_")
            # Work on a copy: the pixel fixes below must not leak back.
            image = images[i, :].copy()
            if flattened:
                image = image.reshape(self.image_shape)
            image[np.isnan(image)] = 0
            figure = pylab.gcf()
            axes = pylab.gca()
            extra_kwargs = {}
            # A colormap only applies to single-channel data (imshow ignores
            # it for RGB input), so request gray for the non-color case.
            if not self.color:
                extra_kwargs["cmap"] = "gray"
            assert image.min() >= 0, "Image can't contain negative numbers"
            # Values in [0, 1] are rescaled to the 8-bit range, then clipped.
            if image.max() <= 1:
                image *= 256
            image[image > 255] = 255
            axes.imshow(image.astype("uint8"), **extra_kwargs)
            axes.get_xaxis().set_visible(False)
            axes.get_yaxis().set_visible(False)
            filename = base_filename + ".png"
            subdir = join(self.dirname, label)
            self.ensure_dir(subdir)
            path = join(subdir, filename)
            figure.savefig(
                path,
                bbox_inches='tight')
            self.saved_images[i][base_filename] = path

    def add_entry(self, solver, name):
        """
        Run `solver` on the normalized incomplete data and record its errors.

        The completion is mapped back through the fitted normalizer, scored
        with masked MAE/MSE on the missing entries only, and the completed
        images are saved under `name`.
        """
        print("Running %s" % name)
        completed_normalized = solver.complete(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)
        mae = masked_mae(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        mse = masked_mse(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
        self.mse_dict[name] = mse
        self.mae_dict[name] = mae
        self.save_images(completed, base_filename=name)

    def sorted_errors(self):
        """
        Generator for (rank, name, MSE, MAE) sorted by increasing MAE
        """
        ranked = sorted(self.mae_dict.items(), key=lambda x: x[1])
        for i, (name, mae) in enumerate(ranked):
            yield (i + 1, name, self.mse_dict[name], mae)

    def print_sorted_errors(self):
        """Print one line per solver, best (lowest MAE) first."""
        for (rank, name, mse, mae) in self.sorted_errors():
            print("%d) %s: MSE=%0.4f MAE=%0.4f" % (rank, name, mse, mae))

    def save_html_table(self, filename="results_table.html"):
        """
        Write the ranking as an HTML table to <dirname>/<filename>.

        Returns the HTML string that was written.
        """
        # Header cells are <th> elements inside a <tr>; nesting <td> inside a
        # bare <th> is not valid table markup.
        parts = ["""
            <table>
            <tr>
                <th>Rank</th>
                <th>Name</th>
                <th>Mean Squared Error</th>
                <th>Mean Absolute Error</th>
            </tr>
        """]
        for (rank, name, mse, mae) in self.sorted_errors():
            parts.append("""
            <tr>
                <td>%d</td>
                <td>%s</td>
                <td>%0.4f</td>
                <td>%0.4f</td>
            </tr>
            """ % (rank, name, mse, mae))
        parts.append("</table>")
        html = "".join(parts)
        self.ensure_dir(self.dirname)
        path = join(self.dirname, filename)
        with open(path, "w") as f:
            f.write(html)
        return html
def impute(data, method='mean', value=None, nan_value=np.nan):
    """
    Impute missing values on a numpy ndarray in a column-wise manner.

    ANTsR function: `antsrimpute`

    Arguments
    ---------
    data : numpy.ndarray
        two-dimensional data to impute; it is copied, never modified

    method : string or scalar
        type of imputation method to use. A number fills every missing
        entry with that number.
        Options:
            mean
            median
            constant
            KNN
            BiScaler
            NuclearNormMinimization
            SoftImpute
            IterativeSVD

    value : scalar (optional)
        optional arguments for different methods
        if method == 'constant'
            constant value
        if method == 'KNN'
            number of nearest neighbors to use (default 3)
        if method == 'IterativeSVD'
            rank to use (default min(10, n_rows - 2))

    nan_value : scalar
        value which is interpreted as a missing value. With the default
        (NaN) only NaN entries count as missing; for any other value the
        data is converted to float and matching entries are set to NaN
        before imputing.

    Returns
    -------
    the imputed copy of `data`

    Raises
    ------
    ValueError
        if `method` is not understood, if a fancyimpute method is requested
        without fancyimpute installed, or if method == 'constant' and no
        `value` is given

    Example
    -------
    >>> import ants
    >>> import numpy as np
    >>> data = np.random.randn(4,10)
    >>> data[2,3] = np.nan
    >>> data[3,5] = np.nan
    >>> data_imputed = ants.impute(data, 'mean')

    Details
    -------
    KNN: Nearest neighbor imputations which weights samples using the mean
        squared difference on features for which two rows both have
        observed data.

    SoftImpute: Matrix completion by iterative soft thresholding of SVD
        decompositions. Inspired by the softImpute package for R, which is
        based on Spectral Regularization Algorithms for Learning Large
        Incomplete Matrices by Mazumder et. al.

    IterativeSVD: Matrix completion by iterative low-rank SVD decomposition.
        Should be similar to SVDimpute from Missing value estimation methods
        for DNA microarrays by Troyanskaya et. al.

    NuclearNormMinimization: Simple implementation of Exact Matrix
        Completion via Convex Optimization by Emmanuel Candes and Benjamin
        Recht using cvxpy. Too slow for large matrices. The name passes
        validation, but no code path runs it, so nothing is returned for it
        yet.

    BiScaler: Iterative estimation of row/column means and standard
        deviations to get doubly normalized matrix. Not guaranteed to
        converge but works well in practice. Taken from Matrix Completion
        and Low-Rank SVD via Fast Alternating Least Squares.
    """
    _fancyimpute_options = {
        'KNN', 'BiScaler', 'NuclearNormMinimization', 'SoftImpute',
        'IterativeSVD'
    }
    _base_options = {'mean', 'median', 'constant'}
    is_scalar_method = isinstance(method, (int, float))

    # The availability flag is only consulted for methods that need it.
    if (method in _fancyimpute_options) and (not has_fancyimpute):
        raise ValueError(
            'You must install `fancyimpute` (pip install fancyimpute) to use this method'
        )

    if (method not in _base_options) and (
            method not in _fancyimpute_options) and (not is_scalar_method):
        raise ValueError(
            'method not understood.. Use `mean`, `median`, a scalar, or an option from `fancyimpute`'
        )

    X_incomplete = data.copy()

    # Honour a custom missing-value marker. `x == x` is False only for NaN,
    # so the default needs no conversion.
    if nan_value is not None and nan_value == nan_value:
        X_incomplete = X_incomplete.astype(float)
        X_incomplete[X_incomplete == nan_value] = np.nan

    X_filled = None
    if is_scalar_method:
        # A numeric `method` is shorthand for a constant fill.
        X_incomplete[np.isnan(X_incomplete)] = method
        X_filled = X_incomplete

    elif method == 'KNN':
        if value is None:
            value = 3
        X_filled = KNN(k=value, verbose=False).complete(X_incomplete)

    elif method == 'BiScaler':
        X_filled = BiScaler(verbose=False).fit_transform(X_incomplete)

    elif method == 'SoftImpute':
        X_filled = SoftImpute(verbose=False).complete(X_incomplete)

    elif method == 'IterativeSVD':
        if value is None:
            rank = min(10, X_incomplete.shape[0] - 2)
        else:
            rank = value
        X_filled = IterativeSVD(rank=rank,
                                verbose=False).complete(X_incomplete)

    elif method in ('mean', 'median'):
        # 'median' previously reused nanmean and therefore returned the mean.
        column_stat = np.nanmean if method == 'mean' else np.nanmedian
        col_fill = column_stat(X_incomplete, axis=0)
        for i in range(X_incomplete.shape[1]):
            column = X_incomplete[:, i]  # a view: writes land in X_incomplete
            column[np.isnan(column)] = col_fill[i]
        X_filled = X_incomplete

    elif method == 'constant':
        if value is None:
            raise ValueError(
                'Must give `value` argument if method == constant')
        X_incomplete[np.isnan(X_incomplete)] = value
        X_filled = X_incomplete

    return X_filled
masked_ii = train_x_ii[mask, 2] # Use 3 nearest rows which have a feature to fill # in each row's missing features train_x_knn = KNN(k=3, verbose=False).fit_transform(train_x) masked_knn = train_x_knn[mask, 2] # matrix completion using convex optimization to find low-rank solution # that still matches observed values. # Slow! # train_x_nnm = NuclearNormMinimization().fit_transform(train_x) # imp_nnm = train_x_nnm[train_x.isnull().values] # Instead of solving the nuclear norm objective directly, instead # induce sparsity using singular value thresholding train_x_normalized = BiScaler(verbose=False).fit_transform(train_x) train_x_softimpute = SoftImpute(verbose=False).fit_transform( train_x_normalized) masked_soft = train_x_softimpute[mask, 2] # print mean squared error for the four imputation methods above ii_mse = ((masked_ii - masked_x) ** 2).mean() knn_mse = ((masked_knn - masked_x) ** 2).mean() soft_mse = ((masked_soft - masked_x) ** 2).mean() lrcv.fit(train_x_ii, train_y) print("Iterative Imputer\nImputed MSE : {:5f}".format(ii_mse)) print('Ridge alpha : {}'.format(lrcv.alpha_)) ridge = Ridge(alpha=lrcv.alpha_, random_state=SEED + i) cvs = cross_val_score(ridge, train_x_ii, train_y,