def impute(self, trained_model, input):
    """Fill in the missing entries of ``input`` with SoftImpute.

    The table is first normalized with a ``BiScaler`` (joint row/column
    centering and scaling), imputed in the normalized space, and finally
    mapped back to the original scale.

    :param trained_model: trained model returned by train function - not
        used in our case
    :param input: input table which needs to be imputed
    :return: imputed table as a numpy array
    """
    scaler = BiScaler()
    normalized = scaler.fit_transform(input)
    completed_normalized = SoftImpute().fit_transform(normalized)
    return scaler.inverse_transform(completed_normalized)
X_filled_knn = knnImpute.fit_transform(X_incomplete) # matrix completion using convex optimization to find low-rank solution # that still matches observed values. Slow! X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete) # Instead of solving the nuclear norm objective directly, instead # induce sparsity using singular value thresholding softImpute = SoftImpute() # simultaneously normalizes the rows and columns of your observed data, # sometimes useful for low-rank imputation methods biscaler = BiScaler() # rescale both rows and columns to have zero mean and unit variance X_incomplete_normalized = biscaler.fit_transform(X_incomplete) X_filled_softimpute_normalized = softImpute.fit_transform( X_incomplete_normalized) X_filled_softimpute = biscaler.inverse_transform( X_filled_softimpute_normalized) X_filled_softimpute_no_biscale = softImpute.fit_transform(X_incomplete) meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask])**2).mean() print("meanFill MSE: %f" % meanfill_mse) # print mean squared error for the imputation methods above nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask])**2).mean() print("Nuclear norm minimization MSE: %f" % nnm_mse)
df = pd.DataFrame(X, columns=allele_list, index=peptide_list) df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide") scores = ScoreSet() kfold = stratified_cross_validation( X=X, observed_mask=observed_mask, n_folds=args.n_folds) for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold): X_fold_reduced = X_fold[ok_mesh] biscaler = BiScaler( scale_rows=args.normalize_rows, center_rows=args.normalize_rows, scale_columns=args.normalize_columns, center_columns=args.normalize_columns) X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced) for (method_name, solver) in sorted(imputation_methods.items()): print("CV fold %d/%d, running %s" % ( fold_idx + 1, args.n_folds, method_name)) X_completed_reduced_scaled = solver.complete(X_fold_reduced) X_completed_reduced = biscaler.inverse_transform( X_completed_reduced_scaled) X_completed = np.zeros_like(X) X_completed[ok_mesh] = X_completed_reduced y_pred = X_completed[test_coords] mae, tau, auc, f1_score = evaluate_predictions( y_true=X_test_vector, y_pred=y_pred, max_ic50=args.max_ic50) scores.add_many( method_name,
class ResultsTable(object):
    """Benchmark harness for image-imputation solvers.

    Flattens a dict of face images, knocks a square patch of pixels out of
    each one, and lets solvers fill the missing entries.  Records masked
    MSE/MAE per solver and writes original / incomplete / completed images
    plus an HTML summary table under ``dirname``.
    """

    def __init__(
            self,
            images_dict,
            percent_missing=0.25,
            saved_image_stride=25,
            dirname="face_images",
            scale_rows=False,
            center_rows=False):
        """
        :param images_dict: maps label -> image array (2D grayscale or
            HxWx3 color)
        :param percent_missing: fraction of each image's pixels to remove
        :param saved_image_stride: save every n-th image to disk
        :param dirname: output directory for images and the HTML table
        :param scale_rows: forwarded to BiScaler
        :param center_rows: forwarded to BiScaler
        """
        self.images_dict = images_dict
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array(
            [images_dict[k] for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        # color iff there is a third axis holding exactly 3 (RGB) channels
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2] == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print(
            "[ResultsTable] # images = %d, color=%s # features = %d, shape = %s" % (
                self.n_images,
                self.color,
                self.n_features,
                self.image_shape))
        self.flattened_array_shape = (self.n_images, self.n_features)
        self.flattened_images = self.images_array.reshape(
            self.flattened_array_shape)
        # remove a square patch whose area approximates the requested
        # fraction of missing pixels
        n_missing_pixels = int(self.n_pixels * percent_missing)
        missing_square_size = int(np.sqrt(n_missing_pixels))
        print(
            "[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" % (
                n_missing_pixels,
                missing_square_size))
        self.incomplete_images = remove_pixels(
            self.images_array,
            missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" % (
            self.incomplete_images.shape, ))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        self.missing_mask = np.isnan(self.flattened_incomplete_images)
        self.normalizer = BiScaler(
            scale_rows=scale_rows,
            center_rows=center_rows,
            min_value=self.images_array.min(),
            max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)
        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}
        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images, "incomplete", flattened=False)

    def ensure_dir(self, dirname):
        """Create ``dirname`` if it does not already exist."""
        if not exists(dirname):
            print("Creating directory: %s" % dirname)
            mkdir(dirname)

    def save_images(self, images, base_filename, flattened=True):
        """Save every ``saved_image_stride``-th image as a PNG under
        ``dirname/<label>/<base_filename>.png``.

        :param images: image stack, either flattened (n_images, n_features)
            or already in image shape
        :param base_filename: filename stem (typically the solver name)
        :param flattened: whether rows must be reshaped back to images
        """
        self.ensure_dir(self.dirname)
        for i in self.saved_image_indices:
            label = self.labels[i].lower().replace(" ", "_")
            image = images[i, :].copy()
            if flattened:
                image = image.reshape(self.image_shape)
            # render missing pixels as black
            image[np.isnan(image)] = 0
            figure = pylab.gcf()
            axes = pylab.gca()
            extra_kwargs = {}
            # BUGFIX: grayscale images need an explicit gray colormap;
            # matplotlib ignores cmap for RGB arrays, so the previous
            # `if self.color` branch was a no-op and grayscale images
            # were drawn with the default colormap.
            if not self.color:
                extra_kwargs["cmap"] = "gray"
            assert image.min() >= 0, "Image can't contain negative numbers"
            if image.max() <= 1:
                # values in [0, 1] -> byte range (clamped to 255 below)
                image *= 256
            image[image > 255] = 255
            axes.imshow(image.astype("uint8"), **extra_kwargs)
            axes.get_xaxis().set_visible(False)
            axes.get_yaxis().set_visible(False)
            filename = base_filename + ".png"
            subdir = join(self.dirname, label)
            self.ensure_dir(subdir)
            path = join(subdir, filename)
            figure.savefig(path, bbox_inches='tight')
            self.saved_images[i][base_filename] = path

    def add_entry(self, solver, name):
        """Run ``solver`` on the normalized incomplete matrix, record its
        masked MSE/MAE under ``name`` and save the completed images."""
        print("Running %s" % name)
        completed_normalized = solver.fit_transform(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)
        mae = masked_mae(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        mse = masked_mse(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
        self.mse_dict[name] = mse
        self.mae_dict[name] = mae
        self.save_images(completed, base_filename=name)

    def sorted_errors(self):
        """
        Generator for (rank, name, MSE, MAE) sorted by increasing MAE
        """
        for i, (name, mae) in enumerate(
                sorted(self.mae_dict.items(), key=lambda x: x[1])):
            yield (
                i + 1,
                name,
                self.mse_dict[name],
                self.mae_dict[name],
            )

    def print_sorted_errors(self):
        """Print one '<rank>) <name>: MSE=... MAE=...' line per solver."""
        for (rank, name, mse, mae) in self.sorted_errors():
            print("%d) %s: MSE=%0.4f MAE=%0.4f" % (
                rank,
                name,
                mse,
                mae))

    def save_html_table(self, filename="results_table.html"):
        """Write an HTML table of ranked solver errors to
        ``dirname/filename`` and return the HTML string."""
        # BUGFIX: the header row was emitted as <th> wrapping <td> cells,
        # which is invalid HTML; a header row is a <tr> of <th> cells.
        html = """
            <table>
            <tr>
            <th>Rank</th>
            <th>Name</th>
            <th>Mean Squared Error</th>
            <th>Mean Absolute Error</th>
            </tr>
        """
        for (rank, name, mse, mae) in self.sorted_errors():
            html += """
                <tr>
                <td>%d</td>
                <td>%s</td>
                <td>%0.4f</td>
                <td>%0.4f</td>
                </tr>
            """ % (rank, name, mse, mae)
        html += "</table>"
        self.ensure_dir(self.dirname)
        path = join(self.dirname, filename)
        with open(path, "w") as f:
            f.write(html)
        return html
'silent': -1, 'verbose': -1, 'n_jobs': -1, } fit_params = { 'eval_metric': 'auc', 'early_stopping_rounds': 150, 'verbose': 100 } with timer('impute missing'): df = pd.concat([X_train, X_test], axis=0) df = df.loc[:, df.isnull().sum() != len(df)] cols = [f for f in df.columns if df[f].dtype != 'object'] bi = BiScaler() df[cols] = bi.fit_transform(df[cols].values) df.fillna(-9999, inplace=True) X_train = df[:len(X_train)].copy() X_test = df[len(X_train):].copy() del bi, df, cols gc.collect() with timer('training'): cv_results = [] val_series = y_train.copy() test_df = pd.DataFrame() feat_df = None for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)): X_trn = X_train.iloc[trn_idx].copy() y_trn = y_train[trn_idx] X_val = X_train.iloc[val_idx].copy()
class ResultsTable(object):
    """Benchmark harness for image-imputation solvers (older fancyimpute
    ``Solver.complete`` API).

    Flattens a dict of face images, knocks a square patch of pixels out of
    each one, and lets solvers fill the missing entries.  Records masked
    MSE/MAE per solver and writes original / incomplete / completed images
    plus an HTML summary table under ``dirname``.
    """

    def __init__(
            self,
            images_dict,
            percent_missing=0.25,
            saved_image_stride=25,
            dirname="face_images",
            scale_rows=False,
            center_rows=False):
        """
        :param images_dict: maps label -> image array (2D grayscale or
            HxWx3 color)
        :param percent_missing: fraction of each image's pixels to remove
        :param saved_image_stride: save every n-th image to disk
        :param dirname: output directory for images and the HTML table
        :param scale_rows: forwarded to BiScaler
        :param center_rows: forwarded to BiScaler
        """
        self.images_dict = images_dict
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array(
            [images_dict[k] for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        # color iff there is a third axis holding exactly 3 (RGB) channels
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2] == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print("[ResultsTable] # images = %d, color=%s # features = %d, shape = %s" % (
            self.n_images, self.color, self.n_features, self.image_shape))
        self.flattened_array_shape = (self.n_images, self.n_features)
        self.flattened_images = self.images_array.reshape(self.flattened_array_shape)
        # remove a square patch whose area approximates the requested
        # fraction of missing pixels
        n_missing_pixels = int(self.n_pixels * percent_missing)
        missing_square_size = int(np.sqrt(n_missing_pixels))
        print("[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" % (
            n_missing_pixels, missing_square_size))
        self.incomplete_images = remove_pixels(
            self.images_array,
            missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" % (
            self.incomplete_images.shape,))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        self.missing_mask = np.isnan(self.flattened_incomplete_images)
        self.normalizer = BiScaler(
            scale_rows=scale_rows,
            center_rows=center_rows,
            min_value=self.images_array.min(),
            max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)
        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}
        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images, "incomplete", flattened=False)

    def ensure_dir(self, dirname):
        """Create ``dirname`` if it does not already exist."""
        if not exists(dirname):
            print("Creating directory: %s" % dirname)
            mkdir(dirname)

    def save_images(self, images, base_filename, flattened=True):
        """Save every ``saved_image_stride``-th image as a PNG under
        ``dirname/<label>/<base_filename>.png``.

        :param images: image stack, either flattened (n_images, n_features)
            or already in image shape
        :param base_filename: filename stem (typically the solver name)
        :param flattened: whether rows must be reshaped back to images
        """
        self.ensure_dir(self.dirname)
        for i in self.saved_image_indices:
            label = self.labels[i].lower().replace(" ", "_")
            image = images[i, :].copy()
            if flattened:
                image = image.reshape(self.image_shape)
            # render missing pixels as black
            image[np.isnan(image)] = 0
            figure = pylab.gcf()
            axes = pylab.gca()
            extra_kwargs = {}
            # BUGFIX: grayscale images need an explicit gray colormap;
            # matplotlib ignores cmap for RGB arrays, so the previous
            # `if self.color` branch was a no-op and grayscale images
            # were drawn with the default colormap.
            if not self.color:
                extra_kwargs["cmap"] = "gray"
            assert image.min() >= 0, "Image can't contain negative numbers"
            if image.max() <= 1:
                # values in [0, 1] -> byte range (clamped to 255 below)
                image *= 256
            image[image > 255] = 255
            axes.imshow(image.astype("uint8"), **extra_kwargs)
            axes.get_xaxis().set_visible(False)
            axes.get_yaxis().set_visible(False)
            filename = base_filename + ".png"
            subdir = join(self.dirname, label)
            self.ensure_dir(subdir)
            path = join(subdir, filename)
            figure.savefig(
                path,
                bbox_inches='tight')
            self.saved_images[i][base_filename] = path

    def add_entry(self, solver, name):
        """Run ``solver`` on the normalized incomplete matrix, record its
        masked MSE/MAE under ``name`` and save the completed images.

        NOTE(review): uses the pre-0.4 fancyimpute ``complete`` method,
        renamed ``fit_transform`` in later releases -- confirm the pinned
        fancyimpute version before modernizing.
        """
        print("Running %s" % name)
        completed_normalized = solver.complete(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)
        mae = masked_mae(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        mse = masked_mse(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
        self.mse_dict[name] = mse
        self.mae_dict[name] = mae
        self.save_images(completed, base_filename=name)

    def sorted_errors(self):
        """
        Generator for (rank, name, MSE, MAE) sorted by increasing MAE
        """
        for i, (name, mae) in enumerate(
                sorted(self.mae_dict.items(), key=lambda x: x[1])):
            yield(i + 1, name, self.mse_dict[name], self.mae_dict[name],)

    def print_sorted_errors(self):
        """Print one '<rank>) <name>: MSE=... MAE=...' line per solver."""
        for (rank, name, mse, mae) in self.sorted_errors():
            print("%d) %s: MSE=%0.4f MAE=%0.4f" % (
                rank,
                name,
                mse,
                mae))

    def save_html_table(self, filename="results_table.html"):
        """Write an HTML table of ranked solver errors to
        ``dirname/filename`` and return the HTML string."""
        # BUGFIX: the header row was emitted as <th> wrapping <td> cells,
        # which is invalid HTML; a header row is a <tr> of <th> cells.
        html = """
            <table>
            <tr>
            <th>Rank</th>
            <th>Name</th>
            <th>Mean Squared Error</th>
            <th>Mean Absolute Error</th>
            </tr>
        """
        for (rank, name, mse, mae) in self.sorted_errors():
            html += """
                <tr>
                <td>%d</td>
                <td>%s</td>
                <td>%0.4f</td>
                <td>%0.4f</td>
                </tr>
            """ % (rank, name, mse, mae)
        html += "</table>"
        self.ensure_dir(self.dirname)
        path = join(self.dirname, filename)
        with open(path, "w") as f:
            f.write(html)
        return html
# fix RNG seeds so the imputation run is reproducible
random.seed(123)
np.random.seed(123)

# read in data and transpose
# NOTE(review): error_bad_lines is deprecated in pandas >= 1.3 (replaced
# by on_bad_lines="skip"); kept as-is for compatibility with the pandas
# version this script was written against -- confirm before changing.
data = pd.read_csv(input_file,
                   sep='\t',
                   header=0,
                   index_col=0,
                   error_bad_lines=False)
new_data = data.copy()
transposed = new_data.T

# we'll need a matrix specifically for the biscaler transform, for SoftImpute
print("SoftImpute...")
# BUGFIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
# in 1.0; .values is the long-standing equivalent and works on all versions.
transposed_mat = transposed.values
biscaler = BiScaler()

# perform the scaling appropriate for this imputation strategy
transposed_normalized = biscaler.fit_transform(transposed_mat)

# the imputation itself
imputed_softimpute = SoftImpute().fit_transform(transposed_normalized)

# we don't want the transformed values and we want samples to be columns
inverse_softimpute = biscaler.inverse_transform(imputed_softimpute)
untransposed_softimpute = inverse_softimpute.transpose()

# prepare to write to file, back to DataFrame, return indices
softimpute_df = pd.DataFrame(untransposed_softimpute)
softimpute_df.index = data.index
softimpute_df.columns = data.columns.values

# write to a tab separated values file, but we'll use the .pcl file extension
softimpute_outfile = outfile + "_softimpute.pcl"
X_filled_knn = knnImpute.complete(X_incomplete) # matrix completion using convex optimization to find low-rank solution # that still matches observed values. Slow! X_filled_nnm = NuclearNormMinimization().complete(X_incomplete) # Instead of solving the nuclear norm objective directly, instead # induce sparsity using singular value thresholding softImpute = SoftImpute() # simultaneously normalizes the rows and columns of your observed data, # sometimes useful for low-rank imputation methods biscaler = BiScaler() # rescale both rows and columns to have zero mean and unit variance X_incomplete_normalized = biscaler.fit_transform(X_incomplete) X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized) X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized) X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete) meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean() print("meanFill MSE: %f" % meanfill_mse) # print mean squared error for the three imputation methods above nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean() print("Nuclear norm minimization MSE: %f" % nnm_mse) softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean() print("SoftImpute MSE: %f" % softImpute_mse)