def display_reconstruction(self, data, reconstruction):
    self.eval()
    print("GENERATING RECONSTRUCTION IMAGES autoencoder!")
    hparams_string = "/".join([
        "num_elements" + str(self.num_elements),
        "n_flows" + str(self.n_flows),
        "z_dim" + str(self.z_dim_last),
        "unsupervised",
        "lr" + str(self.lr),
        "ladder" + str(self.ladder),
        self.flavour
    ])
    x = data.view(-1, self.input_shape[0], self.input_shape[1],
                  self.input_shape[2]).data
    x_grid = tv.utils.make_grid(x)
    x_recon = reconstruction.view(-1, self.input_shape[0], self.input_shape[1],
                                  self.input_shape[2]).data
    x_recon_grid = tv.utils.make_grid(x_recon)
    images_path = self.images_path + hparams_string + "/recon/" + self.prior_dist + "/"
    print("Images location:", images_path)
    create_missing_folders(images_path)
    tv.utils.save_image(x_grid, images_path + "original_" + str(self.epoch) + ".png")
    tv.utils.save_image(x_recon_grid,
                        images_path + "reconstruction_example_" + str(self.epoch) + ".png")
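# create_missing_folders is used throughout this module but not defined in it.
# A minimal sketch consistent with every call site (create a directory tree,
# skipping parts that already exist) -- an assumption about the project's
# helper, not its actual implementation:
import os

def create_missing_folders(path):
    # makedirs with exist_ok=True is idempotent: it creates any missing
    # component of `path` and does nothing if the folder already exists.
    os.makedirs(path, exist_ok=True)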
def generate_random(self, max=1000):
    self.eval()
    print("GENERATING RANDOM IMAGES autoencoder!")
    hparams_string = "/".join([
        "num_elements" + str(self.num_elements),
        "n_flows" + str(self.n_flows),
        "z_dim" + str(self.z_dim_last),
        "unsupervised",
        "lr" + str(self.lr),
        "ladder" + str(self.ladder),
        self.flavour,
        self.prior_dist
    ])
    images_path = self.images_path + hparams_string + "/generated_random/" + self.prior_dist + "/"
    create_missing_folders(images_path)
    rand_z = torch.randn(self.batch_size, self.z_dim_last).cuda()
    self.plot_z_stats(rand_z.detach().cpu().numpy(),
                      generate="/random_generated/" + self.prior_dist + "/",
                      path=images_path, max=max)
    new_x = self.sample(rand_z)
    if len(self.input_shape) > 1:
        images = new_x.view(-1, self.input_shape[0], self.input_shape[1],
                            self.input_shape[2]).data
        images_grid = tv.utils.make_grid(images)
        print("Images location:", images_path)
        tv.utils.save_image(images_grid,
                            images_path + str(self.epoch) + self.dataset_name + "generated.png")
        del images_grid, images
    del rand_z, new_x, images_path, hparams_string
def loadGEO(self, geo_id='GSE22845', dataframes_folder="dataframes"):
    """
    Load a GEO dataset from a pickled dataframe on disk, if present.

    :param geo_id: GEO accession to load
    :return: True if the file was NOT found (so the caller must rebuild it)

    Example:
        from debug.get_parameters import *
        dataframes_folder = "/Users/simonpelletier/data/hebbian_learning_ann"
        g = geoParser(destination_folder=data_destination)
        g.getGEO(geo_ids, is_load_from_disk)
    """
    import pandas as pd
    import numpy as np
    flag = False
    print('Loading ' + geo_id + ' ...')
    self.df_file_name = geo_id + '_dataframe.pickle.npy'
    create_missing_folders(self.dataframes_path)
    current_directory_list = os.listdir(self.dataframes_path)
    if self.df_file_name in current_directory_list:
        print("File found at location:", self.data_folder_path + "/" + self.df_file_name)
        self.df[geo_id] = pd.read_pickle(self.dataframes_path + "/" + self.df_file_name)
        if np.isnan(self.df[geo_id].values).any():
            print("NaNs found. They are all replaced by 0")
            self.df[geo_id][np.isnan(self.df[geo_id])] = 0
    else:
        print(self.df_file_name, 'NOT FOUND in', self.dataframes_path)
        print(current_directory_list)
        flag = True
    return flag
def plot_performance(loss_total, accuracy, labels, results_path, filename="NoName",
                     verbose=0, std_loss=None, std_accuracy=None):
    """
    Plot total loss (left axis) and accuracy (right axis) per epoch,
    optionally with error bars.

    :param loss_total: dict with "train" and "valid" loss histories
    :param accuracy: dict with "train" and "valid" accuracy histories
    :param labels: dict with "train" and "valid" label lists (used for counts)
    :param results_path: folder under which /plots/<filename> is saved
    :param filename: name of the saved figure
    :param verbose: print the destination path when > 0
    :param std_loss: optional dict of loss standard deviations for error bars
    :param std_accuracy: optional dict of accuracy standard deviations
    """
    fig2, ax21 = plt.subplots()
    n = list(range(len(accuracy["train"])))
    try:
        ax21.plot(loss_total["train"], 'b-',
                  label='Train total loss:' + str(len(labels["train"])))
        ax21.plot(loss_total["valid"], 'g-',
                  label='Valid total loss:' + str(len(labels["valid"])))
    except (KeyError, TypeError):
        # Fall back to count-less legends when the label dicts are absent.
        ax21.plot(loss_total["train"], 'b-', label='Train total loss:')
        ax21.plot(loss_total["valid"], 'g-', label='Valid total loss:')
    # Guard on std_loss (the original tested std_accuracy here, which would
    # fail whenever only one of the two was provided).
    if std_loss is not None:
        ax21.errorbar(x=n, y=loss_total["train"],
                      yerr=[np.array(std_loss["train"]), np.array(std_loss["train"])],
                      c="b", label='Train')
        ax21.errorbar(x=n, y=loss_total["valid"],
                      yerr=[np.array(std_loss["valid"]), np.array(std_loss["valid"])],
                      c="g", label='Valid')
    ax21.set_xlabel('epochs')
    ax21.set_ylabel('Loss')
    handles, labels = ax21.get_legend_handles_labels()
    ax21.legend(handles, labels)

    ax22 = ax21.twinx()
    ax22.set_ylabel('Accuracy')
    ax22.plot(accuracy["train"], 'c--', label='Train')
    ax22.plot(accuracy["valid"], 'k--', label='Valid')
    if std_accuracy is not None:
        ax22.errorbar(x=n, y=accuracy["train"],
                      yerr=[np.array(std_accuracy["train"]), np.array(std_accuracy["train"])],
                      c="c", label='Train')
        ax22.errorbar(x=n, y=accuracy["valid"],
                      yerr=[np.array(std_accuracy["valid"]), np.array(std_accuracy["valid"])],
                      c="k", label='Valid')
    handles, labels = ax22.get_legend_handles_labels()
    ax22.legend(handles, labels)
    fig2.tight_layout()
    if verbose > 0:
        print("Performance at", results_path)
    create_missing_folders(results_path + "/plots/")
    pylab.savefig(results_path + "/plots/" + filename)
    plt.show()
    plt.close()
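# A hypothetical smoke test for plot_performance above; the dict layout
# ({"train": [...], "valid": [...]}) mirrors how the function indexes its
# arguments, and the values are made up for illustration.
if __name__ == "__main__":
    toy_loss = {"train": [1.2, 0.9, 0.7], "valid": [1.3, 1.0, 0.9]}
    toy_acc = {"train": [0.50, 0.60, 0.70], "valid": [0.45, 0.55, 0.60]}
    toy_labels = {"train": list(range(100)), "valid": list(range(20))}
    plot_performance(toy_loss, toy_acc, toy_labels,
                     results_path="/tmp/results", filename="smoke_test.png", verbose=1)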
def generate_uniform_gaussian_percentiles(self, epoch=0, verbose=0, show_pca=0,
                                          show_lda=0, n=20, drop_na=False):
    # A grid of evenly spaced Gaussian percentiles in latent space;
    # squeezing values with tanh was found to give much better results.
    zs_grid = torch.stack([
        torch.Tensor(np.vstack([np.linspace(norm.ppf(0.05), norm.ppf(0.95), n ** 2)
                                for _ in range(self.z_dim_last)]).T)
        for _ in range(self.num_classes)
    ])
    hparams_string = "/".join(["num_elements" + str(self.num_elements),
                               "n_flows" + str(self.n_flows),
                               "z_dim" + str(self.z_dim_last),
                               "a_dim" + str(self.a_dim),
                               "lr" + str(self.lr),
                               "ladder" + str(self.ladder),
                               self.flavour,
                               "n_labelled" + str(len(self.train_loader))])
    images_path = self.results_path + "/" + hparams_string + "/gaussian_percentiles/"
    if verbose > 0:
        print("GENERATING SS DGM IMAGES AT", images_path)
    # One block of one-hot labels per class; iterate over the classes
    # (the original iterated range(n), but y is indexed by class below).
    y = torch.stack([torch.Tensor(onehot_array(n ** 2 * [i], self.num_classes))
                     for i in range(self.num_classes)])
    x_mu = [self.sample(torch.Tensor(zs_grid[i]).cuda(), y[i])
            for i in range(self.num_classes)]
    labels_set_ints = list(range(len(self.labels_set)))
    if len(self.input_shape) > 1:
        images = torch.stack([x_mu[i].view(-1, self.input_shape[0], self.input_shape[1],
                                           self.input_shape[2]).data
                              for i in range(len(x_mu))])
        images = images.view(-1, self.input_shape[0], self.input_shape[1],
                             self.input_shape[2])
        images_grid = tv.utils.make_grid(images, n)
        create_missing_folders(images_path)
        tv.utils.save_image(images_grid,
                            images_path + "/" + str(epoch) + "gaussian_percentiles_generated.png")
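# onehot_array is assumed here (and in generate_random below) but is not
# defined in this section. A sketch consistent with its call sites (a list of
# integer class indices in, a 2-D one-hot array out) -- an assumption:
import numpy as np

def onehot_array(class_indices, num_classes):
    out = np.zeros((len(class_indices), num_classes), dtype=np.float32)
    out[np.arange(len(class_indices)), class_indices] = 1.0
    return out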
def save_model(self): # SAVING print("MODEL SAVED AT LOCATION:", self.model_history_path) create_missing_folders(self.model_history_path) torch.save( self.state_dict(), self.model_history_path + self.flavour + "_" + self.model_file_name + '.state_dict') if self.ssl: torch.save( self.classifier.state_dict(), self.model_history_path + self.flavour + "_" + self.model_file_name + 'classifier.state_dict') torch.save( self.train_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_loss') torch.save( self.train_rec_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_re') torch.save( self.train_kl_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_kl') torch.save( self.val_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.val_loss') torch.save( self.val_rec_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.val_re') torch.save( self.val_kl_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.val_kl') torch.save( self.epoch, self.model_history_path + self.flavour + "_" + self.model_file_name + '.epoch')
def plot_losses(losses, labels, n_neurons=None, results_path="~", filename="NoName"):
    filename = "_".join([filename, "loss.png"])
    create_missing_folders(results_path + "/plots/hnet/")
    fig, ax1 = plt.subplots()
    plt.ylim([0., 1000.])
    ax1.plot(losses, 'g-.', label='train')
    ax1.set_xlabel('epochs')
    ax1.set_ylabel('Loss')
    handles, labels = ax1.get_legend_handles_labels()
    ax1.legend(handles, labels)
    if n_neurons is not None:
        ax22 = ax1.twinx()
        for i, n in enumerate(n_neurons):
            ax22.plot(n_neurons[i], '--', label="Hidden Layer " + str(i))
        ax22.set_ylabel('#Neurons')
        handles, labels = ax22.get_legend_handles_labels()
        ax22.legend(handles, labels)
    fig.tight_layout()
    pylab.savefig(results_path + "/plots/hnet/" + filename)
    plt.close()
def input_pruning(self, results_path, min_n_input_dims=20, minimum_neurons=20):
    """
    Prune input dimensions whose Hebbian values fall outside [gt_input, lt_input].

    :param results_path: where the pruning mask images are saved
    :param min_n_input_dims: do nothing if fewer inputs than this remain
    :param minimum_neurons: minimum number of inputs to keep alive
    :return: (valid_bool, alive_inputs)
    """
    self.eval()
    with torch.no_grad():
        # clone() is the side-effect-free equivalent of the original
        # self-copy_ (which copied the tensor onto itself).
        hebb_input = self.hebb_input_values.data.clone().cpu().numpy()
        if len(hebb_input) >= min_n_input_dims:
            to_keep = hebb_input > float(self.gt_input)
            not_too_used = hebb_input < float(self.lt_input)
            print("min_hebb_value:", self.gt_input)
            valid_indices = indices_h(to_keep)
            valid_indices_down = indices_h(not_too_used)
            total_valid = np.intersect1d(valid_indices, valid_indices_down)
            if len(valid_indices) < minimum_neurons:
                # TODO Replace neurons that could not be removed?
                # Keep the `minimum_neurons` inputs with the smallest Hebbian
                # values (np.argsort, since hebb_input is a numpy array here;
                # the original called torch.sort on it).
                valid_indices = indices_h(np.argsort(hebb_input) < minimum_neurons)
                print("Minimum neurons on layer 1", sep="\t", file=self.hebb_log)
            print("previous_valid_len", self.previous_valid_len)
            self.valid_bool = [1. if x in valid_indices else 0. for x in range(self.input_size)]
            self.valid_bool_down = [1. if x in valid_indices_down else 0. for x in range(self.input_size)]
            self.valid_bool_total = [1. if x in total_valid else 0. for x in range(self.input_size)]
            self.alive_inputs = [x for x in range(len(hebb_input)) if x in valid_indices]
            self.alive_inputs_down = [x for x in range(len(hebb_input)) if x in valid_indices_down]
            self.alive_inputs_total = [x for x in range(len(hebb_input)) if x in total_valid]
            alive_inputs = np.array(self.alive_inputs)
            masks_path = results_path + "/images/masks/" + str(self.dataset_name) + "/"
            create_missing_folders(masks_path)

            img_path = "_".join(["alive_inputs", str(len(valid_indices_down)), str(self.epoch), "down.png"])
            print("self.n_channels", self.n_channels)
            if len(self.input_shape) == 3:
                print("SAVING MASK at", results_path)
                mask = np.reshape(self.valid_bool_down, newshape=(28, 28))  # TODO change hard coding
                plt.imsave(masks_path + img_path, mask)

            img_path = "_".join(["alive_inputs", str(len(total_valid)), str(self.epoch), "total.png"])
            print("self.n_channels", self.n_channels)
            if len(self.input_shape) == 3:
                print("SAVING MASK at", results_path)
                mask = np.reshape(self.valid_bool_total, newshape=(28, 28))  # TODO change hard coding
                plt.imsave(masks_path + img_path, mask)

            img_path = "_".join(["alive_inputs", str(len(valid_indices)), str(self.epoch), "up.png"])
            print("self.n_channels", self.n_channels)
            if len(self.input_shape) == 3:
                print("SAVING MASK at", results_path)
                mask = np.reshape(self.valid_bool, newshape=(28, 28))  # TODO change hard coding
                plt.imsave(masks_path + img_path, mask)

            self.previous_valid_len = len(valid_indices)
            self.valid_bool_tensor = self.valid_bool_tensor * torch.Tensor(self.valid_bool).cuda()
    return self.valid_bool, self.alive_inputs
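# indices_h is referenced above but not defined in this section. From its call
# sites (a boolean mask in, a list of positions out), a plausible sketch --
# an assumption about the project's helper:
import numpy as np

def indices_h(mask):
    # Positions where the boolean mask is True.
    return np.where(np.asarray(mask))[0].tolist()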
def calculate_losses(self, data, lambda1=0., lambda2=0., beta=1., likelihood=F.mse_loss):
    ladder = "ladder" if self.ladder else "not_ladder"
    self.images_path = self.results_path + "/images/examples/generative/" + ladder + "/" + self.flavour + "/"
    create_missing_folders(self.images_path)
    data = torch.tanh(data)
    if self.flow_type in ["o-sylvester", "t-sylvester", "h-sylvester"] and not self.ladder:
        z_q = {0: None, -1: None}  # first (z_0) and last (z_k) flow variables
        reconstruction, mu, log_var, self.log_det_j, z_q[0], z_q[-1] = \
            self.run_sylvester(data, auxiliary=self.auxiliary)
        # ln p(z_k) under the standard Gaussian prior (not averaged)
        log_p_zk = log_standard_gaussian(z_q[-1])
        # ln q(z_0), corrected by the flow's log-determinant
        log_q_z0 = log_gaussian(z_q[0], mu, log_var=log_var) - self.log_det_j
        # E_q0[ ln q(z_0) - ln p(z_k) ]
        self.kl_divergence = log_q_z0 - log_p_zk
        del log_q_z0, log_p_zk
    else:
        reconstruction, z_q = self(data)
    kl = beta * self.kl_divergence
    # reduction="none" is the modern spelling of the deprecated reduce=False.
    likelihood = torch.sum(likelihood(reconstruction, data.float(), reduction="none"), dim=-1)
    if self.ladder:
        params = torch.cat([x.view(-1) for x in self.reconstruction.parameters()])
    else:
        params = torch.cat([x.view(-1) for x in self.decoder.reconstruction.parameters()])
    l1_regularization = lambda1 * torch.norm(params, 1).cuda()
    l2_regularization = lambda2 * torch.norm(params, 2).cuda()
    try:
        assert l1_regularization >= 0. and l2_regularization >= 0.
    except AssertionError:
        print(l1_regularization, l2_regularization)
    loss = torch.mean(likelihood + kl.cuda() + l1_regularization + l2_regularization)
    del data, params, l1_regularization, l2_regularization, lambda1, lambda2
    return loss, torch.mean(likelihood), torch.mean(kl), reconstruction, z_q
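# log_standard_gaussian and log_gaussian are assumed utilities in the flow KL
# term above. Sketches under the usual diagonal-Gaussian log-density formulas,
# summed over the feature dimension to match how kl_divergence is reduced:
import math
import torch

def log_standard_gaussian(z):
    # log N(z; 0, I), summed over the last dimension.
    return torch.sum(-0.5 * (math.log(2 * math.pi) + z ** 2), dim=-1)

def log_gaussian(z, mu, log_var):
    # log N(z; mu, diag(exp(log_var))), summed over the last dimension.
    return torch.sum(-0.5 * (math.log(2 * math.pi) + log_var
                             + (z - mu) ** 2 / torch.exp(log_var)), dim=-1)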
def load_geo(self, geo_id, labelled, bad_example=False):
    """
    Load a GEO dataset from a pickled dataframe on disk, if present.

    :param geo_id: GEO accession to load
    :param labelled: whether to load into df (labelled) or unlabelled_df
    :param bad_example: load the "bad" variant of the dataframe
    :return: True if the file was NOT found (so the caller must rebuild it)

    Example:
        from debug.get_parameters import *
        dataframes_folder = "/Users/simonpelletier/data/annleukemia"
        g = GeoParser(destination_folder=data_destination)
        g.get_geo(geo_ids, load_from_disk)
    """
    flag = False
    print('Loading ' + geo_id + ", labelled: " + str(labelled) + ' ...')
    if not bad_example:
        self.df_file_name = geo_id + "_labelled" + str(labelled) + '_dataframe.pickle.npy'
    else:
        self.df_file_name = geo_id + "_labelled" + str(labelled) + '_bad_dataframe.pickle.npy'
    create_missing_folders(self.dataframes_path)
    current_directory_list = os.listdir(self.dataframes_path)
    if self.df_file_name in current_directory_list:
        print("File found at location:", self.data_folder_path + "/" + self.df_file_name)
        if labelled or bad_example:
            self.df[geo_id] = pd.read_pickle(self.dataframes_path + "/" + self.df_file_name)
            if np.isnan(self.df[geo_id].values).any():
                print("NaNs found. They are all replaced by 0")
                self.df[geo_id][np.isnan(self.df[geo_id])] = 0
            print("self.df[geo_id]", self.df[geo_id].shape)
        else:
            self.unlabelled_df[geo_id] = pd.read_pickle(self.dataframes_path + "/" + self.df_file_name)
            if np.isnan(self.unlabelled_df[geo_id].values).any():
                print("NaNs found. They are all replaced by 0")
                self.unlabelled_df[geo_id][np.isnan(self.unlabelled_df[geo_id])] = 0
            print("self.unlabelled_df[geo_id]", self.unlabelled_df[geo_id].shape)
    else:
        print(self.df_file_name, 'NOT FOUND in', self.dataframes_path)
        flag = True
    return flag
def histograms_hidden_layers(xs, results_path, normalized, is_mean=True, epoch=0, depth=0,
                             activated=False, mu=None, var=None, axis=0, bins=50,
                             flat=True, neuron=None):
    ax = plt.subplot(111)
    ax.set_xlabel("Hidden value")
    ax.set_ylabel("Frequency")
    plt.title("PDF of preactivation values")
    if neuron is None:
        neurons = "all"
    else:
        neurons = "single"
        xs = xs[:, neuron]
    if is_mean:
        xs = np.mean(xs, axis=axis)
    ax.hist(xs, bins=bins, alpha=0.5, density=True)
    if mu is None and var is None:
        # Fit the overlay curve to the empirical moments.
        mean_mean = float(np.mean(xs))
        mean_var = float(np.var(xs))
    elif mu is not None and var is not None:
        mean_mean = float(mu)
        mean_var = float(var)
    else:
        print("No image saved: mu and var must both be None or both be given.")
        return
    normal_curve(ax, mean_mean, mean_var)
    if activated:
        plt.axvline(x=float(np.mean(xs)), c="g", linewidth=1)
    destination_folder_path = "/".join((results_path, "layers_histograms",
                                        "depth_" + str(depth),
                                        "activated_" + str(activated),
                                        "normalized_" + str(normalized))) + "/"
    create_missing_folders(destination_folder_path)
    destination_file_path = (destination_folder_path + "Hidden_values_hist_" + str(epoch)
                             + "_activated" + str(activated)
                             + "_normalized" + str(normalized)
                             + "_mean" + str(is_mean)
                             + "_flat" + str(flat)
                             + "_" + neurons + "neurons.png")
    plt.savefig(destination_file_path)
    plt.close()
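# normal_curve is called above but not defined in this section; a plausible
# sketch (an assumption) that overlays the Gaussian pdf implied by the given
# mean and variance on the histogram axis:
import numpy as np
from scipy.stats import norm

def normal_curve(ax, mean, variance, n_points=200):
    std = np.sqrt(variance)
    grid = np.linspace(mean - 4 * std, mean + 4 * std, n_points)
    ax.plot(grid, norm.pdf(grid, loc=mean, scale=std), "r-", linewidth=1)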
def ordination2d(data_frame, ORD=PCA,
                 images_folder_path="/home/simon/results/hebbian_learning_ann/plots/",
                 filenames="NoName", a=0.5):
    type_images_folder_path = images_folder_path + filenames + "/"
    create_missing_folders(type_images_folder_path)
    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except AssertionError:
        print("The data object passed to ordination2d must be a "
              "pandas.core.frame.DataFrame. Returning without producing a plot.")
        return
    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    # Not every ordination class accepts `verbose` (sklearn's PCA does not),
    # so only n_components is passed here.
    ordination = ORD(n_components=2)
    principal_components = ordination.fit_transform(np.transpose(data_frame.values))
    principal_df = pd.DataFrame(data=principal_components,
                                columns=['principal component 1', 'principal component 2'])
    final_df = pd.concat([principal_df, pd.DataFrame(y)], axis=1)
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component ordination', fontsize=20)
    colors = ['r', 'g', 'b']
    for target, color in zip(classes_list, colors):
        indices_to_keep = final_df[0] == target
        data1 = final_df.loc[indices_to_keep, 'principal component 1']
        data2 = final_df.loc[indices_to_keep, 'principal component 2']
        ellipse_data(data1, data2, ax, color)
        ax.scatter(data1, data2, c=color, s=12)
    ax.legend(classes_list)
    ax.grid()
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, classes_list)
    plt.tight_layout()
    fig.tight_layout()
    # Save into the folder created above; the original referenced an
    # undefined `type_ord` in this path.
    fig.savefig(type_images_folder_path + filenames + ".png")
    plt.close(fig)
def generate_random(self, epoch=0, verbose=0, show_pca=1, show_lda=1, n=40,
                    drop_na=False, keep_images=True, only_na=False):
    hparams_string = "/".join(["num_elements" + str(self.num_elements),
                               "n_flows" + str(self.n_flows),
                               "z_dim" + str(self.z_dim_last),
                               "a_dim" + str(self.a_dim),
                               "lr" + str(self.lr),
                               "ladder" + str(self.ladder),
                               self.flavour])
    images_path = self.results_path + "/" + hparams_string + "/random/"
    create_missing_folders(images_path)
    if verbose > 0:
        print("GENERATING IMAGES AT", images_path)
    self.eval()
    # The Variable wrapper was dropped; it is a no-op in modern PyTorch.
    rand_z = torch.randn(n * self.num_classes, self.z_dim)
    if not only_na:
        y = torch.cat([torch.Tensor(onehot_array(n * [i], self.num_classes))
                       for i in range(self.num_classes)])
    else:
        # Only the N/A class (this assumes onehot_array maps index num_classes
        # to the N/A slot); torch.cat is unnecessary for a single tensor.
        y = torch.Tensor(onehot_array(n * [self.num_classes], self.num_classes))
    rand_z, y = rand_z.cuda(), y.cuda()
    x_mu = self.sample(rand_z, y)
    # Initialise so the cleanup and return below work when no images are kept.
    images = None
    images_grid = None
    if len(self.input_shape) > 1 and keep_images:
        images = x_mu.view(-1, self.input_shape[0], self.input_shape[1],
                           self.input_shape[2]).data
        images_grid = tv.utils.make_grid(images, 20)
        tv.utils.save_image(images_grid,
                            images_path + "/" + str(epoch) + "only_na:" + str(only_na) + "_generated.png")
    colnames = [list(self.labels_set)[one_hot.cpu().numpy().tolist().index(1)]
                for one_hot in y]
    df = pd.DataFrame(x_mu.transpose(1, 0).detach().cpu().numpy(), columns=colnames)
    if drop_na:
        try:
            df = df.drop(["N/A"], axis=1)
        except KeyError:
            pass
    if show_pca != 0 and epoch % show_pca == 0 and epoch != 0:
        try:
            ordination2d(df, "pca", epoch=self.epoch, images_folder_path=images_path,
                         dataset_name=self.dataset_name, a=0.5, verbose=0, info="generated")
        except Exception:
            print("No pca.")
    if show_lda != 0 and epoch % show_lda == 0 and epoch != 0:
        try:
            ordination2d(df, "lda", epoch=self.epoch, images_folder_path=images_path,
                         dataset_name=self.dataset_name, a=0.5, verbose=0, info="generated")
        except Exception:
            print("No lda.")
    del df, colnames, images_grid, x_mu, rand_z, y
    return images
def QDA(data_frame, images_folder_path, dataset_name, epoch, a=0.5, verbose=0,
        info="none", show_images=True):
    import pandas as pd
    import numpy as np
    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except AssertionError:
        # Return instead of exit(), so the caller survives a bad input
        # (the original exit() made the return below unreachable).
        print("The data object passed to QDA must be a pandas.core.frame.DataFrame. "
              "Returning without producing a plot.")
        print(type(data_frame))
        return
    if type(dataset_name) == list:
        dataset_name = "_".join(dataset_name)
    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    data_frame.values[np.isnan(data_frame.values)] = 0
    X = np.transpose(data_frame.values)

    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred)
    plot_qda_cov(qda, splot)
    plt.axis('tight')

    plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant Analysis')
    try:
        plt.tight_layout()
    except Exception:
        pass
    type_images_folder_path = "/".join([images_folder_path, str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"
    create_missing_folders(type_images_folder_path)
    plt.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png", dpi=100)
    if show_images:
        plt.show()
    plt.close()
def display_reconstruction(self, epoch, data, reconstruction, display_rate=1):
    hparams_string = "/".join(["num_elements" + str(self.num_elements),
                               "n_flows" + str(self.n_flows),
                               "z_dim" + str(self.z_dim_last),
                               "a_dim" + str(self.a_dim),
                               "lr" + str(self.lr),
                               "ladder" + str(self.ladder),
                               self.flavour])
    images_path = self.results_path + "/" + hparams_string + "/reconstruction/"
    create_missing_folders(images_path)
    x = data.view(-1, self.input_shape[0], self.input_shape[1], self.input_shape[2]).data
    x_grid = tv.utils.make_grid(x)
    x_recon = reconstruction.view(-1, self.input_shape[0], self.input_shape[1],
                                  self.input_shape[2]).data
    x_recon_grid = tv.utils.make_grid(x_recon)
    if epoch % display_rate == 0:
        print("GENERATING RECONSTRUCTION IMAGES autoencoder!")
        tv.utils.save_image(x_grid, images_path + str(epoch) + "_original.png")
        tv.utils.save_image(x_recon_grid, images_path + str(epoch) + "_reconstruction_example.png")
def plot_z_stats(self, z, path, generate="generated", max=5000):
    fig, ax = plt.subplots()  # create figure and axis
    plt.boxplot(z)
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, labels)
    plt.tight_layout()
    fig.tight_layout()
    path = "/".join([path, "plots/vae_z_stats", generate]) + "/"
    create_missing_folders(path)
    fig.savefig(path + self.flavour + "_" + str(self.epoch) + '_lr' + str(self.lr)
                + '_bs' + str(self.batch_size) + ".png")
    plt.close(fig)
    if z.shape[1] == 2:
        self.plot_z(generated=generate)
    del z, path, generate
def save_model(self): # SAVING print("MODEL (with classifier) SAVED AT LOCATION:", self.model_history_path) create_missing_folders(self.model_history_path) torch.save(self.state_dict(), self.model_history_path + self.flavour + "_" + self.model_file_name +'.state_dict') torch.save(self.classifier.state_dict(), self.model_history_path + self.flavour + "_" + self.model_file_name +'classifier.state_dict') torch.save(self.train_total_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_total_loss') torch.save(self.train_labelled_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_labelled_loss') torch.save(self.train_unlabelled_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_unlabelled_loss') torch.save(self.train_accuracy_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_accuracy') torch.save(self.train_kld_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_kld') torch.save(self.valid_total_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_total_loss') torch.save(self.valid_labelled_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_labelled_loss') torch.save(self.valid_unlabelled_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_unlabelled_loss') torch.save(self.valid_accuracy_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_accuracy') torch.save(self.valid_kld_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_kld') torch.save(self.epoch, self.model_history_path + self.flavour + "_" + self.model_file_name + '.epoch')
def plot_performance(loss_total, loss_labelled, loss_unlabelled, accuracy, labels,
                     results_path, filename="NoName", verbose=0):
    fig2, ax21 = plt.subplots()
    try:
        ax21.plot(loss_total["train"], 'b-',
                  label='Train total loss:' + str(len(labels["train"])))
        ax21.plot(loss_total["valid"], 'g-',
                  label='Valid total loss:' + str(len(labels["valid"])))
        ax21.plot(loss_labelled["train"], 'b-.',
                  label='Train labelled loss:' + str(len(labels["train"])))
        ax21.plot(loss_labelled["valid"], 'g-.',
                  label='Valid labelled loss:' + str(len(labels["valid"])))
        ax21.plot(loss_unlabelled["train"], 'b.',
                  label='Train unlabelled loss:' + str(len(labels["train"])))
        ax21.plot(loss_unlabelled["valid"], 'g.',
                  label='Valid unlabelled loss:' + str(len(labels["valid"])))
    except (KeyError, TypeError):
        # Fall back to count-less legends when the label dicts are absent.
        ax21.plot(loss_total["train"], 'b-', label='Train total loss:')
        ax21.plot(loss_total["valid"], 'g-', label='Valid total loss:')
        ax21.plot(loss_labelled["train"], 'b-.', label='Train labelled loss:')
        ax21.plot(loss_labelled["valid"], 'g-.', label='Valid labelled loss:')
        ax21.plot(loss_unlabelled["train"], 'b.', label='Train unlabelled loss:')
        ax21.plot(loss_unlabelled["valid"], 'g.', label='Valid unlabelled loss:')
    ax21.set_xlabel('epochs')
    ax21.set_ylabel('Loss')
    handles, labels = ax21.get_legend_handles_labels()
    ax21.legend(handles, labels)

    ax22 = ax21.twinx()
    ax22.set_ylabel('Accuracy')
    ax22.plot(accuracy["train"], 'b--', label='Train')
    ax22.plot(accuracy["valid"], 'g--', label='Valid')
    handles, labels = ax22.get_legend_handles_labels()
    ax22.legend(handles, labels)
    fig2.tight_layout()
    if verbose > 0:
        print("Performance at", results_path)
    create_missing_folders(results_path + "/plots/")
    pylab.savefig(results_path + "/plots/" + filename)
    plt.show()
    plt.close()
def generate_uniform_gaussian_percentiles(self, n=20, verbose=1, max=1000):
    self.eval()
    print("GENERATING gaussian percentiles IMAGES autoencoder!")
    # n**2 evenly spaced Gaussian percentiles per latent dimension.
    xs_grid = torch.Tensor(np.vstack([np.linspace(norm.ppf(0.01), norm.ppf(0.99), n ** 2)
                                      for _ in range(self.z_dim_last)]).T)
    hparams_string = "/".join([
        "num_elements" + str(self.num_elements),
        "n_flows" + str(self.n_flows),
        "z_dim" + str(self.z_dim_last),
        "unsupervised",
        "lr" + str(self.lr),
        "ladder" + str(self.ladder),
        self.flavour
    ])
    images_path = self.images_path + "/" + hparams_string + "/gaussian_percentiles/" \
                  + self.prior_dist + "/"
    if verbose > 0:
        print("GENERATING SS DGM IMAGES AT", images_path)
    print("image path:", images_path)
    create_missing_folders(images_path)
    self.plot_z_stats(xs_grid, generate="/ugp_generated/" + self.prior_dist + "/",
                      path=images_path, max=max)
    grid = torch.Tensor(xs_grid).to(device)
    new_x = torch.stack([self.sample(g.view(1, -1)) for g in grid])
    if len(self.input_shape) > 1:
        images = new_x.view(-1, self.input_shape[0], self.input_shape[1],
                            self.input_shape[2]).data
        # The grid holds n**2 samples, laid out as an n x n image grid.
        assert images.shape[0] == n ** 2
        images_grid = tv.utils.make_grid(images, int(np.sqrt(images.shape[0])))
        create_missing_folders(images_path)
        tv.utils.save_image(images_grid,
                            images_path + str(self.epoch) + self.dataset_name
                            + "gaussian_uniform_generated.png")
        del images_grid, images
    del new_x, xs_grid
def translate(self, geo_id, labelled, old_ids='entrezgene_trans_name',
              new_ids='uniprot_gn', load=True):
    import subprocess
    translation_destination = "/".join([self.data_folder_path, "translation_results"]) + "/"
    self.translation_destination = translation_destination
    dictionary_path = self.dictionary_path = "/".join([self.data_folder_path, "dictionaries"])
    create_missing_folders(translation_destination)
    create_missing_folders(dictionary_path)
    filename = geo_id + "_" + old_ids + '.txt'
    output_file = geo_id + "_" + old_ids + "2" + new_ids + ".txt"
    output_path = translation_destination + "/" + output_file
    if filename not in os.listdir(translation_destination) or load is False:
        print("new file in " + translation_destination)
        with open(translation_destination + "/" + filename, "w") as f:
            for id in list(self.meta_df[labelled].index):
                f.write(str(id) + "\n")
    if output_file not in os.listdir(translation_destination):
        print("Translating", geo_id, "from", old_ids, "to", new_ids, "...")
        call = ["./biomart_api.R", translation_destination, geo_id, old_ids, new_ids]
        subprocess.call(call)
    else:
        print("The file", output_file, "was found in", translation_destination)
    file = open(output_path, "r")
    names_translations = np.loadtxt(file, dtype=str, delimiter=";")
    try:
        assert len(names_translations) > 0
    except AssertionError:
        print("There is no translation to show")
    return names_translations
def translate(self, geo_id, old_ids='refseq_mrna', new_ids='uniprot_gn', load=False):
    import subprocess
    translation_destination = "/".join([self.data_folder_path, "translation_results"]) + "/"
    self.translation_destination = translation_destination
    dictionary_path = self.dictionary_path = "/".join([self.data_folder_path, "dictionaries"])
    create_missing_folders(translation_destination)
    create_missing_folders(dictionary_path)
    filename = geo_id + "_" + old_ids + '.txt'
    # Rewrite the id list from scratch; open(..., "w") already creates or
    # truncates the file, so the original remove/mknod dance is unnecessary.
    with open(translation_destination + "/" + filename, "w") as f:
        for id in list(self.meta_df.index):
            f.write(id + "\n")
    translation_filename = old_ids + "2" + new_ids + "_" + geo_id + ".txt.npy"
    self.translation_results[geo_id] = translation_destination
    create_missing_folders(self.translation_results[geo_id])
    # The original tested `translation_filename not in translation_destination`,
    # a substring check on the path string; the directory listing is intended.
    if translation_filename not in os.listdir(translation_destination) or load is False:
        print("Translating", geo_id, "from", old_ids, "to", new_ids, "...")
        call = ["./biomart_api.R", translation_destination, geo_id, old_ids, new_ids]
        subprocess.call(call)
    output_file = translation_destination + "/" + geo_id + "_" + old_ids + "2" + new_ids + ".txt"
    file = open(output_file, "r")
    names_translated = file.readlines()
    print(names_translated)
    return names_translated
def plot_performance(values, labels, n_list=None, results_path="~", filename="NoName"):
    fig2, ax21 = plt.subplots()
    ax21.plot(values["train"], 'b-', label='Train:' + str(len(labels["train"])))
    ax21.plot(values["valid"], 'g-', label='Valid:' + str(len(labels["valid"])))
    # NOTE: this re-plots the validation curve under a "Test" label; a test
    # history was presumably intended here.
    ax21.plot(values["valid"], 'r-', label='Test:' + str(len(labels["valid"])))
    ax21.set_xlabel('epochs')
    ax21.set_ylabel('Accuracy')
    handles, labels = ax21.get_legend_handles_labels()
    ax21.legend(handles, labels)
    ax22 = ax21.twinx()
    if n_list is not None:
        for i, n in enumerate(n_list):
            ax22.plot(n_list[i], '--', label="Hidden Layer " + str(i))
    ax22.set_ylabel('#Neurons')
    handles, labels = ax22.get_legend_handles_labels()
    ax22.legend(handles, labels)
    fig2.tight_layout()
    # Create the destination folder before saving into it (the original
    # called savefig first, which fails when the folder does not yet exist).
    create_missing_folders(results_path + "/plots/hnet/")
    pylab.savefig(results_path + "/plots/hnet/" + filename)
    plt.close()
def set_configs(self, home_path, results_folder="results", data_folder="data",
                destination_folder="hebbian_learning_ann", dataset_name="GSE33000",
                meta_destination_folder="meta_pandas_dataframes",
                csv_filename="csv_loggers", lr=1e-3):
    # Hyper-parameters
    self.lr = lr

    # File names
    self.dataset_name = dataset_name
    self.filename = dataset_name + '_history'
    self.csv_filename = csv_filename

    # Folder names
    self.results_folder = results_folder
    self.destination_folder = destination_folder
    self.data_folder = data_folder
    self.meta_destination_folder = meta_destination_folder

    # Paths
    self.home_path = home_path
    self.results_path = "/".join([self.home_path, self.destination_folder, self.results_folder])
    self.models_path = "/".join([self.results_path, "models"])
    self.model_history_path = self.models_path + "/history/"
    self.csv_logger_path = "/".join([self.results_path, csv_filename])
    self.data_folder_path = "/".join([home_path, self.destination_folder, self.data_folder])
    self.meta_destination_path = "/".join([self.data_folder_path, self.meta_destination_folder])
    create_missing_folders(self.csv_logger_path)
    create_missing_folders(self.models_path)
    create_missing_folders(self.meta_destination_path)
    create_missing_folders(self.model_history_path)

    # Empty lists
    self.accuracy_training_array = []
    self.accuracy_valid_array = []
    self.losses_training_array = []
    self.losses_valid_array = []
    self.max_valid_accuracies = []
    self.max_valid_epochs = []
    self.min_valid_loss = []
    self.min_valid_loss_epochs = []

    # Empty objects
    self.model = None
    self.x_test = None
    self.y_test = None
    self.x_train = None
    self.y_train = None
    self.meta_df = None
    self.epoch = 0
    self.num_classes = None
    self.init = None
    self.batch_size = None
    self.nrep = None
    self.classes_train = None
    self.classes_test = None
def ordination2d(data_frame, ord_type, images_folder_path, dataset_name, epoch, a=0.4,
                 verbose=0, info="none", show_images=True, df_valid=None, df_test=None, n=4):
    import pandas as pd
    import numpy as np
    pc1 = 'Component_1'
    pc2 = 'Component_2'
    type_images_folder_path = "/".join([images_folder_path, str(ord_type), str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"
    create_missing_folders(type_images_folder_path)
    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except AssertionError:
        print("The data object passed to ordination2d must be a "
              "pandas.core.frame.DataFrame. Returning without producing a plot.")
        print(type(data_frame))
        return
    if type(dataset_name) == list:
        dataset_name = "_".join(dataset_name)
    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    data_frame.values[np.isnan(data_frame.values)] = 0
    ordination = None
    ys = False  # does the ordination need the class labels (supervised)?
    if ord_type in ["pca", "PCA"]:
        ordination = PCA(n_components=2)
    elif ord_type in ["kpca", "KPCA"]:
        ordination = KernelPCA(n_components=2, kernel="rbf")
    elif ord_type in ["tsne", "tSNE", "TSNE", "t-sne", "T-SNE", "t-SNE"]:
        ordination = TSNE(n_components=2, verbose=verbose)
    elif ord_type in ["lda", "LDA", "flda", "FLDA"]:
        ys = True
        ordination = LDA(n_components=2)
    elif ord_type in ["qda", "QDA"]:
        ys = True
        ordination = QDA()
    else:
        print(ord_type)
        exit("No ordination of that name is implemented. Exiting...")
    pcs_valid = None
    pcs_test = None
    if ys:
        principal_components = ordination.fit_transform(np.transpose(data_frame.values), y=y)
        if df_valid is not None:
            pcs_valid = ordination.transform(df_valid.values)
            pcs_valid = pd.DataFrame(data=pcs_valid,
                                     columns=['principal component 1', 'principal component 2'])
            y_valid = df_valid.columns
            pcs_valid = pd.concat([pcs_valid, pd.DataFrame(y_valid)], axis=1)
            pcs_test = ordination.transform(df_test.values)
            pcs_test = pd.DataFrame(data=pcs_test,
                                    columns=['principal component 1', 'principal component 2'])
            # Use df_test's own labels and transform only once (the original
            # took the labels from df_valid and re-transformed the
            # already-projected components).
            y_test = df_test.columns
            pcs_test = pd.concat([pcs_test, pd.DataFrame(y_test)], axis=1)
    else:
        principal_components = ordination.fit_transform(np.transpose(data_frame.values))
    # The loading arrows below only apply to plain PCA, where components,
    # explained variance and feature means are available.
    coeff = None
    if ord_type == "pca":
        ev = ordination.explained_variance_ratio_
        means = ordination.mean_
        if np.any(means < 0):
            means = means - min(means)
        means_ratio = means / np.sum(np.sum(means, axis=0)) * 100
        coeff = np.transpose(ordination.components_)
        order_importance = list(reversed(np.argsort(means)))
        coeff, means_ratio = coeff[order_importance], means_ratio[order_importance]
        factors = np.array(data_frame.index)[order_importance]
        x = list(range(len(factors)))
        plt.xlabel("Initial Features")
        plt.ylabel("% of variance explained")
        plt.title("% of the variance is explained by the initial features (Total:"
                  + str(np.round(np.sum(ev) * 100, 2)) + ")")
        plt.xticks([x[0]], [factors[0]], rotation=45, fontsize=8)
        plt.plot(means_ratio)
        plt.tight_layout()
        plt.savefig(type_images_folder_path + info + "_" + str(epoch) + "_var_explained_2D.png",
                    dpi=100)
        print("plot at", type_images_folder_path)
    principal_df = pd.DataFrame(data=principal_components,
                                columns=['principal component 1', 'principal component 2'])
    final_df = pd.concat([principal_df, pd.DataFrame(y)], axis=1)
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    # Only ordinations that expose explained variance can annotate the axes
    # (the original substring test `ord_type not in "kpca"` misfired for
    # "pca" and crashed for t-SNE).
    if hasattr(ordination, "explained_variance_ratio_"):
        ev = ordination.explained_variance_ratio_
        if len(ev) > 1:
            pc1 = pc1 + ': ' + str(np.round(ev[0] * 100, decimals=2)) + "%"
            pc2 = pc2 + ': ' + str(np.round(ev[1] * 100, decimals=2)) + "%"
    ax.set_xlabel(pc1, fontsize=15)
    ax.set_ylabel(pc2, fontsize=15)
    ax.set_title('2 component Ordination', fontsize=20)
    # colors = cm.viridis(np.linspace(0, 1, len(classes_list)))
    colors = ["g", "b", "k", "r"]
    if coeff is not None:
        print("coeff shape", coeff.shape)
        if len(coeff) < n:
            n = len(coeff)
    for t, target in enumerate(classes_list):
        indices_to_keep = list(final_df[0] == target)
        data1 = final_df.loc[indices_to_keep, 'principal component 1']
        data2 = final_df.loc[indices_to_keep, 'principal component 2']
        try:
            assert np.sum(np.isnan(data1)) == 0 and np.sum(np.isnan(data2)) == 0
        except AssertionError:
            print("NaNs were detected. Please verify the DataFrame...")
            exit()
        ellipse_data(data1, data2, ax, colors[t])
        ax.scatter(data1, data2, s=10, alpha=a, c=colors[t])
    if coeff is not None:
        labels = factors
        for i in range(n):
            plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
            if labels is None:
                plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15,
                         "Var" + str(i + 1) + str(np.round(means_ratio[i], 2)),
                         color='g', ha='center', va='center')
            else:
                plt.text(coeff[i, 0] * 1.15, coeff[i, 1] * 1.15,
                         str(labels[i]) + str(np.round(means_ratio[i], 2)),
                         color='g', ha='center', va='center')
    ax.legend(classes_list)
    ax.grid()
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, classes_list)
    # For the validation/test overlays, mask by each frame's own label column
    # (the original reused final_df's training mask).
    if pcs_valid is not None:
        for t, target in enumerate(classes_list):
            indices_to_keep = list(pcs_valid[0] == target)
            data1 = pcs_valid.loc[indices_to_keep, 'principal component 1']
            data2 = pcs_valid.loc[indices_to_keep, 'principal component 2']
            try:
                assert np.sum(np.isnan(data1)) == 0 and np.sum(np.isnan(data2)) == 0
            except AssertionError:
                print("NaNs were detected. Please verify the DataFrame...")
                exit()
            ellipse_data(data1, data2, ax, colors[t])
            ax.scatter(data1, data2, s=10, alpha=a)
        ax.legend(classes_list)
        ax.grid()
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, classes_list)
    if pcs_test is not None:
        for t, target in enumerate(classes_list):
            indices_to_keep = list(pcs_test[0] == target)
            data1 = pcs_test.loc[indices_to_keep, 'principal component 1']
            data2 = pcs_test.loc[indices_to_keep, 'principal component 2']
            try:
                assert np.sum(np.isnan(data1)) == 0 and np.sum(np.isnan(data2)) == 0
            except AssertionError:
                print("NaNs were detected. Please verify the DataFrame...")
                exit()
            ellipse_data(data1, data2, ax, colors[t])
            ax.scatter(data1, data2, s=10, alpha=a)
        ax.legend(classes_list)
        ax.grid()
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, classes_list)
    try:
        plt.tight_layout()
        fig.tight_layout()
    except Exception:
        pass
    plt.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png", dpi=100)
    if show_images:
        plt.show()
    plt.close(fig)
def __init__(self, home_path, geo_ids, unlabelled_geo_ids=None, bad_geo_ids=None,
             results_folder='results', data_folder='data', destination_folder='annleukemia',
             dataframes_folder="dataframes", is_translate=True, silent=False):
    """
    The learning rate is "initial" because it can change automatically as the
    epochs go when a SCHEDULER is used; for example, ReduceLROnPlateau reduces
    the learning rate (lr) when the results have not improved for a number of
    iterations specified by the user.

    Advantages:
        1- Can start learning very fast and then do fine tuning.
    Trade-offs:
        - Too high: the accuracy will most likely reach its optimum faster,
          but might not be as good as with a smaller LR.
        - Too small: the accuracy will most likely reach a better optimum, but
          might take quite long to get there (if too low, it might seem like
          the model is not learning; lower still and it might not learn
          anything at all).
    Pitfalls:
        1- LR reduction too frequent or too large (same problem as an LR too SMALL)
        2- LR reduction not fast enough or not large enough (same problem as an LR too HIGH)

    Example:
        destination_folder = "/Users/simonpelletier/data/annleukemia"
        initial_lr = 1e-3
        init = "he_uniform"
        n_epochs = 2
        batch_size = 128
        hidden_size = 128
        translate = True
        silent = False
    """
    self.is_translate = is_translate
    self.silent = silent
    self.df = {}
    self.meta_df = {True: None, False: None}
    self.unlabelled_df = {}
    self.files_path = {}
    self.meta_file_path = None

    # DATASET IDS
    self.geo_ids = geo_ids

    # FOLDER NAMES
    self.data_folder = data_folder
    self.dataframes_folder = dataframes_folder
    self.destination_folder = destination_folder
    self.results_folder = results_folder
    self.unlabelled_geo_ids = unlabelled_geo_ids
    self.bad_geo_ids = bad_geo_ids
    # if self.unlabelled_geo_ids is not None:
    #     self.labelled_dict = dict(zip(zip(geo_ids, unlabelled_geo_ids),
    #                                   zip([[True] * len(geo_ids)], [[False] * len(unlabelled_geo_ids)])))

    # PATHS
    self.home_path = home_path
    self.data_folder_path = "/".join([self.home_path, self.destination_folder, self.data_folder]) + "/"
    self.results_folder_path = "/".join([self.home_path, self.destination_folder, self.results_folder]) + "/"
    self.dataframes_path = "/".join([self.data_folder_path, self.dataframes_folder]) + "/"
    self.translation_dict_path = "/".join([self.results_folder_path, "dictionaries"]) + "/"
    self.soft_path = self.data_folder_path + "/softs/"
    create_missing_folders(self.translation_dict_path)
    self.translation_results = {}

    # Set later (last added)
    self.meta_destination_folder = None
    self.meta_data_folder_path = None
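# The docstring above discusses scheduler-driven learning-rate reduction; a
# minimal sketch of the ReduceLROnPlateau pattern it refers to (the model,
# optimizer and loss below are hypothetical stand-ins):
import torch

_model = torch.nn.Linear(10, 2)
_optimizer = torch.optim.Adam(_model.parameters(), lr=1e-3)
_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    _optimizer, mode="min", factor=0.1, patience=10)
for _epoch in range(30):
    _valid_loss = 1.0 / (_epoch + 1)   # stand-in for a real validation loss
    _scheduler.step(_valid_loss)       # lr is reduced when the loss plateaus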
def merge_datasets(self, labelled, fill_missing=True, load_from_disk=False,
                   meta_destination_folder="meta_pandas_dataframes"):
    """
    :param labelled: merge the labelled (df) or unlabelled (unlabelled_df) datasets
    :param fill_missing: if True, missing rows are filled with 0s for all samples of
        that dataset; the algorithm might still do well without some information.
        Otherwise, the intersected gene list might get very small.
    :param load_from_disk: load a previously merged dataframe if one exists
    :param meta_destination_folder: folder for the merged dataframe

    Example:
        from utils import get_example_datasets, create_missing_folders
        fill_missing = True
        geo_ids = ["GSE12417", "GSE22845"]
        g = get_example_datasets(geo_ids, home_path="/home/simon/", load_from_disk=True)
        g.get_geo(geo_ids, load_from_disk=True)
        g.merge_datasets(fill_missing=True)
    """
    print("Preparing for merging the selected datasets... labelled:", labelled)
    import os
    import pandas as pd
    import numpy as np
    if labelled:
        dataframe = self.df
        geo_ids = list(self.df.keys())
    else:
        dataframe = self.unlabelled_df
        geo_ids = list(self.unlabelled_df.keys())
    self.meta_destination_folder = meta_destination_folder + "_labelled" + str(labelled)
    self.meta_data_folder_path = "/".join([self.data_folder_path, meta_destination_folder])
    create_missing_folders(self.meta_data_folder_path)
    meta_filename = "_".join(geo_ids) + ".pickle.npy"
    count = 0
    n_samples_list = [len(dataframe[geo_id].columns) for geo_id in geo_ids]
    total_iteration = -n_samples_list[0]
    meta_file = search_meta_file_name(meta_filename,
                                      list_meta_files=os.listdir(self.meta_data_folder_path))
    if meta_file is None or load_from_disk is False:
        for i in range(len(n_samples_list)):
            for j in range(i, len(n_samples_list)):
                total_iteration += n_samples_list[j]
        for g, geo_id in enumerate(geo_ids):
            print("merging file:", g + 1, "/", len(geo_ids))
            if g == 0:
                meta_df = dataframe[geo_id]
            elif fill_missing:
                # Outer join on row names; absent entries become NaN and are
                # handled downstream.
                meta_df = pd.concat((meta_df, dataframe[geo_id]), axis=1, sort=True)
        try:
            assert len(meta_df.index) == len(set(meta_df.index))
        except AssertionError:
            print("CONTAINS DUPLICATED ROWNAMES")
        print(meta_df.shape)
        print("Saving files...")
        self.meta_filename = meta_filename
        self.meta_file_path = '/'.join([self.meta_data_folder_path, self.meta_filename])
        self.meta_df[labelled] = meta_df
        self.meta_df[labelled].to_pickle(self.meta_file_path)
        # self.meta_df[labelled].to_csv(self.meta_file_path + ".csv")
    else:
        print("Loading file...")
        self.meta_filename = meta_file
        self.meta_file_path = '/'.join([self.meta_data_folder_path, self.meta_filename])
        self.meta_df[labelled] = pd.read_pickle(self.meta_file_path)
        print("Merged sets loaded.")
    return self.meta_df[labelled]
def build_dataframe(self, geo_id, labelled, bad_example, automatic_attribute, save_to_disk=True):
    """
    Build a dataframe for a GEO dataset; the labels are found in the metadata
    of the merged object.

    :param geo_id: ID found on NCBI's database
        EXAMPLE: GSE12417 -> https://www.ncbi.nlm.nih.gov/Geo/query/acc.cgi?acc=GSE12417
    :param labelled: whether the dataset carries labels
    :param bad_example: build the "bad" variant of the dataframe
    :param automatic_attribute: attribute names to select without prompting, or False
    :param save_to_disk: pickle the resulting dataframe

    EXAMPLE:
        g = get_example_datasets(geo_ids=["GSE12417", "GSE22845"],
                                 home_path="/Users/simonpelletier/", load_from_disk=True)
        g.get_geo(geo_ids, load_from_disk=load_from_disk)
    """
    create_missing_folders(self.soft_path)
    gse = Geo.GEOparse.get_GEO(geo=geo_id, destdir=self.soft_path, silent=self.silent)
    gsm_on_choices = list(gse.gsms[list(gse.gsms.keys())[0]].columns.index)
    gpl_on_choices = list(gse.gpls[list(gse.gpls.keys())[0]].columns.index)
    print(str(len(gsm_on_choices)) + " choices are available for GSM")
    gsm_on_selection = 0  # gsm_on_selection = get_user_int(gsm_on_choices)
    gsm_on = gsm_on_choices[gsm_on_selection]
    print(str(len(gpl_on_choices)) + " choices are available for GPL. You must select:")
    print("1 - An annotation for GPL")
    print("2 - (optional) The annotation you want the row names to take")
    gpl_on_selection = 0  # gpl_on_selection = get_user_int(gpl_on_choices)
    gpl_on = gpl_on_choices[gpl_on_selection]
    val_selection = None
    if automatic_attribute is False:
        val_selection = get_user_int(gpl_on_choices)
    else:
        self.attribute = automatic_attribute
        for attribute in automatic_attribute:
            try:
                val_selection = gpl_on_choices.index(attribute)
            except ValueError:
                pass
    if val_selection is None:
        exit("Selection not found " + str(automatic_attribute) + str(gpl_on_choices))
    val = gpl_on_choices[val_selection]
    merged_values = gse.merge_and_average(gse.gpls[next(iter(gse.gpls))], "VALUE", val,
                                          gpl_on_choices, gpl_on=gpl_on, gsm_on=gsm_on)
    merged_values.values[np.isnan(merged_values.values)] = 0
    self.merge_len = merged_values.shape[1]
    if labelled:
        self.df[geo_id] = merged_values
        meta_dict = self.make_metadata_matrix(gse, merged_values)
        labels, meta_dict = self.rename_according_to_metadata(gse, meta_dict)
        labels = ["".join(label) for label in labels]
        labels = rename_labels(labels)
        labels = rename(labels)
        # Check for duplicates BEFORE assigning the columns; assigning a
        # longer label list would raise first (the original assigned, then
        # checked).
        if len(labels) > merged_values.shape[1]:
            prompt = input("Duplicates were detected. Do you want to keep only the first "
                           "labels? (only say yes if you are sure, or the results could be "
                           "wrong) [y/n] ")
            print("Labels", len(labels))
            print("Values", merged_values.shape)
            if prompt == "y":
                labels = labels[:merged_values.shape[1]]
            else:
                exit()
        self.df[geo_id].columns = labels
    else:
        self.unlabelled_df[geo_id] = merged_values
        self.unlabelled_df[geo_id].columns = ["no_label"] * len(self.unlabelled_df[geo_id].columns)
    if save_to_disk:
        create_missing_folders(path=self.dataframes_path)
        if not bad_example:
            self.files_path[geo_id] = (self.dataframes_path + '/' + geo_id + "_labelled"
                                       + str(labelled) + '_dataframe')
        else:
            self.files_path[geo_id] = (self.dataframes_path + '/' + geo_id + "_labelled"
                                       + str(labelled) + '_bad_dataframe')
        print("Saving to " + self.files_path[geo_id])
        if labelled:
            # Pickled files are faster to load than csv.
            self.df[geo_id].to_pickle(self.files_path[geo_id] + '.pickle.npy')
        else:
            self.unlabelled_df[geo_id].to_pickle(self.files_path[geo_id] + '.pickle.npy')
    return merged_values
def ordination2d(data_frame, ORD=PCA,
                 images_folder_path="/home/simon/results/hebbian_learning_ann/plots/",
                 filename="pca", a=0.5):
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    type_images_folder_path = images_folder_path + filename + "/"
    create_missing_folders(type_images_folder_path)
    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except AssertionError:
        print("The data object passed to ordination2d must be a "
              "pandas.core.frame.DataFrame. Returning without producing a plot.")
        return
    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    pca = ORD(n_components=2)
    data_frame.values[np.isnan(data_frame.values)] = 0
    # Samples are columns, so transpose before fitting, as the other
    # ordination2d variants do; without it the class labels would not line
    # up with the projected rows.
    principal_components = pca.fit_transform(np.transpose(data_frame.values))
    principal_df = pd.DataFrame(data=principal_components,
                                columns=['principal component 1', 'principal component 2'])
    final_df = pd.concat([principal_df, pd.DataFrame(y)], axis=1)
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)
    for target in classes_list:
        indices_to_keep = final_df[0] == target
        data1 = final_df.loc[indices_to_keep, 'principal component 1']
        data2 = final_df.loc[indices_to_keep, 'principal component 2']
        try:
            assert np.sum(np.isnan(data1)) == 0 and np.sum(np.isnan(data2)) == 0
        except AssertionError:
            print("NaNs were detected. Please verify the DataFrame...")
            exit()
        ellipse_data(data1, data2, ax)
        ax.scatter(data1, data2, s=20, alpha=a, linewidths=0, edgecolors='none')
    ax.legend(classes_list)
    ax.grid()
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, classes_list)
    plt.tight_layout()
    fig.tight_layout()
    fig.savefig(type_images_folder_path + "PCA2d" + filename + ".png")
    plt.close(fig)
def merge_datasets(self, fill_missing=True, is_load_from_disk=True,
                   meta_destination_folder="meta_pandas_dataframes"):
    """
    :param fill_missing: if True, missing rows are filled with 0s for all samples of
        that dataset; the algorithm might still do well without some information.
        Otherwise, the intersected gene list might get very small.

    Example:
        from utils import get_example_datasets, create_missing_folders
        fill_missing = True
        geo_ids = ["GSE12417", "GSE22845"]
        g = get_example_datasets(geo_ids, home_path="/home/simon/", is_load_from_disk=True)
        g.getGEO(geo_ids, is_load_from_disk=True)
        g.merge_datasets(fill_missing=True)
    """
    print("Preparing for merging the selected datasets...")
    import os
    import pandas as pd
    import numpy as np
    self.meta_destination_folder = meta_destination_folder
    self.meta_data_folder_path = "/".join([self.data_folder_path, meta_destination_folder])
    create_missing_folders(self.meta_data_folder_path)
    meta_filename = "_".join(self.geo_ids)
    count = 0
    n_samples_list = [len(self.df[geo_id].columns) for geo_id in self.geo_ids]
    total_iteration = -n_samples_list[0]
    meta_file = search_meta_file_name(meta_filename,
                                      list_meta_files=os.listdir(self.meta_data_folder_path))
    if meta_file is None or is_load_from_disk is False:
        for i in range(len(n_samples_list)):
            for j in range(i, len(n_samples_list)):
                total_iteration += n_samples_list[j]
        for g, geo_id in enumerate(self.geo_ids):
            if g == 0:
                meta_df = self.df[geo_id]
                continue
            if fill_missing:
                # FIRST find which rows are not already in the meta_df
                union_rows = np.union1d(self.df[geo_id].index, meta_df.index)
                all_cols = np.concatenate((self.df[geo_id].columns, meta_df.columns))
                new_df = pd.DataFrame(0.00000, index=union_rows, columns=all_cols)
                in1d_rows = np.in1d(new_df.index, self.df[geo_id].index)
                in1d_rows_meta = np.in1d(new_df.index, meta_df.index)
                ind_df_rows = np.array(list(range(len(in1d_rows))))[in1d_rows]
                ind_meta_rows = np.array(list(range(len(in1d_rows_meta))))[in1d_rows_meta]
                n_df_cols = len(self.df[geo_id].columns)
                # SPEED BOTTLENECK
                # TODO Is this optimal? (see the vectorized sketch below)
                for c, col in enumerate(self.df[geo_id].columns):
                    count += 1
                    if count % 10 == 0:
                        print("Merge progress {:2.0%}".format(count / total_iteration), end="\r")
                    for r, row in enumerate(ind_df_rows):
                        new_df.values[row, c] = self.df[geo_id].values[r, c]
                # meta_df's columns sit after the new dataset's columns in
                # all_cols, so they must be written at an offset (the original
                # wrote them at c, overwriting the first dataset's columns).
                for c, col in enumerate(meta_df.columns):
                    count += 1
                    if count % 10 == 0:
                        print("Merge progress {:2.0%}".format(count / total_iteration), end="\r")
                    for r, row in enumerate(ind_meta_rows):
                        new_df.values[row, c + n_df_cols] = meta_df.values[r, c]
            else:
                df_intersection_meta = np.in1d(self.df[geo_id].index, meta_df.index)
                meta_intersection_df = np.in1d(meta_df.index, self.df[geo_id].index)
                tmp = self.df[geo_id][df_intersection_meta]
                meta_df = meta_df[meta_intersection_df]
                new_df = pd.concat([tmp, meta_df], axis=1)
            # Carry the merge forward so a third (and later) dataset is merged
            # into the accumulated frame, not just the first one.
            meta_df = new_df
        try:
            # The original checked self.meta_df here, which is not assigned
            # yet; the freshly merged frame is intended.
            assert len(new_df.index) == len(set(new_df.index))
        except AssertionError:
            print("CONTAINS DUPLICATED ROWNAMES")
        self.meta_filename = meta_filename
        self.meta_file_path = '/'.join([self.meta_data_folder_path, self.meta_filename])
        self.meta_df = new_df
        self.meta_df.to_pickle(self.meta_file_path)
    else:
        self.meta_filename = meta_file
        self.meta_file_path = '/'.join([self.meta_data_folder_path, self.meta_filename])
        self.meta_df = pd.read_pickle(self.meta_file_path)
        print("Merged dataset imported from disk.")
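# The nested copy loops above are flagged as a speed bottleneck; under the same
# fill_missing semantics (union of row names, absent entries filled with 0) the
# merge can be sketched with plain pandas -- an alternative technique, not the
# code path this class actually takes:
import pandas as pd

_a = pd.DataFrame({"sample1": [1.0, 2.0]}, index=["gene1", "gene2"])
_b = pd.DataFrame({"sample2": [3.0]}, index=["gene2"])
_merged = pd.concat([_a, _b], axis=1, sort=True).fillna(0.0)
# _merged has the union of rows; rows missing from one set are 0, as with fill_missing.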
def ordination2d(data_frame, ord_type, images_folder_path, dataset_name, epoch, a=0.5,
                 verbose=0, info="none", show_images=True):
    import pandas as pd
    import numpy as np
    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except AssertionError:
        print("The data object passed to ordination2d must be a "
              "pandas.core.frame.DataFrame. Returning without producing a plot.")
        print(type(data_frame))
        return
    if type(dataset_name) == list:
        dataset_name = "_".join(dataset_name)
    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    data_frame.values[np.isnan(data_frame.values)] = 0
    ordination = None
    ys = False  # does the ordination need the class labels (supervised)?
    if ord_type in ["pca", "PCA"]:
        ordination = PCA(n_components=2)
    elif ord_type in ["tsne", "tSNE", "TSNE", "t-sne", "T-SNE", "t-SNE"]:
        ordination = TSNE(n_components=2, verbose=verbose)
    elif ord_type in ["lda", "LDA", "flda", "FLDA"]:
        ys = True
        ordination = LDA(n_components=2)
    elif ord_type in ["qda", "QDA"]:
        ys = True
        ordination = QDA()
    else:
        exit("No ordination of that name is implemented. Exiting...")
    # Supervised ordinations (LDA/QDA) need the labels; the unsupervised ones
    # must not receive them (the original had these two branches swapped).
    if ys:
        principal_components = ordination.fit_transform(np.transpose(data_frame.values), y=y)
    else:
        principal_components = ordination.fit_transform(np.transpose(data_frame.values))
    principal_df = pd.DataFrame(data=principal_components,
                                columns=['principal component 1', 'principal component 2'])
    final_df = pd.concat([principal_df, pd.DataFrame(y)], axis=1)
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component Ordination', fontsize=20)
    colors = cm.viridis(np.linspace(0, 1, len(classes_list)))
    for t, target in enumerate(classes_list):
        indices_to_keep = list(final_df[0] == target)
        data1 = final_df.loc[indices_to_keep, 'principal component 1']
        data2 = final_df.loc[indices_to_keep, 'principal component 2']
        try:
            assert np.sum(np.isnan(data1)) == 0 and np.sum(np.isnan(data2)) == 0
        except AssertionError:
            print("NaNs were detected. Please verify the DataFrame...")
            exit()
        ellipse_data(data1, data2, ax, colors[t])
        ax.scatter(data1, data2, s=20, alpha=a, linewidths=0.5, edgecolor='k', c=colors[t])
    ax.legend(classes_list)
    ax.grid()
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, classes_list)
    try:
        plt.tight_layout()
        fig.tight_layout()
    except Exception:
        pass
    type_images_folder_path = "/".join([images_folder_path, str(ord_type), str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"
    create_missing_folders(type_images_folder_path)
    plt.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png", dpi=100)
    if show_images:
        plt.show()
    plt.close(fig)
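# A hypothetical call to the ordination2d variant above, assuming samples are
# columns (named by class label) and features are rows, as its transposes imply:
import numpy as np
import pandas as pd

_toy = pd.DataFrame(np.random.randn(50, 6),
                    columns=["A", "A", "A", "B", "B", "B"])
ordination2d(_toy, "pca", images_folder_path="/tmp/plots",
             dataset_name="toy", epoch=0, show_images=False)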