Example no. 1
    def display_reconstruction(self, data, reconstruction):
        self.eval()
        print("GENERATING RECONSTRUCTION IMAGES autoencoder!")
        hparams_string = "/".join([
            "num_elements" + str(self.num_elements),
            "n_flows" + str(self.n_flows), "z_dim" + str(self.z_dim_last),
            "unsupervised", "lr" + str(self.lr), "ladder" + str(self.ladder),
            self.flavour
        ])
        x = data.view(-1, self.input_shape[0], self.input_shape[1],
                      self.input_shape[2]).data
        x_grid = tv.utils.make_grid(x)
        x_recon = reconstruction.view(-1, self.input_shape[0],
                                      self.input_shape[1],
                                      self.input_shape[2]).data
        x_recon_grid = tv.utils.make_grid(x_recon)
        images_path = self.images_path + hparams_string + "/recon/" + self.prior_dist + "/"
        print("Images location:", images_path)

        create_missing_folders(images_path)
        tv.utils.save_image(
            x_grid, images_path + "original_" + str(self.epoch) + ".png")
        tv.utils.save_image(
            x_recon_grid,
            images_path + "reconstruction_example_" + str(self.epoch) + ".png")
Example no. 2
    def generate_random(self, max=1000):
        self.eval()
        print("GENERATING RANDOM IMAGES autoencoder!")
        hparams_string = "/".join([
            "num_elements" + str(self.num_elements),
            "n_flows" + str(self.n_flows), "z_dim" + str(self.z_dim_last),
            "unsupervised", "lr" + str(self.lr), "ladder" + str(self.ladder),
            self.flavour, self.prior_dist
        ])
        images_path = self.images_path + hparams_string + "/generated_random/" + self.prior_dist + "/"
        create_missing_folders(images_path)

        rand_z = torch.randn(self.batch_size, self.z_dim_last).cuda()
        self.plot_z_stats(rand_z.detach().cpu().numpy(),
                          generate="/random_generated/" + self.prior_dist +
                          "/",
                          path=images_path,
                          max=max)
        new_x = self.sample(rand_z)
        if len(self.input_shape) > 1:
            images = new_x.view(-1, self.input_shape[0], self.input_shape[1],
                                self.input_shape[2]).data
            images_grid = tv.utils.make_grid(images)
            print("Images location:", images_path)
            tv.utils.save_image(
                images_grid, images_path + str(self.epoch) +
                self.dataset_name + "generated.png")
            del images_grid, images
        del rand_z, new_x, images_path, hparams_string
Example no. 3
    def loadGEO(self, geo_id='GSE22845', dataframes_folder="dataframes"):
        """

        :param geo_id:
        :return:

        Example:
        from debug.get_parameters import *

        dataframes_folder = "/Users/simonpelletier/data/hebbian_learning_ann"
        g = geoParser(destination_folder=data_destination)
        g.getGEO(geo_ids,is_load_from_disk)

        """
        import pandas as pd
        import numpy as np
        flag = False
        print('Loading ' + geo_id + ' ...')

        self.df_file_name = geo_id + '_dataframe.pickle.npy'
        create_missing_folders(self.dataframes_path)
        current_directory_list = os.listdir(self.dataframes_path)
        if self.df_file_name in current_directory_list:
            print("File found at location:",self.data_folder_path + "/" + self.df_file_name)
            self.df[geo_id] = pd.read_pickle(self.dataframes_path+"/"+self.df_file_name)

            if sum(sum(np.isnan(self.df[geo_id].values)).tolist()) > 0:
                print("Nans found. They are all replaced by 0")
                self.df[geo_id][np.isnan(self.df[geo_id] )] = 0

        else:
            print(self.df_file_name ,' NOT FOUND in ', self.dataframes_path)
            print(current_directory_list)
            flag = True
        return flag
Example no. 4
def plot_performance(loss_total, accuracy, labels, results_path, filename="NoName", verbose=0, std_loss=None, std_accuracy=None):
    """

    :param loss_total:
    :param loss_labelled:
    :param loss_unlabelled:
    :param accuracy:
    :param labels:
    :param results_path:
    :param filename:
    :param verbose:
    :return:
    """
    fig2, ax21 = plt.subplots()
    n = list(range(len(accuracy["train"])))
    try:
        ax21.plot(loss_total["train"], 'b-', label='Train total loss:' + str(len(labels["train"])))  # plotting t, a separately
        ax21.plot(loss_total["valid"], 'g-', label='Valid total loss:' + str(len(labels["valid"])))  # plotting t, a separately
        #ax21.plot(values["valid"], 'r-', label='Test:' + str(len(labels["valid"])))  # plotting t, a separately
    except:
        ax21.plot(loss_total["train"], 'b-', label='Train total loss:')  # plotting t, a separately
        ax21.plot(loss_total["valid"], 'g-', label='Valid total loss:')  # plotting t, a separately
    if std_loss is not None:
        ax21.errorbar(x=n, y=loss_total["train"], yerr=[np.array(std_loss["train"]), np.array(std_loss["train"])],
                      c="b", label='Train')  # error bars on the training loss
    if std_loss is not None:
        ax21.errorbar(x=n, y=loss_total["valid"], yerr=[np.array(std_loss["valid"]), np.array(std_loss["valid"])],
                      c="g", label='Valid')  # error bars on the validation loss

    ax21.set_xlabel('epochs')
    ax21.set_ylabel('Loss')
    handles, labels = ax21.get_legend_handles_labels()
    ax21.legend(handles, labels)
    ax22 = ax21.twinx()
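    # twinx() adds a second y-axis (accuracy) that shares the same x-axis (epochs) as the loss axis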

    #colors = ["b", "g", "r", "c", "m", "y", "k"]
    # if n_list is not None:
    #    for i, n in enumerate(n_list):
    #        ax22.plot(n_list[i], '--', label="Hidden Layer " + str(i))  # plotting t, a separately
    ax22.set_ylabel('Accuracy')
    ax22.plot(accuracy["train"], 'c--', label='Train')  # plotting t, a separately
    ax22.plot(accuracy["valid"], 'k--', label='Valid')  # plotting t, a separately
    if std_accuracy is not None:
        ax22.errorbar(x=n, y=accuracy["train"], yerr=[np.array(std_accuracy["train"]), np.array(std_accuracy["train"])],
                      c="c", label='Train')  # plotting t, a separately
    if std_accuracy is not None:
        ax22.errorbar(x=n, y=accuracy["valid"], yerr=[np.array(std_accuracy["valid"]), np.array(std_accuracy["valid"])],
                      c="k", label='Valid')  # plotting t, a separately

    handles, labels = ax22.get_legend_handles_labels()
    ax22.legend(handles, labels)

    fig2.tight_layout()
    # pylab.show()
    if verbose > 0:
        print("Performance at ", results_path)
    create_missing_folders(results_path + "/plots/")
    pylab.savefig(results_path + "/plots/" + filename)
    plt.show()
    plt.close()
Example no. 5
    def generate_uniform_gaussian_percentiles(self, epoch=0, verbose=0, show_pca=0, show_lda=0, n=20, drop_na=False):
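        # norm.ppf is the inverse CDF of the standard normal: each latent dimension gets an evenly
        # spaced sweep (n**2 points) across the central 90% of the prior, stacked once per class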
        zs_grid = torch.stack([torch.Tensor(np.vstack([np.linspace(norm.ppf(0.05), norm.ppf(0.95), n**2)
                                                       for _ in range(self.z_dim_last)]).T)
                               for _ in range(self.num_classes)])

        # I get much better results squeezing values with tanh

        hparams_string = "/".join(["num_elements"+str(self.num_elements), "n_flows"+str(self.n_flows),
                                   "z_dim"+str(self.z_dim_last), "a_dim"+str(self.a_dim), "lr"+str(self.lr),
                                   "ladder"+str(self.ladder), self.flavour, "n_labelled"+str(len(self.train_loader))])
        images_path = self.results_path + "/" + hparams_string + "/gaussian_percentiles/"
        if verbose > 0:
            print("GENERATING SS DGM IMAGES AT", images_path)

        y = torch.stack([torch.Tensor(onehot_array(n**2*[i], self.num_classes)) for i in range(self.num_classes)])
        x_mu = [self.sample(torch.Tensor(zs_grid[i]).cuda(), y[i]) for i in range(self.num_classes)]

        # plot_z_stats(rand_z.detach().cpu().numpy(), generate="generated")
        labels_set_ints = list(range(len(self.labels_set)))
        if len(self.input_shape) > 1:
            images = torch.stack([x_mu[i].view(-1, self.input_shape[0], self.input_shape[1], self.input_shape[2]).data
                      for i in range(len(x_mu))])
            images = images.view(-1, self.input_shape[0], self.input_shape[1], self.input_shape[2])
            images_grid = tv.utils.make_grid(images, n)
            create_missing_folders(images_path)
            tv.utils.save_image(images_grid, images_path + "/" + str(epoch) + "gaussian_percentiles_generated.png")
Example no. 6
    def save_model(self):
        # SAVING
        print("MODEL SAVED AT LOCATION:", self.model_history_path)
        create_missing_folders(self.model_history_path)
        torch.save(
            self.state_dict(), self.model_history_path + self.flavour + "_" +
            self.model_file_name + '.state_dict')
        if self.ssl:
            torch.save(
                self.classifier.state_dict(),
                self.model_history_path + self.flavour + "_" +
                self.model_file_name + 'classifier.state_dict')
        torch.save(
            self.train_loss_history, self.model_history_path + self.flavour +
            "_" + self.model_file_name + '.train_loss')
        torch.save(
            self.train_rec_history, self.model_history_path + self.flavour +
            "_" + self.model_file_name + '.train_re')
        torch.save(
            self.train_kl_history, self.model_history_path + self.flavour +
            "_" + self.model_file_name + '.train_kl')
        torch.save(
            self.val_loss_history, self.model_history_path + self.flavour +
            "_" + self.model_file_name + '.val_loss')
        torch.save(
            self.val_rec_history, self.model_history_path + self.flavour +
            "_" + self.model_file_name + '.val_re')
        torch.save(
            self.val_kl_history, self.model_history_path + self.flavour + "_" +
            self.model_file_name + '.val_kl')
        torch.save(
            self.epoch, self.model_history_path + self.flavour + "_" +
            self.model_file_name + '.epoch')
Example no. 7
def plot_losses(losses,
                labels,
                n_neurons=None,
                results_path="~",
                filename="NoName"):
    filename = "_".join([filename, "loss.png"])
    create_missing_folders(results_path + "/plots/hnet/")
    fig, ax1 = plt.subplots()
    plt.ylim([0., 1000.])

    ax1.plot(losses, 'g-.', label='train')  # plotting t, a separately

    ax1.set_xlabel('epochs')
    ax1.set_ylabel('Loss')

    # ax1.tick_params('y')
    handles, labels = ax1.get_legend_handles_labels()
    ax1.legend(handles, labels)
    if n_neurons is not None:
        ax22 = ax1.twinx()
        for i, n in enumerate(n_neurons):
            ax22.plot(n_neurons[i], '--', label="Hidden Layer " +
                      str(i))  # plotting t, a separately
        ax22.set_ylabel('#Neurons')
        handles, labels = ax22.get_legend_handles_labels()
        ax22.legend(handles, labels)

    fig.tight_layout()
    # pylab.show()
    pylab.savefig(results_path + "/plots/hnet/" + filename)
    plt.close()
Example no. 8
    def input_pruning(self, results_path, min_n_input_dims=20, minimum_neurons=20):
        """
        :param net:
        :param gt:
        :param min_n_input_dims:
        :param minimum_neurons:
        :return:
        """
        self.eval()
        with torch.no_grad():
            hebb_input = self.hebb_input_values.data.clone().cpu().numpy()
            if len(hebb_input) >= min_n_input_dims:
                to_keep = hebb_input > float(self.gt_input)
                notTooUsed = hebb_input < float(self.lt_input)
                print("min_hebb_value:", self.gt_input)
                valid_indices = indices_h(to_keep)
                valid_indices_down = indices_h(notTooUsed)
                total_valid = np.intersect1d(valid_indices, valid_indices_down)
                if len(valid_indices) < minimum_neurons:
                    # TODO Replace neurons that could not be removed?
                    valid_indices = indices_h(np.argsort(hebb_input) < minimum_neurons)  # hebb_input is a numpy array here
                    print("Minimum neurons on layer 1", sep="\t", file=self.hebb_log)

                print("previous_valid_len", self.previous_valid_len)
                self.valid_bool = [1. if x in valid_indices else 0. for x in range(self.input_size)]
                self.valid_bool_down = [1. if x in valid_indices_down else 0. for x in range(self.input_size)]
                self.valid_bool_total = [1. if x in total_valid else 0. for x in range(self.input_size)]
                self.alive_inputs = [x for x in range(len(hebb_input)) if x in valid_indices]
                self.alive_inputs_down = [x for x in range(len(hebb_input)) if x in valid_indices_down]
                self.alive_inputs_total = [x for x in range(len(hebb_input)) if x in total_valid]
                alive_inputs = np.array(self.alive_inputs)
                #if len(self.alive_inputs) < self.previous_valid_len:
                masks_path = results_path + "/images/masks/" + str(self.dataset_name) + "/"
                create_missing_folders(masks_path)

                img_path = "_".join(["alive_inputs", str(len(valid_indices_down)), str(self.epoch), "down.png"])
                print("self.n_channels", self.n_channels)
                if len(self.input_shape) == 3:
                    print("SAVING MASK at", results_path)
                    mask = np.reshape(self.valid_bool_down, newshape=(28, 28))  # TODO change hard coding
                    plt.imsave(masks_path + img_path, mask)
                img_path = "_".join(["alive_inputs", str(len(total_valid)), str(self.epoch), "total.png"])
                print("self.n_channels", self.n_channels)
                if len(self.input_shape) == 3:
                    print("SAVING MASK at", results_path)
                    mask = np.reshape(self.valid_bool_total, newshape=(28, 28))  # TODO change hard coding
                    plt.imsave(masks_path + img_path, mask)
                img_path = "_".join(["alive_inputs", str(len(valid_indices)), str(self.epoch), "up.png"])
                print("self.n_channels", self.n_channels)
                if len(self.input_shape) == 3:
                    print("SAVING MASK at", results_path)
                    mask = np.reshape(self.valid_bool, newshape=(28, 28))  # TODO change hard coding
                    plt.imsave(masks_path + img_path, mask)

                self.previous_valid_len = len(valid_indices)
                self.valid_bool_tensor = self.valid_bool_tensor * torch.Tensor(self.valid_bool).cuda()
                return self.valid_bool, self.alive_inputs
Example no. 9
    def calculate_losses(self,
                         data,
                         lambda1=0.,
                         lambda2=0.,
                         beta=1.,
                         likelihood=F.mse_loss):
        if self.ladder:
            ladder = "ladder"
        else:
            ladder = "not_ladder"
        self.images_path = self.results_path + "/images/examples/generative/" + ladder + "/" + self.flavour + "/"
        create_missing_folders(self.images_path)
        data = torch.tanh(data)
        if self.flow_type in ["o-sylvester", "t-sylvester", "h-sylvester"
                              ] and not self.ladder:
            z_q = {0: None, -1: None}
            reconstruction, mu, log_var, self.log_det_j, z_q[0], z_q[
                -1] = self.run_sylvester(data, auxiliary=self.auxiliary)
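            # log p(z_K): log-density of the final flow sample under the standard Gaussian prior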
            log_p_zk = log_standard_gaussian(z_q[-1])
            # ln q(z_0)  (not averaged)
            # mu, log_var, r1, r2, q, b = q_param_inverse
            log_q_z0 = log_gaussian(z_q[0], mu,
                                    log_var=log_var) - self.log_det_j
            # N E_q0[ ln q(z_0) - ln p(z_k) ]
            self.kl_divergence = log_q_z0 - log_p_zk
            del log_q_z0, log_p_zk
        else:
            reconstruction, z_q = self(data)

        kl = beta * self.kl_divergence

        likelihood = torch.sum(likelihood(reconstruction,
                                          data.float(),
                                          reduction='none'),
                               dim=-1)

        if self.ladder:
            params = torch.cat(
                [x.view(-1) for x in self.reconstruction.parameters()])
        else:
            params = torch.cat(
                [x.view(-1) for x in self.decoder.reconstruction.parameters()])

        l1_regularization = lambda1 * torch.norm(params, 1).cuda()
        l2_regularization = lambda2 * torch.norm(params, 2).cuda()
        try:
            assert l1_regularization >= 0. and l2_regularization >= 0.
        except AssertionError:
            print("Negative regularization terms:", l1_regularization, l2_regularization)
        loss = torch.mean(likelihood + kl.cuda() + l1_regularization +
                          l2_regularization)

        del data, params, l1_regularization, l2_regularization, lambda1, lambda2

        return loss, torch.mean(likelihood), torch.mean(
            kl), reconstruction, z_q
Example no. 10
    def load_geo(self, geo_id, labelled, bad_example=False):
        """

        :param geo_id:
        :return:

        Example:
        from debug.get_parameters import *

        dataframes_folder = "/Users/simonpelletier/data/annleukemia"
        g = GeoParser(destination_folder=data_destination)
        g.get_geo(geo_ids,load_from_disk)

        """
        flag = False
        print('Loading ' + geo_id + ", labelled: " + str(labelled) + ' ...')

        if not bad_example:
            self.df_file_name = geo_id + "_labelled" + str(
                labelled) + '_dataframe.pickle.npy'
        else:
            self.df_file_name = geo_id + "_labelled" + str(
                labelled) + '_bad_dataframe.pickle.npy'

        create_missing_folders(self.dataframes_path)
        current_directory_list = os.listdir(self.dataframes_path)
        if self.df_file_name in current_directory_list:
            print("File found at location:",
                  self.data_folder_path + "/" + self.df_file_name)
            if labelled or bad_example:
                self.df[geo_id] = pd.read_pickle(self.dataframes_path + "/" +
                                                 self.df_file_name)

                if sum(sum(np.isnan(self.df[geo_id].values)).tolist()) > 0:
                    print("Nans found. They are all replaced by 0")
                    self.df[geo_id][np.isnan(self.df[geo_id])] = 0
                print("self.df[geo_id]", self.df[geo_id].shape)
            else:
                self.unlabelled_df[geo_id] = pd.read_pickle(
                    self.dataframes_path + "/" + self.df_file_name)

                if sum(
                        sum(np.isnan(
                            self.unlabelled_df[geo_id].values)).tolist()) > 0:
                    print("Nans found. They are all replaced by 0")
                    self.unlabelled_df[geo_id][np.isnan(
                        self.unlabelled_df[geo_id])] = 0
                print("self.unlabelled_df[geo_id]",
                      self.unlabelled_df[geo_id].shape)

        else:
            print(self.df_file_name, ' NOT FOUND in ', self.dataframes_path)
            flag = True
        return flag
Example no. 11
def histograms_hidden_layers(xs,
                             results_path,
                             normalized,
                             is_mean=True,
                             epoch=0,
                             depth=0,
                             activated=False,
                             mu=None,
                             var=None,
                             axis=0,
                             bins=50,
                             flat=True,
                             neuron=None):
    ax = plt.subplot(111)
    ax.set_xlabel("Hidden value")
    ax.set_ylabel("Frequency")
    plt.title("PDF of preactivation values")

    if neuron is None:
        neurons = "all"
    else:
        neurons = "single"
        xs = xs[:, neuron]

    if is_mean:
        xs = np.mean(xs, axis=axis)
    ax.hist(xs, bins=bins, alpha=0.5, density=True)

    if mu is None and var is None:
        mean_mean = float(np.mean(xs))
        mean_var = float(np.var(xs))
    elif mu is not None and var is not None:
        mean_mean = float(mu)
        mean_var = float(var)
    else:
        print(
            "No image saved: mu and var must either both be None or both be provided"
        )
        return
    normal_curve(ax, mean_mean, mean_var)
    if activated:
        plt.axvline(x=float(np.mean(xs)), c="g", linewidth=1)

    #    half_normal_curve(ax, mu, var, float(np.mean(xs)))
    destination_folder_path = "/".join(
        (results_path, "layers_histograms", "depth_" + str(depth),
         "activated_" + str(activated), "normalized_" + str(normalized))) + "/"
    create_missing_folders(destination_folder_path)
    destination_file_path = destination_folder_path + "Hidden_values_hist_" + str(epoch) + "_activated"+ \
                            str(activated) + "_normalized" + str(normalized) + "_mean" + str(is_mean) + "_flat"\
                            + str(flat) + "_" + neurons + "neurons.png"
    plt.savefig(destination_file_path)
    plt.close()
Example no. 12
def ordination2d(
        data_frame,
        ORD=PCA,
        images_folder_path="/home/simon/results/hebbian_learning_ann/plots/",
        filenames="NoName",
        a=0.5):
    type_images_folder_path = images_folder_path + filenames + "/"
    create_missing_folders(type_images_folder_path)
    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except:
        print(
            "The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)"
        )
        return

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)

    ord = ORD(n_components=2, verbose=1)
    principalComponents = ord.fit_transform(np.transpose(data_frame.values))
    principalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    finalDf = pd.concat([principalDf, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component ordination', fontsize=20)
    colors = ['r', 'g', 'b']
    for target, color in zip(classes_list, colors):
        indicesToKeep = finalDf[0] == target
        data1 = finalDf.loc[indicesToKeep, 'principal component 1']
        data2 = finalDf.loc[indicesToKeep, 'principal component 2']
        ellipse_data(data1, data2, ax, color)

        ax.scatter(data1, data2, c=color, s=12)
    ax.legend(classes_list)
    ax.grid()

    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, classes_list)

    plt.tight_layout()
    fig.tight_layout()

    fig.savefig(type_images_folder_path + filenames + ".png")
    plt.close(fig)
Example no. 13
    def generate_random(self, epoch=0, verbose=0, show_pca=1, show_lda=1, n=40, drop_na=False, keep_images=True,
                        only_na=False):
        hparams_string = "/".join(["num_elements"+str(self.num_elements), "n_flows"+str(self.n_flows),
                                   "z_dim"+str(self.z_dim_last), "a_dim"+str(self.a_dim), "lr"+str(self.lr),
                                   "ladder"+str(self.ladder), self.flavour])
        images_path = self.results_path + "/" + hparams_string + "/random/"
        create_missing_folders(images_path)
        if verbose > 0:
            print("GENERATING IMAGES AT", images_path)
        self.eval()

        rand_z = Variable(torch.randn(n*self.num_classes, self.z_dim))

        if not only_na:
            y = torch.cat([torch.Tensor(onehot_array(n*[i], self.num_classes)) for i in range(self.num_classes)])
        else:
            y = torch.Tensor(onehot_array(n*[self.num_classes], self.num_classes))

        rand_z, y = rand_z.cuda(), y.cuda()
        x_mu = self.sample(rand_z, y)

        # plot_z_stats(rand_z.detach().cpu().numpy(), generate="generated")

        images, images_grid = None, None
        if len(self.input_shape) > 1 and keep_images:
            images = x_mu.view(-1, self.input_shape[0], self.input_shape[1], self.input_shape[2]).data
            images_grid = tv.utils.make_grid(images, 20)
            tv.utils.save_image(images_grid, images_path + "/" + str(epoch) + "only_na:" + str(only_na) +
                                "_generated.png")
        colnames = [list(self.labels_set)[one_hot.cpu().numpy().tolist().index(1)] for one_hot in y]
        df = pd.DataFrame(x_mu.transpose(1, 0).detach().cpu().numpy(), columns=colnames)
        if drop_na:
            try:
                df = df.drop(["N/A"], axis=1)
            except:
                pass
        if show_pca != 0 and epoch % show_pca == 0 and epoch != 0:
            try:
                ordination2d(df, "pca", epoch=self.epoch, images_folder_path=images_path, dataset_name=self.dataset_name, a=0.5,
                     verbose=0, info="generated")
            except:
                print("No pca.")
        if show_lda != 0 and epoch % show_lda == 0 and epoch != 0:
            try:
                ordination2d(df, "lda", epoch=self.epoch, images_folder_path=images_path, dataset_name=self.dataset_name, a=0.5,
                     verbose=0, info="generated")
            except:
                print("NO lda")
        del df, colnames, images_grid, x_mu, rand_z, y

        return images
Example no. 14
def QDA(data_frame, images_folder_path, dataset_name, epoch, a=0.5, verbose=0, info="none",
                 show_images=True):
    import pandas as pd
    import numpy as np
    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except:
        print("The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)")
        print(type(data_frame))
        exit()
        return
    if type(dataset_name) == list:
        names = [name for name in dataset_name]
        dataset_name = "_".join(names)

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    data_frame.values[np.isnan(data_frame.values)] = 0

    X = np.transpose(data_frame.values)
    # Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
    plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant '
                 'Analysis')
    try:
        plt.tight_layout()
    except:
        pass
    type_images_folder_path = "/".join([images_folder_path, str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"

    create_missing_folders(type_images_folder_path)

    plt.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png", dpi=100)
    if show_images:
        plt.show()
    plt.close()
Example no. 15
    def display_reconstruction(self, epoch, data, reconstruction, display_rate=1):
        hparams_string = "/".join(["num_elements"+str(self.num_elements), "n_flows"+str(self.n_flows),
                                   "z_dim"+str(self.z_dim_last), "a_dim"+str(self.a_dim), "lr"+str(self.lr),
                                   "ladder"+str(self.ladder), self.flavour])
        images_path = self.results_path + "/" + hparams_string + "/reconstruction/"
        create_missing_folders(images_path)
        x = data.view(-1, self.input_shape[0], self.input_shape[1], self.input_shape[2]).data
        x_grid = tv.utils.make_grid(x)
        x_recon = reconstruction.view(-1, self.input_shape[0], self.input_shape[1],
                                      self.input_shape[2]).data
        x_recon_grid = tv.utils.make_grid(x_recon)

        if epoch % display_rate == 0:
            print("GENERATING RECONSTRUCTION IMAGES autoencoder!")
            tv.utils.save_image(x_grid, images_path + str(epoch) + "_original.png")
            tv.utils.save_image(x_recon_grid, images_path + str(epoch) + "_reconstruction_example.png")
Example no. 16
    def plot_z_stats(self, z, path, generate="generated", max=5000):
        fig, ax = plt.subplots()  # create figure and axis
        plt.boxplot(z)
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels)
        plt.tight_layout()
        fig.tight_layout()
        path = "/".join([path, "plots/vae_z_stats", generate]) + "/"
        create_missing_folders(path)
        fig.savefig(path + self.flavour + "_" + str(self.epoch) + '_lr' +
                    str(self.lr) + '_bs' + str(self.batch_size) + ".png")
        plt.close(fig)

        if z.shape[1] == 2:
            self.plot_z(generated=generate)
        del z, path, generate
Example no. 17
    def save_model(self):
        # SAVING
        print("MODEL (with classifier) SAVED AT LOCATION:", self.model_history_path)
        create_missing_folders(self.model_history_path)
        torch.save(self.state_dict(), self.model_history_path + self.flavour + "_" + self.model_file_name + '.state_dict')
        torch.save(self.classifier.state_dict(), self.model_history_path + self.flavour + "_" + self.model_file_name + 'classifier.state_dict')
        torch.save(self.train_total_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_total_loss')
        torch.save(self.train_labelled_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_labelled_loss')
        torch.save(self.train_unlabelled_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_unlabelled_loss')
        torch.save(self.train_accuracy_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_accuracy')
        torch.save(self.train_kld_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.train_kld')
        torch.save(self.valid_total_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_total_loss')
        torch.save(self.valid_labelled_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_labelled_loss')
        torch.save(self.valid_unlabelled_loss_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_unlabelled_loss')
        torch.save(self.valid_accuracy_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_accuracy')
        torch.save(self.valid_kld_history, self.model_history_path + self.flavour + "_" + self.model_file_name + '.valid_kld')
        torch.save(self.epoch, self.model_history_path + self.flavour + "_" + self.model_file_name + '.epoch')
Example no. 18
def plot_performance(loss_total, loss_labelled, loss_unlabelled, accuracy, labels, results_path,
                     filename="NoName", verbose=0):
    fig2, ax21 = plt.subplots()
    try:
        ax21.plot(loss_total["train"], 'b-', label='Train total loss:' + str(len(labels["train"])))  # plotting t, a separately
        ax21.plot(loss_total["valid"], 'g-', label='Valid total loss:' + str(len(labels["valid"])))  # plotting t, a separately
        ax21.plot(loss_labelled["train"], 'b-.', label='Train labelled loss:' + str(len(labels["train"])))  # plotting t, a separately
        ax21.plot(loss_labelled["valid"], 'g-.', label='Valid labelled loss:' + str(len(labels["valid"])))  # plotting t, a separately
        ax21.plot(loss_unlabelled["train"], 'b.', label='Train unlabelled loss:' + str(len(labels["train"])))  # plotting t, a separately
        ax21.plot(loss_unlabelled["valid"], 'g.', label='Valid unlabelled loss:' + str(len(labels["valid"])))  # plotting t, a separately
        #ax21.plot(values["valid"], 'r-', label='Test:' + str(len(labels["valid"])))  # plotting t, a separately
    except:
        ax21.plot(loss_total["train"], 'b-', label='Train total loss:')  # plotting t, a separately
        ax21.plot(loss_total["valid"], 'g-', label='Valid total loss:')  # plotting t, a separately
        ax21.plot(loss_labelled["train"], 'b-.', label='Train labelled loss:')  # plotting t, a separately
        ax21.plot(loss_labelled["valid"], 'g-.', label='Valid labelled loss:')  # plotting t, a separately
        ax21.plot(loss_unlabelled["train"], 'b.', label='Train unlabelled loss:')  # plotting t, a separately
        ax21.plot(loss_unlabelled["valid"], 'g.', label='Valid unlabelled loss:')  # plotting t, a separately

    ax21.set_xlabel('epochs')
    ax21.set_ylabel('Loss')
    handles, labels = ax21.get_legend_handles_labels()
    ax21.legend(handles, labels)
    ax22 = ax21.twinx()

    #colors = ["b", "g", "r", "c", "m", "y", "k"]
    # if n_list is not None:
    #    for i, n in enumerate(n_list):
    #        ax22.plot(n_list[i], '--', label="Hidden Layer " + str(i))  # plotting t, a separately
    ax22.set_ylabel('Accuracy')
    ax22.plot(accuracy["train"], 'b--', label='Train')  # plotting t, a separately
    ax22.plot(accuracy["valid"], 'g--', label='Valid')  # plotting t, a separately
    handles, labels = ax22.get_legend_handles_labels()
    ax22.legend(handles, labels)

    fig2.tight_layout()
    # pylab.show()
    if verbose > 0:
        print("Performance at ", results_path)
    create_missing_folders(results_path + "/plots/")
    pylab.savefig(results_path + "/plots/" + filename)
    plt.show()
    plt.close()
Example no. 19
    def generate_uniform_gaussian_percentiles(self, n=20, verbose=1, max=1000):
        self.eval()
        print("GENERATING gaussian percentiles IMAGES autoencoder!")

        xs_grid = torch.Tensor(
            np.vstack([
                np.linspace(norm.ppf(0.01), norm.ppf(0.99), n**2)
                for _ in range(self.z_dim_last)
            ]).T)
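        # norm.ppf is the inverse CDF of the standard normal, so xs_grid sweeps each latent dimension evenly across the central 98% of the prior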

        hparams_string = "/".join([
            "num_elements" + str(self.num_elements),
            "n_flows" + str(self.n_flows), "z_dim" + str(self.z_dim_last),
            "unsupervised", "lr" + str(self.lr), "ladder" + str(self.ladder),
            self.flavour
        ])
        images_path = self.images_path + "/" + hparams_string + "/gaussian_percentiles/" + "/" + self.prior_dist + "/"
        if verbose > 0:
            print("GENERATING SS DGM IMAGES AT", images_path)

        print("image path:", images_path)
        create_missing_folders(images_path)

        self.plot_z_stats(xs_grid,
                          generate="/ugp_generated/" + self.prior_dist + "/",
                          path=images_path,
                          max=max)
        grid = torch.Tensor(xs_grid).to(device)

        new_x = torch.stack([self.sample(g.view(1, -1)) for g in grid])
        if len(self.input_shape) > 1:
            images = new_x.view(-1, self.input_shape[0], self.input_shape[1],
                                self.input_shape[2]).data

            assert images.shape[0] == n ** 2
            images_grid = tv.utils.make_grid(images,
                                             int(np.sqrt(images.shape[0])))

            create_missing_folders(images_path)
            tv.utils.save_image(
                images_grid, images_path + str(self.epoch) +
                self.dataset_name + "gaussian_uniform_generated.png")
            del images_grid, images, new_x, xs_grid
Example no. 20
    def translate(self,
                  geo_id,
                  labelled,
                  old_ids='entrezgene_trans_name',
                  new_ids='uniprot_gn',
                  load=True):
        import subprocess
        translation_destination = "/".join(
            [self.data_folder_path, "translation_results"]) + "/"
        self.translation_destination = translation_destination
        dictionary_path = self.dictionary_path = "/".join(
            [self.data_folder_path, "dictionaries"])
        create_missing_folders(translation_destination)
        create_missing_folders(dictionary_path)
        filename = geo_id + "_" + old_ids + '.txt'
        output_file = geo_id + "_" + old_ids + "2" + new_ids + ".txt"
        output_path = translation_destination + "/" + output_file

        if filename not in os.listdir(
                translation_destination) or load is False:
            print("new file in" + translation_destination)
            f = open(translation_destination + "/" + filename, "w")
            for id in list(self.meta_df[labelled].index):
                f.write(str(id) + "\n")
            f.close()
        if output_file not in os.listdir(translation_destination):
            print("Translating", geo_id, "from", old_ids, "to", new_ids, "...")
            call = [
                "./biomart_api.R", translation_destination, geo_id, old_ids,
                new_ids
            ]
            subprocess.call(call)
        else:
            print("The file", output_file, "was found in",
                  translation_destination)
        file = open(output_path, "r")
        names_translations = np.loadtxt(file, dtype=str, delimiter=";")
        try:
            assert len(names_translations) > 0
        except AssertionError:
            print("There is no translation to show")
        return names_translations
Example no. 21
    def translate(self, geo_id, old_ids='refseq_mrna', new_ids='uniprot_gn', load=False):
        import subprocess
        translation_destination = "/".join([self.data_folder_path, "translation_results"]) + "/"
        self.translation_destination = translation_destination
        dictionary_path = self.dictionary_path = "/".join([self.data_folder_path, "dictionaries"])
        create_missing_folders(translation_destination)
        create_missing_folders(dictionary_path)
        filename = geo_id + "_" + old_ids + '.txt'
        try:
            os.remove(translation_destination + "/" + filename)
        except OSError:
            pass
        os.mknod(translation_destination + "/" + filename)
        f = open(translation_destination + "/" + filename, "w")
        for id in list(self.meta_df.index):
            f.write(id+"\n")

        f.close()

        translation_filename = old_ids + "2" + new_ids + "_" + geo_id + ".txt.npy"
        self.translation_results[geo_id] = translation_destination
        create_missing_folders(self.translation_results[geo_id])
        if translation_filename not in os.listdir(translation_destination) or load is False:
            print("Translating", geo_id, "from", old_ids, "to", new_ids, "...")
            call = ["./biomart_api.R", translation_destination, geo_id, old_ids, new_ids]
            subprocess.call(call)
        output_file = translation_destination + "/" + geo_id + "_" +old_ids+"2"+new_ids+".txt"
        file = open(output_file, "r")
        names_translated = file.readlines()

        print(names_translated)
Example no. 22
def plot_performance(values,
                     labels,
                     n_list=None,
                     results_path="~",
                     filename="NoName"):
    fig2, ax21 = plt.subplots()

    ax21.plot(values["train"],
              'b-',
              label='Train:' +
              str(len(labels["train"])))  # plotting t, a separately
    ax21.plot(values["valid"],
              'g-',
              label='Valid:' +
              str(len(labels["valid"])))  # plotting t, a separately
    ax21.plot(values["valid"], 'r-', label='Test:' +
              str(len(labels["valid"])))  # plotting t, a separately
    ax21.set_xlabel('epochs')
    ax21.set_ylabel('Accuracy')
    handles, labels = ax21.get_legend_handles_labels()
    ax21.legend(handles, labels)
    ax22 = ax21.twinx()
    #colors = ["b", "g", "r", "c", "m", "y", "k"]
    if n_list is not None:
        for i, n in enumerate(n_list):
            ax22.plot(n_list[i], '--', label="Hidden Layer " +
                      str(i))  # plotting t, a separately
    ax22.set_ylabel('#Neurons')
    handles, labels = ax22.get_legend_handles_labels()
    ax22.legend(handles, labels)

    fig2.tight_layout()
    # pylab.show()
    pylab.savefig(results_path + "/plots/hnet/" + filename)
    create_missing_folders(results_path + "/plots/hnet/")
    plt.close()
Example no. 23
    def set_configs(self,
                    home_path,
                    results_folder="results",
                    data_folder="data",
                    destination_folder="hebbian_learning_ann",
                    dataset_name="GSE33000",
                    meta_destination_folder="meta_pandas_dataframes",
                    csv_filename="csv_loggers",
                    lr=1e-3):

        # Hyper-parameters
        self.lr = lr

        # Files names
        self.dataset_name = dataset_name
        self.filename = dataset_name + '_history'
        self.csv_filename = csv_filename

        # Folder names
        self.results_folder = results_folder
        self.destination_folder = destination_folder
        self.data_folder = data_folder
        self.meta_destination_folder = meta_destination_folder
        # Paths
        self.home_path = home_path
        self.results_path = "/".join(
            [self.home_path, self.destination_folder, self.results_folder])
        self.models_path = "/".join([self.results_path, "models"])
        self.model_history_path = self.models_path + "/history/"

        self.csv_logger_path = "/".join([self.results_path, csv_filename])
        self.data_folder_path = "/".join(
            [home_path, self.destination_folder, self.data_folder])
        self.meta_destination_path = "/".join(
            [self.data_folder_path, self.meta_destination_folder])
        create_missing_folders(self.csv_logger_path)
        create_missing_folders(self.models_path)
        create_missing_folders(self.meta_destination_path)
        create_missing_folders(self.model_history_path)

        # Empty lists
        self.accuracy_training_array = []
        self.accuracy_valid_array = []
        self.losses_training_array = []
        self.losses_valid_array = []
        self.max_valid_accuracies = []
        self.max_valid_epochs = []
        self.min_valid_loss = []
        self.min_valid_loss_epochs = []
        # empty objects
        self.model = None
        self.x_test = None
        self.y_test = None
        self.x_train = None
        self.y_train = None
        self.meta_df = None
        self.epoch = 0
        self.num_classes = None
        self.init = None
        self.batch_size = None
        self.nrep = None
        self.classes_train = None
        self.classes_test = None
Example no. 24
def ordination2d(data_frame,
                 ord_type,
                 images_folder_path,
                 dataset_name,
                 epoch,
                 a=0.4,
                 verbose=0,
                 info="none",
                 show_images=True,
                 df_valid=None,
                 df_test=None,
                 n=4):
    import pandas as pd
    import numpy as np

    pc1 = 'Component_1'
    pc2 = 'Component_2'

    type_images_folder_path = "/".join(
        [images_folder_path,
         str(ord_type), str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"

    create_missing_folders(type_images_folder_path)

    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except:
        print(
            "The type of the data object in pca2d has to be pandas.core.frame.DataFrame. Returning without finishing (no PCA plot was produced)"
        )
        print(type(data_frame))
        exit()
        return
    if type(dataset_name) == list:
        names = [name for name in dataset_name]
        dataset_name = "_".join(names)

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    data_frame.values[np.isnan(data_frame.values)] = 0
    ord = None
    ys = False
    if ord_type in ["pca", "PCA"]:
        ys = False
        ord = PCA(n_components=2)
    elif ord_type in ["kpca", "KPCA"]:
        ys = False
        ord = KernelPCA(n_components=2, kernel="rbf")
    elif ord_type in ["tsne", "tSNE", "TSNE", "t-sne", "T-SNE", "t-SNE"]:
        ys = False
        ord = TSNE(n_components=2, verbose=verbose)
    elif ord_type in ["lda", "LDA", "flda", "FLDA"]:
        ys = True
        ord = LDA(n_components=2)
    elif ord_type in ["qda", "QDA"]:
        ord = QDA()
        ys = True
    else:
        print(ord_type)
        exit("No ordination of that name is implemented. Exiting...")
    if ys:
        principal_components = ord.fit_transform(np.transpose(
            data_frame.values),
                                                 y=y)
        if df_valid is not None:
            pcs_valid = ord.transform(df_valid.values)
            pcs_valid = pd.DataFrame(
                data=pcs_valid,
                columns=['principal component 1', 'principal component 2'])
            y_valid = df_valid.columns
            pcs_valid = pd.concat([pcs_valid, pd.DataFrame(y_valid)], axis=1)

            pcs_test = ord.transform(df_test.values)
            pcs_test = pd.DataFrame(
                data=pcs_test,
                columns=['principal component 1', 'principal component 2'])
            y_test = df_test.columns

            pcs_test = pd.concat([pcs_test, pd.DataFrame(y_test)], axis=1)

    else:
        principal_components = ord.fit_transform(
            np.transpose(data_frame.values))

    if ord_type == "pca":
        ev = ord.explained_variance_ratio_
        means = ord.mean_
        if sum(means < 0):
            means = means - min(means)
        means_ratio = means / np.sum(np.sum(means, axis=0)) * 100
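        # PCA.components_ has shape (n_components, n_features); transposing gives one row of loadings per original feature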
        coeff = np.transpose(ord.components_)
        order_importance = list(reversed(np.argsort(means)))
        coeff, means_ratio = coeff[order_importance], means_ratio[
            order_importance]

        factors = np.array(data_frame.index)[order_importance]
        x = list(range(len(factors)))
        plt.xlabel("Initial Features")
        plt.ylabel("% of varaince explained")
        plt.title(
            "% of the variance is explained by the initial features (Total:" +
            str(np.round(np.sum(ev) * 100, 2)) + ")")
        plt.xticks([x[0]], [factors[0]], rotation=45, fontsize=8)
        plt.plot(means_ratio)
        plt.tight_layout()
        plt.savefig(type_images_folder_path + info + "_" + str(epoch) +
                    "_var_exaplined_2D.png",
                    dpi=100)
        print("plot at ", type_images_folder_path)

    principal_df = pd.DataFrame(
        data=principal_components,
        columns=['principal component 1', 'principal component 2'])
    final_df = pd.concat([principal_df, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    if ord_type not in "kpca":
        ev = ord.explained_variance_ratio_
        if len(ev) > 1:
            pc1 = pc1 + ': ' + str(np.round(ev[0] * 100, decimals=2)) + "%"
            pc2 = pc2 + ': ' + str(np.round(ev[1] * 100, decimals=2)) + "%"

    ax.set_xlabel(pc1, fontsize=15)
    ax.set_ylabel(pc2, fontsize=15)
    ax.set_title('2 component Ordination', fontsize=20)

    # colors = cm.viridis(np.linspace(0, 1, len(classes_list)))
    colors = ["g", "b", "k", "r"]
    print("coeff shape", coeff.shape)
    if len(coeff) < n:
        n = len(coeff)

    for t, target in enumerate(classes_list):
        indices_to_keep = final_df[0] == target
        indices_to_keep = list(indices_to_keep)
        data1 = final_df.loc[indices_to_keep, 'principal component 1']
        data2 = final_df.loc[indices_to_keep, 'principal component 2']
        try:
            assert np.sum(np.isnan(data1)) == 0 and np.sum(
                np.isnan(data2)) == 0
        except:
            print("Nans were detected. Please verify the DataFrame...")
            exit()
        ellipse_data(data1, data2, ax, colors[t])

        ax.scatter(data1, data2, s=10, alpha=a, c=colors[t])

        labels = factors if ord_type == "pca" else None
        for i in range(n):
            plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
            if labels is None:
                plt.text(coeff[i, 0] * 1.15,
                         coeff[i, 1] * 1.15,
                         "Var" + str(i + 1) + str(np.round(means_ratio[i], 2)),
                         color='g',
                         ha='center',
                         va='center')
            else:
                plt.text(coeff[i, 0] * 1.15,
                         coeff[i, 1] * 1.15,
                         str(labels[i]) + str(np.round(means_ratio[i], 2)),
                         color='g',
                         ha='center',
                         va='center')

    ax.legend(classes_list)
    ax.grid()

    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, classes_list)

    if df_valid is not None:
        for t, target in enumerate(classes_list):
            indices_to_keep = final_df[0] == target
            indices_to_keep = list(indices_to_keep)
            data1 = pcs_valid.loc[indices_to_keep, 'principal component 1']
            data2 = pcs_valid.loc[indices_to_keep, 'principal component 2']
            try:
                assert np.sum(np.isnan(data1)) == 0 and np.sum(
                    np.isnan(data2)) == 0
            except:
                print("Nans were detected. Please verify the DataFrame...")
                exit()
            ellipse_data(data1, data2, ax, colors[t])

            ax.scatter(data1, data2, s=10, alpha=a)
        ax.legend(classes_list)
        ax.grid()

        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, classes_list)

    if df_test is not None:
        for t, target in enumerate(classes_list):
            indices_to_keep = final_df[0] == target
            indices_to_keep = list(indices_to_keep)
            data1 = pcs_test.loc[indices_to_keep, 'principal component 1']
            data2 = pcs_test.loc[indices_to_keep, 'principal component 2']
            try:
                assert np.sum(np.isnan(data1)) == 0 and np.sum(
                    np.isnan(data2)) == 0
            except:
                print("Nans were detected. Please verify the DataFrame...")
                exit()
            ellipse_data(data1, data2, ax, colors[t])

            ax.scatter(data1, data2, s=10, alpha=a)
        ax.legend(classes_list)
        ax.grid()

        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, classes_list)

    try:
        plt.tight_layout()
        fig.tight_layout()
    except:
        pass
    plt.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png",
                dpi=100)
    if show_images:
        plt.show()
    plt.close(fig)
Example no. 25
    def __init__(self,
                 home_path,
                 geo_ids,
                 unlabelled_geo_ids=None,
                 bad_geo_ids=None,
                 results_folder='results',
                 data_folder='data',
                 destination_folder='annleukemia',
                 dataframes_folder="dataframes",
                 is_translate=True,
                 silent=False):
        """
        # initial_lr because it can change automatically as the epochs go with a SCHEDULER
        # for example : ReduceLROnPlateau reduces the learning rate (lr) when the results are not improved for
        # a number of iterations specified by the user.
        #
        Advantages:
            1- Can start learning very fast and then do fine tuning;
                -   Too high:
                        The accuracy will most likely reach its optimum faster, but might not be
                        as good as with a smaller LR

                -   Too small:
                        The accuracy will most likely reach a better optimum, but might take quite long (if too low
                        it might seem like it's not learning; far too low and it might not learn anything at all)

        Pitfalls:
            1- LR reduction too frequent or too large               (same problem as LR too SMALL)
            2- LR reduction not fast enough or not large enough     (same problem as LR too HIGH)


        Examples:
        destination_folder = "/Users/simonpelletier/data/annleukemia"
        initial_lr=1e-3
        init="he_uniform"
        n_epochs=2
        batch_size=128,
        hidden_size = 128
        translate=True
        silent=False

        """
        self.is_translate = is_translate
        self.silent = silent
        self.df = {}
        self.meta_df = {True: None, False: None}
        self.unlabelled_df = {}
        self.files_path = {}
        self.meta_file_path = None
        self.dataframes_folder = None

        # DATASETS IDS
        self.geo_ids = geo_ids

        # FOLDER NAMES

        self.data_folder = data_folder
        self.dataframes_folder = dataframes_folder
        self.destination_folder = destination_folder
        self.results_folder = results_folder
        self.unlabelled_geo_ids = unlabelled_geo_ids
        self.bad_geo_ids = bad_geo_ids

        #if self.unlabelled_geo_ids is not None:
        #    self.labelled_dict = dict(zip(zip(geo_ids, unlabelled_geo_ids),
        #                                  zip([[True] * len(geo_ids)], [[False] * len(unlabelled_geo_ids)])))

        # PATHS
        self.home_path = home_path
        self.data_folder_path = "/".join(
            [self.home_path, self.destination_folder, self.data_folder]) + "/"
        self.results_folder_path = "/".join([
            self.home_path, self.destination_folder, self.results_folder
        ]) + "/"
        self.dataframes_path = "/".join(
            [self.data_folder_path, self.dataframes_folder]) + "/"
        self.translation_dict_path = "/".join(
            [self.results_folder_path, "dictionaries"]) + "/"
        self.soft_path = self.data_folder_path + "/softs/"
        create_missing_folders(self.translation_dict_path)
        self.translation_results = {}
        # Hyperparameters

        #LAST ADDED
        self.meta_destination_folder = None
        self.meta_data_folder_path = None
Example no. 26
    def merge_datasets(self,
                       labelled,
                       fill_missing=True,
                       load_from_disk=False,
                       meta_destination_folder="meta_pandas_dataframes"):
        """

        :param labelled:
        :param load_from_disk:
        :param meta_destination_folder:
        :param fill_missing: if True, the missing rows will be replaced with 0s for all samples of that dataset.
                The algorithm might still perform well without some of that information; otherwise, the list of shared rows might get very small

        from utils import get_example_datasets, create_missing_folders
        fill_missing=True
        geo_ids = ["GSE12417","GSE22845"]
        g = get_example_datasets(geo_ids, home_path="/home/simon/", load_from_disk=True)
        g.get_geo(geo_ids, load_from_disk=True)
        g.merge_datasets(fill_missing=True)

        """

        print("Preparing for merging the selected datasets... labelled:",
              labelled)
        import os
        import pandas as pd
        import numpy as np

        if labelled:
            dataframe = self.df
            geo_ids = list(self.df.keys())
        else:
            dataframe = self.unlabelled_df
            geo_ids = list(self.unlabelled_df.keys())

        self.meta_destination_folder = meta_destination_folder + "_labelled" + str(
            labelled)
        self.meta_data_folder_path = "/".join(
            [self.data_folder_path, meta_destination_folder])
        create_missing_folders(self.meta_data_folder_path)
        meta_filename = "_".join(geo_ids) + ".pickle.npy"

        count = 0
        n_samples_list = [len(dataframe[geo_id].columns) for geo_id in geo_ids]
        total_iteration = -n_samples_list[0]
        meta_file = search_meta_file_name(meta_filename,
                                          list_meta_files=os.listdir(
                                              self.meta_data_folder_path))

        if meta_file is None or load_from_disk is False:
            for i in range(len(n_samples_list)):
                for j in range(i, len(n_samples_list)):
                    total_iteration += n_samples_list[j]
            for g, geo_id in enumerate(geo_ids):
                print("merging file:", g + 1, "/", len(geo_ids))
                if g == 0:
                    meta_df = dataframe[geo_id]
                else:
                    if fill_missing:
                        meta_df = pd.concat((meta_df, dataframe[geo_id]),
                                            axis=1,
                                            sort=True)
                try:
                    assert len(meta_df.index) == len(set(meta_df.index))
                except AssertionError:
                    print("CONTAINS DUPLICATED ROWNAMES")
                print(meta_df.shape)
            print("Saving files...")
            self.meta_filename = meta_filename
            self.meta_file_path = '/'.join(
                [self.meta_data_folder_path, self.meta_filename])
            self.meta_df[labelled] = meta_df
            self.meta_df[labelled].to_pickle(self.meta_file_path)
            # self.meta_df[labelled].to_csv(self.meta_file_path + ".csv")
        else:
            print("Loading file...")
            self.meta_filename = meta_file
            self.meta_file_path = '/'.join(
                [self.meta_data_folder_path, self.meta_filename])
            self.meta_df[labelled] = pd.read_pickle(self.meta_file_path)
        print("Merged sets loaded.")
        return self.meta_df[labelled]
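A toy sketch of what fill_missing amounts to in this method: pd.concat with axis=1 and sort=True aligns the frames on the union of their row indices, and rows absent from one dataset come out as NaN, which can then be set to 0. The gene and sample names below are made up for illustration.

import pandas as pd

a = pd.DataFrame({"sample1": [1.0, 2.0]}, index=["geneA", "geneB"])
b = pd.DataFrame({"sample2": [3.0, 4.0]}, index=["geneB", "geneC"])

merged = pd.concat((a, b), axis=1, sort=True)  # union of row indices
merged = merged.fillna(0)                      # rows missing from a dataset become 0s
print(merged)
#        sample1  sample2
# geneA      1.0      0.0
# geneB      2.0      3.0
# geneC      0.0      4.0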
Esempio n. 27
0
    def build_dataframe(self,
                        geo_id,
                        labelled,
                        bad_example,
                        automatic_attribute,
                        save_to_disk=True):
        """
        The labels are found in the metadata of merged object

        :param save_to_disk: if True, the resulting DataFrame is pickled to the dataframes folder
        :param geo_id: ID found on NCBI's GEO database
            EXAMPLE: GSE12417 -> found here -> https://www.ncbi.nlm.nih.gov/Geo/query/acc.cgi?acc=GSE12417

        EXAMPLE
        g = get_example_datasets(geo_ids = ["GSE12417","GSE22845"], home_path="/Users/simonpelletier/", load_from_disk=True)
        g.get_geo(geo_ids, load_from_disk=load_from_disk)

        """
        create_missing_folders(self.soft_path)
        gse = Geo.GEOparse.get_GEO(geo=geo_id,
                                   destdir=self.soft_path,
                                   silent=self.silent)
        gsm_on_choices = list(gse.gsms[list(gse.gsms.keys())[0]].columns.index)
        gpl_on_choices = list(gse.gpls[list(gse.gpls.keys())[0]].columns.index)

        print(str(len(gsm_on_choices)) + " Choices are available for GSM")
        gsm_on_selection = 0
        # gsm_on_selection = get_user_int(gsm_on_choices)
        gsm_on = gsm_on_choices[gsm_on_selection]
        print(
            str(len(gpl_on_choices)) +
            " Choices are available for GPL. You must select: ")
        print("1 - An annotation for GPL")
        print("2 - (optional) The annotation you want the row names to take")

        gpl_on_selection = 0
        # gpl_on_selection = get_user_int(gpl_on_choices)
        gpl_on = gpl_on_choices[gpl_on_selection]
        val_selection = None
        if automatic_attribute is False:
            val_selection = get_user_int(gpl_on_choices)
        else:
            self.attribute = automatic_attribute
            for attribute in automatic_attribute:
                try:
                    val_selection = gpl_on_choices.index(attribute)
                except ValueError:
                    pass

        if val_selection is None:
            exit("Selection not found " + str(automatic_attribute) +
                 str(gpl_on_choices))
        val = gpl_on_choices[val_selection]

        merged_values = gse.merge_and_average(gse.gpls[next(iter(gse.gpls))],
                                              "VALUE",
                                              val,
                                              gpl_on_choices,
                                              gpl_on=gpl_on,
                                              gsm_on=gsm_on)
        merged_values.values[np.isnan(merged_values.values)] = 0

        self.merge_len = merged_values.shape[1]

        if labelled:
            self.df[geo_id] = merged_values
            meta_dict = self.make_metadata_matrix(gse, merged_values)
            labels, meta_dict = self.rename_according_to_metadata(
                gse, meta_dict)

            labels = ["".join(label) for label in labels]

            labels = rename_labels(labels)
            labels = rename(labels)

            self.df[geo_id].columns = labels

            if len(labels) > merged_values.shape[1]:
                prompt = input(
                    "Duplicate labels were detected. Keep only the first labels? (answer y only if you are "
                    "sure, otherwise the results could be wrong) [y/n]")
                print("Number of labels:", len(labels))
                print("Values shape:", merged_values.shape)

                if prompt == "y":
                    labels = labels[:merged_values.shape[1]]
                else:
                    exit()
        else:
            self.unlabelled_df[geo_id] = merged_values
            self.unlabelled_df[geo_id].columns = ["no_label"] * len(
                self.unlabelled_df[geo_id].columns)
        if save_to_disk:
            create_missing_folders(path=self.dataframes_path)
            if not bad_example:
                self.files_path[
                    geo_id] = self.dataframes_path + '/' + geo_id + "_labelled" + str(
                        labelled) + '_dataframe'
            else:
                self.files_path[
                    geo_id] = self.dataframes_path + '/' + geo_id + "_labelled" + str(
                        labelled) + '_bad_dataframe'

            print("Saving to " + self.files_path[geo_id])
            if labelled:
                self.df[geo_id].to_pickle(
                    self.files_path[geo_id] +
                    '.pickle.npy')  # Faster to load pickled files
                #self.df[geo_id].to_csv(self.files_path[geo_id] + '.csv') # For vizualisation
            else:
                self.unlabelled_df[geo_id].to_pickle(self.files_path[geo_id] +
                                                     '.pickle.npy')
                #self.unlabelled_df[geo_id].to_csv(self.files_path[geo_id] + '.csv')

        return merged_values
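For context, a small sketch of the GEOparse calls build_dataframe relies on, showing where gsm_on_choices and gpl_on_choices come from (downloading the series requires network access; the accession is the one used in the docstrings above).

import GEOparse

gse = GEOparse.get_GEO(geo="GSE22845", destdir="./softs", silent=True)

# One sample (GSM) and one platform (GPL) table; their column names are the
# candidate join keys offered as gsm_on_choices / gpl_on_choices above.
first_gsm = gse.gsms[next(iter(gse.gsms))]
first_gpl = gse.gpls[next(iter(gse.gpls))]
print(list(first_gsm.columns.index))  # e.g. ['ID_REF', 'VALUE', ...]
print(list(first_gpl.columns.index))  # platform annotation columns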
Esempio n. 28
0
def ordination2d(
        data_frame,
        ORD=PCA,
        images_folder_path="/home/simon/results/hebbian_learning_ann/plots/",
        filename="pca",
        a=0.5):
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    type_images_folder_path = images_folder_path + filename + "/"
    create_missing_folders(type_images_folder_path)
    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except AssertionError:
        print(
            "The data object passed to ordination2d must be a pandas.core.frame.DataFrame. "
            "Returning without producing a PCA plot."
        )
        return

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)

    pca = ORD(n_components=2)

    data_frame.values[np.isnan(data_frame.values)] = 0
    # Samples are columns; transpose so each sample becomes one projected point,
    # matching the per-sample labels in y
    principalComponents = pca.fit_transform(np.transpose(data_frame.values))
    principalDf = pd.DataFrame(
        data=principalComponents,
        columns=['principal component 1', 'principal component 2'])
    finalDf = pd.concat([principalDf, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component PCA', fontsize=20)
    #colors = get_colors()
    for target in classes_list:
        indicesToKeep = finalDf[0] == target
        data1 = finalDf.loc[indicesToKeep, 'principal component 1']
        data2 = finalDf.loc[indicesToKeep, 'principal component 2']
        try:
            assert np.sum(np.isnan(data1)) == 0 and np.sum(
                np.isnan(data2)) == 0
        except AssertionError:
            print("NaNs were detected. Please verify the DataFrame...")
            exit()
        ellipse_data(data1, data2, ax)

        ax.scatter(data1,
                   data2,
                   s=20,
                   alpha=a,
                   linewidths=0,
                   edgecolors='none')
    ax.legend(classes_list)
    ax.grid()

    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, classes_list)

    plt.tight_layout()
    fig.tight_layout()
    fig.savefig(type_images_folder_path + "PCA2d" + filename + ".png")
    plt.close(fig)
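A hedged usage sketch for the function above (random values and made-up class labels; it assumes ordination2d and its helpers create_missing_folders and ellipse_data are importable from the project). The frame's columns are samples and the column names are the class labels, so each column becomes one point in the scatter plot.

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
values = rng.randn(50, 8)                  # 50 hypothetical genes x 8 samples
labels = ["healthy"] * 4 + ["AML"] * 4     # made-up class labels used as column names
df = pd.DataFrame(values, columns=labels)

ordination2d(df, ORD=PCA, images_folder_path="./plots/", filename="pca_demo")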
Esempio n. 29
0
    def merge_datasets(self, fill_missing=True, is_load_from_disk=True, meta_destination_folder="meta_pandas_dataframes"):
        """

        :param fill_missing: if True, rows missing from a dataset are filled with 0s for all samples of that dataset.
                The algorithm may still perform well without some of the information; otherwise, the list of shared rows can get very small

        from utils import get_example_datasets, create_missing_folders
        fill_missing=True
        geo_ids = ["GSE12417","GSE22845"]
        g = get_example_datasets(geo_ids, home_path="/home/simon/", is_load_from_disk=True)
        g.getGEO(geo_ids, is_load_from_disk=True)
        g.merge_datasets(fill_missing=True)

        """

        print("Preparing for merging the selected datasets...")
        import os
        import pandas as pd
        import numpy as np
        self.meta_destination_folder = meta_destination_folder
        self.meta_data_folder_path = "/".join([self.data_folder_path, meta_destination_folder])
        create_missing_folders(self.meta_data_folder_path)
        meta_filename = "_".join(self.geo_ids)

        count = 0
        n_samples_list = [len(self.df[geo_id].columns) for geo_id in self.geo_ids ]
        total_iteration = -n_samples_list[0]
        meta_file = search_meta_file_name(meta_filename,list_meta_files = os.listdir(self.meta_data_folder_path))

        if meta_file is None or is_load_from_disk is False:
            for i in range(len(n_samples_list)):
                for j in range(i,len(n_samples_list)):
                    total_iteration += n_samples_list[j]
            for g,geo_id in enumerate(self.geo_ids):
                if g == 0:
                    meta_df = self.df[geo_id]
                else:
                    if fill_missing:
                        # FIRST find which rows are not already in the meta_df
                        union_rows = np.union1d(self.df[geo_id].index, meta_df.index)
                        all_cols = np.concatenate((self.df[geo_id].columns, meta_df.columns))
                        new_df = pd.DataFrame(0.00000, index=union_rows, columns=all_cols)
                        in1d_rows = np.in1d(new_df.index,self.df[geo_id].index)
                        in1d_rows_meta = np.in1d(new_df.index, meta_df.index)
                        ind_df_rows = np.array(list(range(len(in1d_rows))))[in1d_rows]
                        ind_meta_rows = np.array(list(range(len(in1d_rows_meta))))[in1d_rows_meta]

                        # SPEED BOTTLENECK
                        # TODO IS this optimal?
                        for c,col in enumerate(self.df[geo_id].columns):
                            count += 1
                            if count % 10 == 0:
                                print("Merge progress {:2.0%}".format(count/total_iteration), end="\r")
                            for r,row in enumerate(ind_df_rows):
                                new_df.values[row, c] = self.df[geo_id].values[r,c]

                        # meta_df columns come after the new dataset's columns in new_df,
                        # so offset the column index instead of overwriting the first block
                        col_offset = len(self.df[geo_id].columns)
                        for c,col in enumerate(meta_df.columns):
                            count += 1
                            if count % 10 == 0:
                                print("Merge progress {:2.0%}".format(count/total_iteration), end="\r")
                            for r,row in enumerate(ind_meta_rows):
                                new_df.values[row, col_offset + c] = meta_df.values[r,c]

                    else:
                        df_intersection_meta = np.in1d(self.df[geo_id].index, meta_df.index)
                        meta_intersection_df = np.in1d(meta_df.index, self.df[geo_id].index)
                        tmp = self.df[geo_id][df_intersection_meta]
                        meta_df = meta_df[meta_intersection_df]
                        new_df = pd.concat([tmp, meta_df], axis=1)

                    # carry the merged result forward so the next dataset merges into it
                    meta_df = new_df
                try:
                    assert len(meta_df.index) == len(set(meta_df.index))
                except AssertionError:
                    print("CONTAINS DUPLICATED ROWNAMES")
            self.meta_filename = meta_filename
            self.meta_file_path = '/'.join([self.meta_data_folder_path, self.meta_filename])
            self.meta_df = new_df
            self.meta_df.to_pickle(self.meta_file_path)
        else:
            self.meta_filename = meta_file
            self.meta_file_path = '/'.join([self.meta_data_folder_path, self.meta_filename])
            self.meta_df = pd.read_pickle(self.meta_file_path)
            print("Merged dataset imported from disk.")
def ordination2d(data_frame, ord_type, images_folder_path, dataset_name, epoch, a=0.5, verbose=0, info="none",
                 show_images=True):
    import pandas as pd
    import numpy as np
    # imports needed for this snippet to stand alone (standard sklearn/matplotlib names assumed)
    import matplotlib.pyplot as plt
    from matplotlib import cm
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
    try:
        assert type(data_frame) == pd.core.frame.DataFrame
    except AssertionError:
        print("The data object passed to ordination2d must be a pandas.core.frame.DataFrame. "
              "Returning without producing an ordination plot.")
        print(type(data_frame))
        return
    if type(dataset_name) == list:
        names = [name for name in dataset_name]
        dataset_name = "_".join(names)

    y = np.array(data_frame.columns, dtype=str)
    classes_list = np.unique(y)
    data_frame.values[np.isnan(data_frame.values)] = 0
    ordn = None  # avoid shadowing the builtin ord()
    supervised = False
    if ord_type in ["pca", "PCA"]:
        supervised = False
        ordn = PCA(n_components=2)
    elif ord_type in ["tsne", "tSNE", "TSNE", "t-sne", "T-SNE", "t-SNE"]:
        supervised = False
        ordn = TSNE(n_components=2, verbose=verbose)
    elif ord_type in ["lda", "LDA", "flda", "FLDA"]:
        supervised = True
        ordn = LDA(n_components=2)
    elif ord_type in ["qda", "QDA"]:
        ordn = QDA()
        supervised = True
    else:
        exit("No ordination of that name is implemented. Exiting...")
    # Samples are columns, so the value matrix is transposed before fitting;
    # supervised methods (LDA/QDA) also need the class labels taken from the column names.
    if supervised:
        principal_components = ordn.fit_transform(np.transpose(data_frame.values), y=y)
    else:
        principal_components = ordn.fit_transform(np.transpose(data_frame.values))

    principal_df = pd.DataFrame(data=principal_components, columns=['principal component 1', 'principal component 2'])
    final_df = pd.concat([principal_df, pd.DataFrame(y)], axis=1)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('2 component Ordination', fontsize=20)

    colors = cm.viridis(np.linspace(0, 1, len(classes_list)))
    for t, target in enumerate(classes_list):
        indices_to_keep = final_df[0] == target
        indices_to_keep = list(indices_to_keep)
        data1 = final_df.loc[indices_to_keep, 'principal component 1']
        data2 = final_df.loc[indices_to_keep, 'principal component 2']
        try:
            assert np.sum(np.isnan(data1)) == 0 and np.sum(np.isnan(data2)) == 0
        except AssertionError:
            print("NaNs were detected. Please verify the DataFrame...")
            exit()
        ellipse_data(data1, data2, ax, colors[t])

        ax.scatter(data1, data2, s=20, alpha=a, linewidths=0.5, edgecolor='k', c=colors[t])
    ax.legend(classes_list)
    ax.grid()

    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles, classes_list)

    try:
        plt.tight_layout()
        fig.tight_layout()
    except Exception:
        pass
    type_images_folder_path = "/".join([images_folder_path, str(ord_type), str(dataset_name)]) + "/"
    type_images_folder_path = type_images_folder_path + info + "/"

    create_missing_folders(type_images_folder_path)

    plt.savefig(type_images_folder_path + info + "_" + str(epoch) + ".png", dpi=100)
    if show_images:
        plt.show()
    plt.close(fig)
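Finally, a hedged call of this second ordination2d (paths, dataset name and epoch are placeholders; the class labels again come from the column names, and the project helpers create_missing_folders and ellipse_data are assumed to be importable).

import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
df = pd.DataFrame(rng.randn(100, 10),
                  columns=["healthy"] * 5 + ["AML"] * 5)  # made-up labels

ordination2d(df, ord_type="pca", images_folder_path="./plots",
             dataset_name="demo", epoch=0, info="demo_run", show_images=False)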