def plot_latent(self, cur_epoch=''):
        # Generating latent space
        print('Generating latent space ...')
        latent_en = self.encode(self.data_plot.x)

        pca = PCA(n_components=2)
        if latent_en.shape[1] == 2:
            latent_pca = latent_en                      # already 2-D
        elif latent_en.shape[1] == 3:
            latent_pca = latent_en[:, 0:2]              # take the first two axes
        else:
            latent_pca = pca.fit_transform(latent_en)   # project down to 2-D

        print('Latent space dimensions: {}'.format(latent_pca.shape))
        print('Plotting latent space ...')
        latent_space = self.config.log_dir + '/latent2d/{} latent epoch {}.jpg'.format(
            self.config.log_dir.split('/')[-1], cur_epoch)
        self.latent_space_files.append(latent_space)
        plot_dataset(latent_pca.compute(), y=self.data_plot.labels, save=latent_space)

        if latent_en.shape[1] >= 3:
            pca = PCA(n_components=3)
            latent_pca = latent_en if latent_en.shape[1] == 3 else pca.fit_transform(latent_en)

            print('Latent space dimensions: {}'.format(latent_pca.shape))
            print('Plotting latent space ...')
            latent_space = self.config.log_dir + '/latent3d/{} latent_3d epoch {}.jpg'.format(
                self.config.log_dir.split('/')[-1], cur_epoch)
            self.latent_space3d_files.append(latent_space)
            plot_dataset3d(latent_pca.compute(), y=self.data_plot.labels, save=latent_space)

        del latent_pca, latent_en
        gc.collect()
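
# The 2-D dispatch above (pass through at two columns, slice at three,
# project otherwise) recurs in the second plot_latent further down. A minimal
# sketch of it as a standalone helper, assuming a Dask array input;
# `reduce_to_2d` is a hypothetical name, not from the source:
from dask_ml.decomposition import PCA

def reduce_to_2d(latent):
    # Pass 2-D data through, slice 3-D data, and PCA-project anything wider.
    if latent.shape[1] == 2:
        return latent
    if latent.shape[1] == 3:
        return latent[:, 0:2]
    return PCA(n_components=2).fit_transform(latent)
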
def preprocessFeatures(Features, i):
    print("----")
    print("Extracting the Principal Components of the Features. Please wait...")
    t1 = time.time()

    # Prepare Features
    ## Transpose the dataset
    data1 = np.transpose(Features)

    ## Perform PCA to reduce the number of features
    data2 = da.from_array(data1, chunks=data1.shape)
    pca = PCA(n_components=i)
    pca.fit(data2)

    ## Total variance explained by the selected principal components
    ## (avoid a loop variable that shadows the `i` parameter)
    var = pca.explained_variance_ratio_.sum()
    print("Total Variance:")
    print(var)

    ## Print the Principal Component Scores
    #print(pca.singular_values_)

    ## Get the Principal Components
    PC = pca.components_
    X = np.transpose(PC)

    print(" ")
    print("PCA Duration: %s minutes" % round((time.time() - t1) / 60, 2))

    return X
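
# A hypothetical usage sketch (random data, assumed shapes, not from the
# source): because preprocessFeatures transposes its input before fitting,
# a (n_features, n_samples) matrix yields loadings of shape
# (n_features, n_components).
import numpy as np

Features = np.random.rand(200, 1000)   # illustrative: 200 features x 1000 samples
X = preprocessFeatures(Features, 10)   # transposed component loadings
print(X.shape)                         # -> (200, 10)
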
Example #3
def getOptimalPCA(transferLearning):
    data = loadData(transferlearning=transferLearning)

    for n in [200, 400, 1000, 2000, 3000, 4000, 5000]:
        # Set Random Seed
        random.seed(1981)

        X, Y = sampleData(data, n)

        # Prepare Features

        ## Transpose the dataset
        data1 = np.transpose(X)

        ## Perform PCA to reduce the number of features
        data2 = da.from_array(data1, chunks=data1.shape)
        pca = PCA(n_components=n)
        pca.fit(data2)

        # Plot the Cumulative Explained Variance
        print("Transfer Learning = %s" % transferLearning)

        fig = plt.figure()
        plotTitle = "Elbow Method for Data Size of %s" % n
        fig.suptitle(plotTitle)
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.xlabel("Number of Principal Components")
        plt.ylabel("Cumulative Explained Variance")
        plt.show()
Example #4
    def do_pca(g, n_comp):
        """
        Perform a PCA on the genetic array and return n_comp of it

        :param g: Genotype array
        :param n_comp: Number of components sought
        :return: components array
        """
        pca = PCA(n_components=n_comp)
        pca = pca.fit_transform(g)
        return pca
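
# The array fit_transform returns here is lazy; a usage sketch with stand-in
# data (the genotype matrix below is illustrative, not from the source):
import dask.array as da

g = da.random.random((1000, 50), chunks=(250, 50))  # stand-in genotype matrix
components = do_pca(g, n_comp=5)
print(components.compute().shape)                   # -> (1000, 5)
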
Example #5
    def pca(self):
        '''
        Toy dimensionality-reduction demo: fit a 2-component PCA on a tiny array
        '''

        X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
        dX = da.from_array(X, chunks=X.shape)
        pca = PCA(n_components=2)
        pca.fit(dX)
        print(pca.explained_variance_ratio_)
Example #6
    def do_pca(g, n_comp):
        """
        Perform a PCA on the genetic array and return n_comp of it

        :param g: Genotype array
        :param n_comp: Number of components sought
        :return: components array
        """
        cache = Chest(available_memory=available_memory, path=os.getcwd())
        pca = PCA(n_components=n_comp)
        pca = pca.fit_transform(g)
        return pca.compute(cache=cache)
Example #7
    def sample(self, p):
        """

        Args:
            None

        Returns:
            zarr.array

        """

        group = zarr.open(self.zarr_filepath, mode='r')
        darr1 = da.from_array(group['dummy'], chunks=group['dummy'].chunks)

        with ProgressBar():

            darr1 = darr1[:, darr1.sum(axis=0) > 1]
            darr1 = darr1.compute()
            ncols = darr1.shape[1]
            idx = np.random.randint(0, ncols, int(ncols * p))
            darr1 = darr1[:, idx]

            # Rechunk the sampled columns for the distributed PCA
            darr2 = da.from_array(darr1, chunks=(darr1.shape[0], 1000))

            pca = PCA(n_components=3,
                      svd_solver='randomized',
                      random_state=0,
                      iterated_power=4)
            r = pca.fit(darr2)
            print(r)
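
# Caveat: np.random.randint above draws column indices with replacement, so a
# column can be sampled twice. If distinct columns are wanted, a sketch of a
# drop-in replacement for that line:
idx = np.random.choice(ncols, size=int(ncols * p), replace=False)
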
Example #8
    def generate_samples(self, data, session, cur_epoch=''):
        # Generating W space
        print('Generating W space ...')
        w_en = self.encode(data.x)

        pca = PCA(n_components=2)
        W_pca = pca.fit_transform(w_en)
        print('W space dimensions: {}'.format(W_pca.shape))
        print('Plotting W space ...')
        w_space = self.summary_dir + '/{} W space in epoch {}.jpg'.format(
            self.summary_dir.split('/')[-1], cur_epoch)
        self.w_space_files.append(w_space)

        plot_dataset(W_pca.compute(), y=data.labels, save=w_space)

        pca = PCA(n_components=3)
        W_pca = pca.fit_transform(w_en)
        print('W space dimensions: {}'.format(W_pca.shape))
        print('Plotting W space ...')
        w_space = self.summary_dir + '/{} W space 3d in epoch {}.jpg'.format(
            self.summary_dir.split('/')[-1], cur_epoch)
        self.w_space3d_files.append(w_space)
        plot_dataset3d(W_pca.compute(), y=data.labels, save=w_space)

        del W_pca, w_en
        gc.collect()

        # Generating Samples
        print('Generating Samples ...')

        x_recons_l = self.reconst(data.samples)
        recons_file = self.summary_dir + '/{} samples generation in epoch {}.jpg'.format(
            self.summary_dir.split('/')[-1], cur_epoch)
        self.recons_files.append(recons_file)
        plot_samples(x_recons_l, scale=10, save=recons_file)

        del x_recons_l
        gc.collect()
Example #9
    def plot_latent(self, cur_epoch=''):
        # Generating latent space
        if self.latent_data is None:
            self.generate_latent(self.data_train, self.session,
                                 self.config.ntrain_batches)

        latent = self.latent_data['latent']
        pca = PCA(n_components=2)
        if latent.shape[1] == 2:
            latent_pca = latent                      # already 2-D
        elif latent.shape[1] == 3:
            latent_pca = latent[:, 0:2]              # take the first two axes
        else:
            latent_pca = pca.fit_transform(latent)   # project down to 2-D

        print('Latent space dimensions: {}'.format(latent_pca.shape))
        print('Plotting latent space ...')
        latent_space = self.config.log_dir + '/latent2d/{} latent epoch {}.jpg'.format(
            self.config.log_dir.split('/')[-1], cur_epoch)
        labels = self.latent_data['label'][:, self.latent_data['y_index']]
        plot_dataset(latent_pca.compute(), y=labels.compute(), save=latent_space)

        if latent.shape[1] >= 3:
            pca = PCA(n_components=3)
            latent_pca = latent if latent.shape[1] == 3 else pca.fit_transform(latent)

            print('Latent space dimensions: {}'.format(latent_pca.shape))
            print('Plotting latent space ...')
            latent_space = self.config.log_dir + '/latent3d/{} latent_3d epoch {}.jpg'.format(
                self.config.log_dir.split('/')[-1], cur_epoch)
            plot_dataset3d(latent_pca.compute(), y=labels.compute(),
                           save=latent_space)

        del latent_pca
        gc.collect()
Example #10
from dask.distributed import Client
import time
import sys
from dask_ml.decomposition import PCA
import dask.dataframe as dd

client = Client(n_workers=4)
t0 = time.time()
data = dd.read_csv(sys.argv[1], header=None)
cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        11, 12, 13, 14, 15, 16, 17, 18, 20, 21]
pca = PCA(n_components=1, svd_solver='randomized')
pca.fit(data[cols].values)
data_trans = pca.transform(data[cols].values)
data_trans.compute()
print('Elapsed time', time.time() - t0)
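
# dask-ml's PCA needs Dask arrays with known chunk sizes, and .values on a
# DataFrame loaded via read_csv typically leaves the row chunks unknown. If
# fit rejects the array for that reason, one workaround is
# to_dask_array(lengths=True), which counts the rows per chunk up front
# (a sketch, assuming the same CSV layout as above):
X = data[cols].to_dask_array(lengths=True)
pca = PCA(n_components=1, svd_solver='randomized')
pca.fit(X)
data_trans = pca.transform(X)
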
Example #11
IVs = dset[:, x_slice,
           y_slice].rechunk(chunks=(-1, 10 * coarsen, 10 * coarsen))
coarseIVs = IVs[:, ::coarsen, ::coarsen].reshape(
    (IVs.shape[0], -1)).T.persist()
IVs

# Get metadata from netCDF file for plotting
EGY = xdata.Energy_set
multiplier = xdata.multiplier
plt.plot(EGY, multiplier)

# ## Principal Component Analysis
# To reduce the data to a manageable number of dimensions, we use a pipeline of a StandardScaler followed by PCA:

pca = PCA(n_components=dimensions, whiten=True, random_state=4)
pipe = make_pipeline(StandardScaler(), pca)
pipe_names = '_'.join(pipe.named_steps.keys())
# Fit the standard scaler and PCA vectors to the coarsened data
pipe.fit(coarseIVs)

plt.figure(figsize=[3.5, 3.5])
scree = np.concatenate([[0],
                        pipe.named_steps['pca'].explained_variance_ratio_])
plt.scatter(np.arange(dimensions) + 1,
            scree[1:],
            label='relative',
            facecolors='none',
            edgecolors=colors,
            linewidth=2)
plt.scatter(np.arange(dimensions + 1),
            np.cumsum(scree),
            label='cumulative')  # assumed completion: the original line was cut off here
Example #12
def pca(dataset: DataSet, num_components: int):
    pca = PCA(n_components=num_components)
    reduced = pca.fit_transform(dataset.feature_vector)
    components = pca.components_

    return DataSet(reduced), components
Example #13
from dask_ml.decomposition import PCA
import dask.array as da
from multinode import starter
import time

# Example allocation of Dask client
cluster, dash_addr = starter.start_cluster(account_name="GT5DSTC",
                                           job_name="multinode",
                                           n_workers=10,
                                           n_cores=10,
                                           run_time="00:10:00",
                                           mem="60GB",
                                           job_class="S-M")
print("Cluster info: {}".format(cluster))
print("- Dashboard address: {}".format(dash_addr))

# Example run: PCA of 240 GB
# - On login node: 57 s
# - On cluster: 25 s
print("Start example")
x = da.random.random((100000, 300000), chunks=(10000, 10000))
print("- {} GB".format(x.nbytes / 1e9))
pca = PCA(n_components=2)
start = time.time()
pca.fit(x)
end = time.time()
print("- Vars: {}".format(pca.explained_variance_ratio_))
print("- Time: {} s".format(end - start))