def plot_latent(self, cur_epoch=''):
    # Generating latent space
    print('Generating latent space ...')
    latent_en = self.encode(self.data_plot.x)

    pca = PCA(n_components=2)
    latent_pca = latent_en if latent_en.shape[1] == 2 \
        else latent_en[:, 0:2] if latent_en.shape[1] == 3 \
        else pca.fit_transform(latent_en)
    print('Latent space dimensions: {}'.format(latent_pca.shape))
    print('Plotting latent space ...')
    latent_space = self.config.log_dir + '/latent2d/{} latent epoch {}.jpg'.format(
        self.config.log_dir.split('/')[-1:][0], cur_epoch)
    self.latent_space_files.append(latent_space)
    plot_dataset(latent_pca.compute(), y=self.data_plot.labels, save=latent_space)

    if latent_en.shape[1] >= 3:
        pca = PCA(n_components=3)
        latent_pca = latent_en if latent_en.shape[1] == 3 else pca.fit_transform(latent_en)
        print('Latent space dimensions: {}'.format(latent_pca.shape))
        print('Plotting latent space ...')
        latent_space = self.config.log_dir + '/latent3d/{} latent_3d epoch {}.jpg'.format(
            self.config.log_dir.split('/')[-1:][0], cur_epoch)
        self.latent_space3d_files.append(latent_space)
        plot_dataset3d(latent_pca.compute(), y=self.data_plot.labels, save=latent_space)

    del latent_pca, latent_en
    gc.collect()
def preprocessFeatures(Features, i):
    print("----")
    print("Extracting the Principal Components of the Features. Please wait...")
    t1 = time.time()

    # Prepare Features
    ## Transpose the dataset
    data1 = np.transpose(Features)

    ## Perform PCA to reduce the number of Features
    data2 = da.from_array(data1, chunks=data1.shape)
    pca = PCA(n_components=i)
    pca.fit(data2)

    ## Get the total variance explained by the selected Principal Components
    var = 0
    for ratio in pca.explained_variance_ratio_:
        var += ratio
    print("Total Variance:")
    print(var)

    ## Print the Principal Component Scores
    # print(pca.singular_values_)

    ## Get the Principal Components
    PC = pca.components_
    X = np.transpose(PC)

    print(" ")
    print("PCA Duration: %s minutes" % round((time.time() - t1) / 60, 2))
    return X
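# Usage sketch (not from the original source): it assumes `Features` is a NumPy
# array laid out as features x samples, since preprocessFeatures transposes it
# before fitting PCA; the shapes below are purely illustrative.
import numpy as np

Features = np.random.rand(50, 300)    # 50 features x 300 samples
X = preprocessFeatures(Features, 10)  # keep 10 principal components
print(X.shape)                        # (50, 10): one row of loadings per original feature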
def getOptimalPCA(transferLearning):
    data = loadData(transferlearning=transferLearning)
    for n in [200, 400, 1000, 2000, 3000, 4000, 5000]:
        # Set random seed
        random.seed(1981)
        X, Y = sampleData(data, n)

        # Prepare Features
        ## Transpose the dataset
        data1 = np.transpose(X)

        ## Perform PCA to reduce the number of Features
        data2 = da.from_array(data1, chunks=data1.shape)
        pca = PCA(n_components=n)
        pca.fit(data2)

        # Plot the Cumulative Explained Variance
        print("Transfer Learning = %s" % transferLearning)
        fig = plt.figure()
        plotTitle = "Elbow Method for Data Size of %s" % n
        fig.suptitle(plotTitle)
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.xlabel("Number of Principal Components")
        plt.ylabel("Cumulative Explained Variance")
        plt.show()
def do_pca(g, n_comp):
    """
    Perform a PCA on the genetic array and return n_comp of it

    :param g: Genotype array
    :param n_comp: Number of components sought
    :return: components array
    """
    pca = PCA(n_components=n_comp)
    pca = pca.fit_transform(g)
    return pca
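# Usage sketch (illustrative, not from the original source): `g` is assumed to be
# a dask array of genotypes chunked along rows only; do_pca then returns a lazy
# dask array of the projected samples.
import dask.array as da

g = da.random.random((1000, 200), chunks=(250, 200))  # 1000 samples x 200 variants
components = do_pca(g, n_comp=10)                     # lazy (1000, 10) dask array
print(components.compute().shape)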
def pca(self):
    '''Dimensionality reduction'''
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    dX = da.from_array(X, chunks=X.shape)
    pca = PCA(n_components=2)
    pca.fit(dX)
    print(pca.explained_variance_ratio_)
def do_pca(g, n_comp):
    """
    Perform a PCA on the genetic array and return n_comp of it

    :param g: Genotype array
    :param n_comp: Number of components sought
    :return: components array
    """
    cache = Chest(available_memory=available_memory, path=os.getcwd())
    pca = PCA(n_components=n_comp)
    pca = pca.fit_transform(g)
    return pca.compute(cache=cache)
def sample(self, p):
    """
    Args:
        p: fraction of the informative columns to keep

    Returns:
        zarr.array
    """
    group = zarr.open(self.zarr_filepath, mode='r')
    darr1 = da.from_array(group['dummy'], chunks=group['dummy'].chunks)
    with ProgressBar():
        darr1 = darr1[:, darr1.sum(axis=0) > 1]
        darr1 = darr1.compute()
    ncols = darr1.shape[1]
    idx = np.random.randint(0, ncols, int(ncols * p))
    darr1 = darr1[:, idx]
    darr2 = da.from_array(darr1, chunks=(darr1.shape[0], 1000))
    # darr1 = darr1.compute()
    # print(darr1)
    # darr2 = da.from_array(darr1, chunks=darr1.chunks)
    # svd_r = dd.TruncatedSVD(components=3, algorithm="tsqr", random_state=42)
    pca = PCA(n_components=3, svd_solver='randomized', random_state=0, iterated_power=4)
    # pca = PCA(n_components=2, random_state=34, svd_solver='randomized')
    r = pca.fit(darr2)
    print(r)
def generate_samples(self, data, session, cur_epoch=''):
    # Generating W space
    print('Generating W space ...')
    w_en = self.encode(data.x)

    pca = PCA(n_components=2)
    W_pca = pca.fit_transform(w_en)
    print('W space dimensions: {}'.format(W_pca.shape))
    print('Plotting W space ...')
    w_space = self.summary_dir + '/{} W space in epoch {}.jpg'.format(
        self.summary_dir.split('/')[-1:][0], cur_epoch)
    self.w_space_files.append(w_space)
    plot_dataset(W_pca.compute(), y=data.labels, save=w_space)

    pca = PCA(n_components=3)
    W_pca = pca.fit_transform(w_en)
    print('W space dimensions: {}'.format(W_pca.shape))
    print('Plotting W space ...')
    w_space = self.summary_dir + '/{} W space 3d in epoch {}.jpg'.format(
        self.summary_dir.split('/')[-1:][0], cur_epoch)
    self.w_space3d_files.append(w_space)
    plot_dataset3d(W_pca.compute(), y=data.labels, save=w_space)

    del W_pca, w_en
    gc.collect()

    # Generating Samples
    print('Generating Samples ...')
    x_recons_l = self.reconst(data.samples)
    recons_file = self.summary_dir + '/{} samples generation in epoch {}.jpg'.format(
        self.summary_dir.split('/')[-1:][0], cur_epoch)
    self.recons_files.append(recons_file)
    plot_samples(x_recons_l, scale=10, save=recons_file)
    del x_recons_l
    gc.collect()
def plot_latent(self, cur_epoch=''):
    # Generating latent space
    if self.latent_data is None:
        self.generate_latent(self.data_train, self.session, self.config.ntrain_batches)

    pca = PCA(n_components=2)
    latent_pca = self.latent_data['latent'] if self.latent_data['latent'].shape[1] == 2 \
        else self.latent_data['latent'][:, 0:2] if self.latent_data['latent'].shape[1] == 3 \
        else pca.fit_transform(self.latent_data['latent'])
    print('Latent space dimensions: {}'.format(latent_pca.shape))
    print('Plotting latent space ...')
    latent_space = self.config.log_dir + '/latent2d/{} latent epoch {}.jpg'.format(
        self.config.log_dir.split('/')[-1:][0], cur_epoch)
    plot_dataset(
        latent_pca.compute(),
        y=self.latent_data['label'][:, self.latent_data['y_index']].compute(),
        save=latent_space)

    if self.latent_data['latent'].shape[1] >= 3:
        pca = PCA(n_components=3)
        latent_pca = self.latent_data['latent'] if self.latent_data['latent'].shape[1] == 3 \
            else pca.fit_transform(self.latent_data['latent'])
        print('Latent space dimensions: {}'.format(latent_pca.shape))
        print('Plotting latent space ...')
        latent_space = self.config.log_dir + '/latent3d/{} latent_3d epoch {}.jpg'.format(
            self.config.log_dir.split('/')[-1:][0], cur_epoch)
        plot_dataset3d(
            latent_pca.compute(),
            y=self.latent_data['label'][:, self.latent_data['y_index']].compute(),
            save=latent_space)

    del latent_pca
    gc.collect()
from dask.distributed import Client
import time
import sys

from dask_ml.decomposition import PCA
import dask.dataframe as dd

client = Client(n_workers=4)

t0 = time.time()
data = dd.read_csv(sys.argv[1], header=None)

# Feature columns (column 19 is skipped).
cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21]
# Note: .values on a dask DataFrame may yield an array with unknown chunk sizes;
# if PCA complains, data[cols].to_dask_array(lengths=True) is one way around that.
pca = PCA(n_components=1, svd_solver='randomized')
pca.fit(data[cols].values)
data_trans = pca.transform(data[cols].values)
data_trans.compute()
print('Elapsed time:', time.time() - t0)
fullIVs = dset[:, x_slice, y_slice].rechunk(chunks=(-1, 10 * coarsen, 10 * coarsen))
coarseIVs = IVs[:, ::coarsen, ::coarsen].reshape((IVs.shape[0], -1)).T.persist()
IVs

# -

# Get metadata from netCDF file for plotting
EGY = xdata.Energy_set
multiplier = xdata.multiplier
plt.plot(EGY, multiplier)

# ## Principal Component Analysis
# To reduce the data to a reasonable number of dimensions, we use a pipeline of a StandardScaler and PCA:

pca = PCA(n_components=dimensions, whiten=True, random_state=4)
pipe = make_pipeline(StandardScaler(), pca)
pipe_names = '_'.join(pipe.named_steps.keys())
pipe.fit(coarseIVs)  # Fit the standard scaler and PCA vectors to the coarsened data

plt.figure(figsize=[3.5, 3.5])
scree = np.concatenate([[0], pipe.named_steps['pca'].explained_variance_ratio_])
plt.scatter(np.arange(dimensions) + 1,
            scree[1:],
            label='relative',
            facecolors='none',
            edgecolors=colors,
            linewidth=2)
plt.scatter(np.arange(dimensions + 1),
def pca(dataset: DataSet, num_components: int):
    pca = PCA(n_components=num_components)
    reduced = pca.fit_transform(dataset.feature_vector)
    components = pca.components_
    return DataSet(reduced), components
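# Usage sketch (hypothetical): the project's DataSet class is not shown here, so a
# minimal stand-in with a `feature_vector` attribute is defined below purely to make
# the example self-contained.
import dask.array as da

class DataSet:  # stand-in for the project's DataSet
    def __init__(self, feature_vector):
        self.feature_vector = feature_vector

X = da.random.random((500, 20), chunks=(100, 20))
reduced_ds, components = pca(DataSet(X), num_components=3)
print(reduced_ds.feature_vector.shape)  # (500, 3)
print(components.shape)                 # (3, 20)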
from dask_ml.decomposition import PCA
import dask.array as da
from multinode import starter
import time

# Example allocation of Dask client
cluster, dash_addr = starter.start_cluster(account_name="GT5DSTC",
                                           job_name="multinode",
                                           n_workers=10,
                                           n_cores=10,
                                           run_time="00:10:00",
                                           mem="60GB",
                                           job_class="S-M")
print("Cluster info: {}".format(cluster))
print("- Dashboard address: {}".format(dash_addr))

# Example run: PCA of 240 GB
# - On login node: 57 s
# - On cluster: 25 s
print("Start example")
x = da.random.random((100000, 300000), chunks=(10000, 10000))
print("- {} GB".format(x.nbytes / 1e9))
pca = PCA(n_components=2)
start = time.time()
pca.fit(x)
end = time.time()
print("- Vars: {}".format(pca.explained_variance_ratio_))
print("- Time: {} s".format(end - start))
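# A minimal local-machine variant (assumption: the multinode starter module is not
# available); the same PCA fit runs against a LocalCluster on a much smaller random
# array so it fits comfortably in memory.
from dask.distributed import Client, LocalCluster
import dask.array as da
from dask_ml.decomposition import PCA

cluster = LocalCluster(n_workers=4, threads_per_worker=1)
client = Client(cluster)

x_small = da.random.random((50000, 2000), chunks=(10000, 2000))
pca_local = PCA(n_components=2)
pca_local.fit(x_small)
print(pca_local.explained_variance_ratio_)

client.close()
cluster.close()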