from dask_ml.decomposition import PCA
import dask.array as da
import numpy as np
import time


def preprocessFeatures(Features, i):
    print("----")
    print("Extracting the Principal Components of the Features. Please wait...")
    t1 = time.time()
    # Prepare Features
    ## Transpose the dataset
    data1 = np.transpose(Features)
    ## Perform PCA to reduce the number of Features
    data2 = da.from_array(data1, chunks=data1.shape)
    pca = PCA(n_components=i)
    pca.fit(data2)
    ## Get the total variance explained by the selected Principal Components
    var = 0
    for ratio in pca.explained_variance_ratio_:
        var += ratio
    print("Total Variance:")
    print(var)
    ## Print the Principal Component Scores
    # print(pca.singular_values_)
    ## Get the Principal Components
    PC = pca.components_
    X = np.transpose(PC)
    print(" ")
    print("PCA Duration: %s minutes" % round((time.time() - t1) / 60, 2))
    return X
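# A minimal usage sketch for preprocessFeatures; the matrix shape and
# component count below are illustrative assumptions, not values from
# the original project.
features = np.random.rand(500, 1000)    # 500 features observed over 1000 samples
X_reduced = preprocessFeatures(features, 10)
print(X_reduced.shape)                  # (500, 10): one row per original feature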
from dask_ml.decomposition import PCA
import dask.array as da
import matplotlib.pyplot as plt
import numpy as np
import random


def getOptimalPCA(transferLearning):
    # loadData and sampleData are defined elsewhere in the project
    data = loadData(transferlearning=transferLearning)
    for n in [200, 400, 1000, 2000, 3000, 4000, 5000]:
        # Set Random Seed
        random.seed(1981)
        X, Y = sampleData(data, n)
        # Prepare Features
        ## Transpose the dataset
        data1 = np.transpose(X)
        ## Perform PCA to reduce the number of Features
        data2 = da.from_array(data1, chunks=data1.shape)
        pca = PCA(n_components=n)
        pca.fit(data2)
        # Plot the Cumulative Explained Variance
        print("Transfer Learning = %s" % transferLearning)
        fig = plt.figure()
        plotTitle = "Elbow Method for Data Size of %s" % n
        fig.suptitle(plotTitle)
        plt.plot(np.cumsum(pca.explained_variance_ratio_))
        plt.xlabel("Number of Principal Components")
        plt.ylabel("Cumulative Explained Variance")
        plt.show()
import dask.array as da
import numpy as np
from dask_ml.decomposition import PCA


def pca(self):
    '''Dimensionality reduction'''
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    dX = da.from_array(X, chunks=X.shape)
    pca = PCA(n_components=2)
    pca.fit(dX)
    print(pca.explained_variance_ratio_)
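# The method above only fits and prints the variance ratios; to obtain
# the reduced coordinates, dask_ml's fit_transform can be used. A
# self-contained sketch on the same toy data:
import dask.array as da
import numpy as np
from dask_ml.decomposition import PCA

X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
dX = da.from_array(X, chunks=X.shape)
reduced = PCA(n_components=2).fit_transform(dX)  # lazy dask array
print(reduced.compute())                         # materialize the projection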
import dask.array as da
import numpy as np
import zarr
from dask.diagnostics import ProgressBar
from dask_ml.decomposition import PCA


def sample(self, p):
    """Fit a randomized PCA on a random sample of the columns.

    Args:
        p (float): fraction of columns to sample.

    Returns:
        None; prints the fitted PCA estimator.
    """
    group = zarr.open(self.zarr_filepath, mode='r')
    darr1 = da.from_array(group['dummy'], chunks=group['dummy'].chunks)
    with ProgressBar():
        # Keep only the columns whose sum exceeds 1
        darr1 = darr1[:, darr1.sum(axis=0) > 1]
        darr1 = darr1.compute()
    ncols = darr1.shape[1]
    # Sample int(ncols * p) column indices, with replacement
    idx = np.random.randint(0, ncols, int(ncols * p))
    darr1 = darr1[:, idx]
    darr2 = da.from_array(darr1, chunks=(darr1.shape[0], 1000))
    # An alternative estimator is dask_ml's TruncatedSVD; see the sketch below
    pca = PCA(n_components=3, svd_solver='randomized',
              random_state=0, iterated_power=4)
    r = pca.fit(darr2)
    print(r)
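# A minimal sketch of the TruncatedSVD alternative referenced above.
# The array shape here is an assumption; "tsqr" expects the data to be
# chunked along the rows only.
import dask.array as da
from dask_ml.decomposition import TruncatedSVD

X = da.random.random((10000, 20), chunks=(1000, 20))
svd = TruncatedSVD(n_components=3, algorithm="tsqr", random_state=42)
X_reduced = svd.fit_transform(X)        # lazy (10000, 3) dask array
print(svd.explained_variance_ratio_)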
from dask.distributed import Client
import time
import sys

from dask_ml.decomposition import PCA
import dask.dataframe as dd

client = Client(n_workers=4)

t0 = time.time()
data = dd.read_csv(sys.argv[1], header=None)

# Feature columns (note that column 19 is not included)
cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
        11, 12, 13, 14, 15, 16, 17, 18, 20, 21]

pca = PCA(n_components=1, svd_solver='randomized')
pca.fit(data[cols].values)
data_trans = pca.transform(data[cols].values)
data_trans.compute()
print('Elapsed time', time.time() - t0)
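# Note: .values on a dask dataframe yields an array with unknown chunk
# sizes along the rows, which the SVD underlying PCA can fail on. One
# workaround, reusing data and cols from the script above, is to
# materialize the row counts first with to_dask_array(lengths=True):
features = data[cols].to_dask_array(lengths=True)
data_trans = PCA(n_components=1, svd_solver='randomized').fit_transform(features)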
from dask_ml.decomposition import PCA
import dask.array as da
from multinode import starter
import time

# Example allocation of a Dask client
cluster, dash_addr = starter.start_cluster(account_name="GT5DSTC",
                                           job_name="multinode",
                                           n_workers=10,
                                           n_cores=10,
                                           run_time="00:10:00",
                                           mem="60GB",
                                           job_class="S-M")
print("Cluster info: {}".format(cluster))
print("- Dashboard address: {}".format(dash_addr))

# Example run: PCA of 240 GB
# - On login node: 57 s
# - On cluster:    25 s
print("Start example")
x = da.random.random((100000, 300000), chunks=(10000, 10000))
print("- {} GB".format(x.nbytes / 1e9))

pca = PCA(n_components=2)
start = time.time()
pca.fit(x)
end = time.time()
print("- Vars: {}".format(pca.explained_variance_ratio_))
print("- Time: {} s".format(end - start))
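# Once the run finishes, the allocation should be released. Assuming
# the object returned by starter.start_cluster follows the usual dask
# deployment interface:
cluster.close()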