def preprocessFeatures(Features, i):
    """Fit a PCA on the transpose of ``Features`` and return its axes.

    Progress messages, the summed explained-variance ratio and the elapsed
    time (in minutes) are printed to stdout.

    Args:
        Features: Array-like accepted by ``np.transpose``.
            NOTE(review): presumably laid out as (samples, features), so the
            PCA is fitted with the original samples as columns -- confirm
            against the caller.
        i: Number of principal components to keep (``n_components``).

    Returns:
        The transpose of ``pca.components_`` for the fitted model.
    """
    print("----")
    print(
        "Extracting the Principal Components of the Features. Please wait...")
    started = time.time()

    # The model is fitted on the transposed input, wrapped as a dask array
    # made of one single chunk covering the whole matrix.
    transposed = np.transpose(Features)
    model = PCA(n_components=i)
    model.fit(da.from_array(transposed, chunks=transposed.shape))

    # Share of the total variance captured by the retained components.
    print("Total Variance:")
    print(sum(model.explained_variance_ratio_))

    # Principal axes, flipped so that each component becomes a column.
    axes = np.transpose(model.components_)

    print(" ")
    elapsed_minutes = round((time.time() - started) / 60, 2)
    print("PCA Duration: %s minutes" % elapsed_minutes)

    return axes
def getOptimalPCA(transferLearning):
    """Plot the cumulative explained variance of a PCA for several sizes.

    For each size ``n`` in a fixed list, a sample is obtained with
    ``sampleData(data, n)``, a PCA with ``n`` components is fitted on the
    transposed sample, and the cumulative explained-variance curve is shown
    in its own figure.

    Args:
        transferLearning: Forwarded to ``loadData`` as ``transferlearning``
            and echoed in the printed header.

    Returns:
        None. The function only prints and displays figures.
    """
    dataset = loadData(transferlearning=transferLearning)
    sample_sizes = (200, 400, 1000, 2000, 3000, 4000, 5000)

    for size in sample_sizes:
        # Re-seed before every draw so each size starts from the same
        # random state.
        random.seed(1981)

        # Only the first element of the returned pair is used below.
        features, _unused = sampleData(dataset, size)

        # Fit on the transposed sample, held as a single-chunk dask array.
        transposed = np.transpose(features)
        model = PCA(n_components=size)
        model.fit(da.from_array(transposed, chunks=transposed.shape))

        print("Transfer Learning = %s" % transferLearning)

        # NOTE(review): both `pyplot` and `plt` are used here, as in the
        # original; presumably they alias the same matplotlib module.
        figure = pyplot.figure()
        figure.suptitle("Elbow Method for Data Size of %s" % size)
        cumulative_variance = np.cumsum(model.explained_variance_ratio_)
        plt.plot(cumulative_variance)
        plt.xlabel("Number of Principal Components")
        plt.ylabel("Cumulative Explained Variance")
        pyplot.show()
    def pca(self):
        '''
        Dimensionality reduction demo: fit a two-component PCA on a small
        hard-coded 6x2 array and print the explained variance ratios.
        '''
        points = np.array(
            [[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

        # The whole array is handed to dask as a single chunk.
        model = PCA(n_components=2)
        model.fit(da.from_array(points, chunks=points.shape))

        print(model.explained_variance_ratio_)
Exemple #4
0
    def sample(self, p):
        """
        Fit a three-component randomized PCA on randomly drawn columns of
        the ``'dummy'`` array stored in the zarr store at
        ``self.zarr_filepath``.

        Args:
            p: Multiplier applied to the number of retained columns;
                ``int(ncols * p)`` column indices are drawn with
                ``np.random.randint``, so the same column may be drawn
                more than once.

        Returns:
            None. The value returned by ``PCA.fit`` is printed.

        """
        store = zarr.open(self.zarr_filepath, mode='r')
        source = store['dummy']
        lazy = da.from_array(source, chunks=source.chunks)

        with ProgressBar():
            # Keep the columns whose sum is greater than 1 and load the
            # result into memory.
            kept = lazy[:, lazy.sum(axis=0) > 1].compute()

            # Draw the column indices and take the matching columns.
            n_kept = kept.shape[1]
            picks = np.random.randint(0, n_kept, int(n_kept * p))
            subset = kept[:, picks]

            # Back to dask: all rows per chunk, 1000 columns per chunk.
            rechunked = da.from_array(subset, chunks=(subset.shape[0], 1000))

            model = PCA(
                n_components=3,
                svd_solver='randomized',
                random_state=0,
                iterated_power=4,
            )
            fitted = model.fit(rechunked)
            print(fitted)
Exemple #5
0
from dask.distributed import Client
import time
import sys
from dask_ml.decomposition import PCA
import dask.dataframe as dd

def main():
    """Time a one-component randomized PCA fit and transform on a CSV file.

    The CSV path is taken from ``sys.argv[1]`` and read without a header
    row. The elapsed wall-clock time is printed at the end.

    Raises:
        IndexError: If no path is given on the command line.
    """
    # Columns fed to the PCA. Index 19 is absent from the list, exactly as
    # in the original script. NOTE(review): presumably deliberate -- confirm.
    feature_columns = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21
    ]

    # Using the client as a context manager shuts down the local workers it
    # started once the run is over, instead of leaving them to process exit.
    with Client(n_workers=4):
        t0 = time.time()
        data = dd.read_csv(sys.argv[1], header=None)

        # Build the lazy feature array once and reuse it for both steps.
        features = data[feature_columns].values

        pca = PCA(n_components=1, svd_solver='randomized')
        pca.fit(features)
        data_trans = pca.transform(features)

        # The transform is lazy; compute() forces it to run so that it is
        # included in the measured time. The result itself is not needed.
        data_trans.compute()
        print('Tiempo transcurrido', time.time() - t0)


# Starting local worker processes at import time breaks on platforms that
# spawn new interpreters (each worker would re-import and re-run this
# script), so the work only runs when the file is executed directly.
if __name__ == "__main__":
    main()
Exemple #6
0
from dask_ml.decomposition import PCA
import dask.array as da
from multinode import starter
import time

# Example allocation of a Dask cluster through the project's `starter`
# helper; it hands back the cluster object and the dashboard address.
cluster_options = dict(
    account_name="GT5DSTC",
    job_name="multinode",
    n_workers=10,
    n_cores=10,
    run_time="00:10:00",
    mem="60GB",
    job_class="S-M",
)
cluster, dash_addr = starter.start_cluster(**cluster_options)
print("Cluster info: {}".format(cluster))
print("- Dashboard address: {}".format(dash_addr))

# Example run: PCA of 240 GB
# Timings reported by the original author:
# - On login node: 57 s
# - On cluster: 25 s
print("Start example")

# Random 100000 x 300000 matrix, split into 10000 x 10000 chunks.
x = da.random.random((100000, 300000), chunks=(10000, 10000))
print("- {} GB".format(x.nbytes / 1e9))

# Only the call to fit() is timed.
pca = PCA(n_components=2)
fit_started = time.time()
pca.fit(x)
fit_seconds = time.time() - fit_started

print("- Vars: {}".format(pca.explained_variance_ratio_))
print("- Time: {} s".format(fit_seconds))