Example #1
import daal4py as d4p
import numpy as np


def main():
    infile = "./data/batch/pca_normalized.csv"
    method = 'svdDense'

    # configure a PCA object
    algo = d4p.pca(method=method,
                   resultsToCompute="mean|variance|eigenvalue",
                   isDeterministic=True)

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = read_csv(infile, range(10))
    result2 = algo.compute(data)

    # PCA result objects provide eigenvalues, eigenvectors, means and variances
    assert np.allclose(result1.eigenvalues, result2.eigenvalues)
    assert np.allclose(result1.eigenvectors, result2.eigenvectors)
    assert np.allclose(result1.means, result2.means)
    assert np.allclose(result1.variances, result2.variances)
    assert result1.eigenvalues.shape == (1, data.shape[1])
    assert result1.eigenvectors.shape == (data.shape[1], data.shape[1])
    assert result1.means.shape == (1, data.shape[1])
    assert result1.variances.shape == (1, data.shape[1])

    return result1
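
Several of these examples (#1, #3, #8) assume a pandas-based read_csv helper from the daal4py example suite. Here is a minimal sketch of such a helper for headerless comma-separated input; the exact signature and dtype default are assumptions:

import numpy as np
import pandas as pd

def read_csv(f, c=None, t=np.float64):
    # read the selected columns of a headerless CSV into a float ndarray
    return pd.read_csv(f, usecols=c, delimiter=',',
                       header=None, dtype=t).values
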
Example #2
import daal4py as d4p


def compute(data, nComponents):
    # configure a PCA object and perform PCA
    pca_algo = d4p.pca(isDeterministic=True, resultsToCompute="mean|variance|eigenvalue")
    pca_res = pca_algo.compute(data)
    # Apply transform with whitening because means and eigenvalues are provided
    pcatrans_algo = d4p.pca_transform(nComponents=nComponents)
    return pcatrans_algo.compute(data, pca_res.eigenvectors, pca_res.dataForTransform)
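
A hypothetical driver for compute() above, projecting random 10-column data onto its first two principal components (the shape and data are illustrative only):

import numpy as np

X = np.random.rand(100, 10)
res = compute(X, nComponents=2)
# the pca_transform result exposes the projected data
print(res.transformedData.shape)  # expected: (100, 2)
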
Example #3
import daal4py as d4p
import numpy as np


def main(readcsv=read_csv, method='svdDense'):
    infile = "./data/batch/pca_normalized.csv"

    # 'normalization' is an optional parameter to PCA; we use z-score, which could be configured differently
    zscore = d4p.normalization_zscore()
    # configure a PCA object
    algo = d4p.pca(resultsToCompute="mean|variance|eigenvalue",
                   isDeterministic=True,
                   normalization=zscore)

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = readcsv(infile)
    result2 = algo.compute(data)

    # PCA result objects provide eigenvalues, eigenvectors, means and variances
    assert np.allclose(result1.eigenvalues, result2.eigenvalues)
    assert np.allclose(result1.eigenvectors, result2.eigenvectors)
    assert np.allclose(result1.means, result2.means)
    assert np.allclose(result1.variances, result2.variances)
    assert result1.eigenvalues.shape == (1, data.shape[1])
    assert result1.eigenvectors.shape == (data.shape[1], data.shape[1])
    assert result1.means.shape == (1, data.shape[1])
    assert result1.variances.shape == (1, data.shape[1])

    return result1
Example #4
    def pca(self, Data_Path, target, n):
        '''
        daal4py PCA SPMD Mode
        '''

        # Initialize SPMD mode
        d4p.daalinit(nthreads=n)

        # Train setup
        file_path = Data_Path + str(d4p.my_procid() + 1) + ".csv"
        data = pd.read_csv(file_path)
        data = data.drop(target, axis=1)

        # configure a PCA object
        algo = d4p.pca(method='svdDense', distributed=True)

        self.logger.info('Training the PCA in pydaal SPMD Mode')

        start = time.time()

        result = algo.compute(data)
        self.latency['Parallel_PCA_SPMD_Time'] = time.time() - start

        # result is available on all processes - but we print only on root
        if d4p.my_procid() == 0:
            print("PCA completed", result)
            self.latency["Overall Parallel PCA SPMD Time"] = time.time() - \
                start

        d4p.daalfini()

        self.logger.info('Completed PCA in pydaal SPMD Mode')

        return
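
Note that distributed=True only takes effect when the enclosing script is launched under MPI; see the mpirun invocation in the leading comment of Example #11 below.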
Example #5
def pca_fit_daal(X, n_components, method):

    if n_components < 1:
        n_components = min(X.shape)

    fptype = getFPType(X)

    centering_algo = normalization_zscore(
        fptype=fptype,
        doScale=False
    )

    pca_algorithm = pca(
        fptype=fptype,
        method=method,
        normalization=centering_algo,
        resultsToCompute='mean|variance|eigenvalue',
        isDeterministic=True,
        nComponents=n_components
    )

    pca_result = pca_algorithm.compute(X)
    eigenvectors = pca_result.eigenvectors
    eigenvalues = pca_result.eigenvalues.ravel()
    singular_values = np.sqrt((X.shape[0] - 1) * eigenvalues)

    return pca_result, eigenvalues, eigenvectors, singular_values
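
A hypothetical driver for pca_fit_daal above. It assumes the daal4py symbols used inside the function were imported at module level; getFPType is commonly pulled from daal4py.sklearn._utils:

import numpy as np
from daal4py import normalization_zscore, pca
from daal4py.sklearn._utils import getFPType

X = np.random.rand(200, 8)
res, eigenvalues, eigenvectors, singular_values = pca_fit_daal(
    X, n_components=3, method='svdDense')
print(eigenvalues, singular_values)
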
Example #6
    def _fit_daal4py(self, X, n_components):
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)

        _validate_n_components(n_components, n_samples, n_features)

        if n_components == 'mle':
            daal_n_components = n_features
        elif n_components < 1:
            daal_n_components = n_sf_min
        else:
            daal_n_components = n_components

        fpType = getFPType(X)
        centering_algo = daal4py.normalization_zscore(fptype=fpType,
                                                      doScale=False)
        pca_alg = daal4py.pca(fptype=fpType,
                              method='svdDense',
                              normalization=centering_algo,
                              resultsToCompute='mean|variance|eigenvalue',
                              isDeterministic=True,
                              nComponents=daal_n_components)
        pca_res = pca_alg.compute(X)

        self.mean_ = pca_res.means.ravel()
        variances_ = pca_res.variances.ravel()
        components_ = pca_res.eigenvectors
        explained_variance_ = pca_res.eigenvalues.ravel()
        tot_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / tot_var

        if n_components == 'mle':
            n_components = \
                _infer_dimension(explained_variance_, n_samples)
        elif 0 < n_components < 1.0:
            n_components = _n_components_from_fraction(
                explained_variance_ratio_, n_components)

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < n_sf_min:
            if explained_variance_.shape[0] == n_sf_min:
                self.noise_variance_ = explained_variance_[n_components:].mean()
            else:
                resid_var_ = variances_.sum()
                resid_var_ -= explained_variance_[:n_components].sum()
                self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = np.sqrt(
            (n_samples - 1) * self.explained_variance_)
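
The noise-variance branch above is the probabilistic-PCA maximum-likelihood estimate the comment cites (eq. 12.46): sigma^2 is the mean of the discarded eigenvalues. A toy check of that rule:

import numpy as np

ev = np.array([4.0, 2.0, 1.0, 0.5])  # all n_sf_min eigenvalues
n_components = 2
noise_variance = ev[n_components:].mean()  # (1.0 + 0.5) / 2 = 0.75
assert np.isclose(noise_variance, 0.75)
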
Example #7
import daal4py as d4p


def compute(data):
    # 'normalization' is an optional parameter to PCA; we use z-score, which could be configured differently
    zscore = d4p.normalization_zscore()
    # configure a PCA object
    algo = d4p.pca(resultsToCompute="mean|variance|eigenvalue",
                   isDeterministic=True,
                   normalization=zscore)
    return algo.compute(data)
Example #8
import daal4py as d4p


def main(readcsv=read_csv, method='svdDense'):
    dataFileName = "data/batch/pca_transform.csv"
    nComponents = 2

    # read data
    data = readcsv(dataFileName, range(3))

    # configure a PCA object and perform PCA
    pca_algo = d4p.pca(isDeterministic=True, resultsToCompute="mean|variance|eigenvalue")
    pca_res = pca_algo.compute(data)

    # Apply transform with whitening because means and eigenvalues are provided
    pcatrans_algo = d4p.pca_transform(nComponents=nComponents)
    pcatrans_res = pcatrans_algo.compute(data, pca_res.eigenvectors,
                                         pca_res.dataForTransform)
    # pca_transform result objects provide transformedData

    return (pca_res, pcatrans_res)
Example #9
import daal4py as d4p


def run_pca_daal4py_corr(X, Y):
    algo = d4p.pca(resultsToCompute="mean|variance|eigenvalue",
                   isDeterministic=True,
                   method="correlationDense")
    result1 = algo.compute(X)

    pcatrans_algo = d4p.pca_transform(nComponents=X.shape[1] // 2)
    transform = pcatrans_algo.compute(X, result1.eigenvectors,
                                      result1.dataForTransform).transformedData

    res = [
        transform, result1.eigenvalues, result1.eigenvectors, result1.means,
        result1.variances
    ]
    name = [
        "transform", "result1.eigenvalues", "result1.eigenvectors",
        "result1.means", "result1.variances"
    ]
    return res, name
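
A hypothetical call to run_pca_daal4py_corr above; note that the Y argument is accepted but never used, so any placeholder value works:

import numpy as np

X = np.random.rand(50, 6)
res, names = run_pca_daal4py_corr(X, None)
print(dict(zip(names, [r.shape for r in res])))
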
Example #10
    def pca(self, data, target):
        '''
        Method for PCA 
        '''

        data = data.drop(target, axis=1)

        # configure a PCA object

        self.logger.info('Training the serial PCA in pydaal')

        # algo = d4p.pca(resultsToCompute="mean|variance|eigenvalue",nComponents = 10, isDeterministic=True)
        algo = d4p.pca(method='svdDense')
        self.logger.info('Training the PCA in pydaal Batch Mode')
        start = time.time()
        result = algo.compute(data)

        self.latency["Serial_PCA_Batch_Time"] = time.time() - start
        self.logger.info('Completed PCA in pydaal Batch/Serial Mode')

        return result
Example #11
#    mpirun -genv DIST_CNC=MPI -n 4 python ./pca_spmd.py

import daal4py as d4p
from numpy import loadtxt, allclose

if __name__ == "__main__":

    # Initialize SPMD mode
    d4p.daalinit(spmd=True)

    # Each process gets its own data
    infile = "./data/distributed/pca_normalized_" + str(d4p.my_procid() +
                                                        1) + ".csv"

    # configure a PCA object to use svd instead of default correlation
    algo = d4p.pca(method='svdDense', distributed=True)

    # let's provide a file directly, not a table/array
    result1 = algo.compute(infile)

    # We can also load the data ourselves and provide the numpy array
    data = loadtxt(infile, delimiter=',')
    result2 = algo.compute(data)

    # PCA result objects provide eigenvalues, eigenvectors, means and variances
    assert allclose(result1.eigenvalues, result2.eigenvalues)
    assert allclose(result1.eigenvectors, result2.eigenvectors)
    assert (result1.means is None and result2.means is None) or \
        allclose(result1.means, result2.means)
    assert (result1.variances is None and result2.variances is None) or \
        allclose(result1.variances, result2.variances)
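
Because this configuration does not request resultsToCompute, means and variances can legitimately come back as None, which is why the final two asserts guard against that case before comparing values.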
Example #12
    def _fit_full_daal4py(self, X, n_components):
        n_samples, n_features = X.shape
        n_sf_min = min(n_samples, n_features)

        if n_components == 'mle':
            daal_n_components = n_features
        elif n_components < 1:
            daal_n_components = n_sf_min
        else:
            daal_n_components = n_components

        fpType = getFPType(X)

        covariance_algo = daal4py.covariance(
            fptype=fpType, outputMatrixType='covarianceMatrix')
        covariance_res = covariance_algo.compute(X)

        self.mean_ = covariance_res.mean.ravel()
        covariance = covariance_res.covariance
        variances_ = np.array([covariance[i, i] for i in range(n_features)])

        pca_alg = daal4py.pca(fptype=fpType,
                              method='correlationDense',
                              resultsToCompute='eigenvalue',
                              isDeterministic=True,
                              nComponents=daal_n_components)
        pca_res = pca_alg.compute(X, covariance)

        components_ = pca_res.eigenvectors
        explained_variance_ = np.maximum(pca_res.eigenvalues.ravel(), 0)
        tot_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / tot_var

        if n_components == 'mle':
            if sklearn_check_version('0.23'):
                n_components = _infer_dimension(explained_variance_, n_samples)
            else:
                n_components = \
                    _infer_dimension_(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(
                ratio_cumsum, n_components, side='right') + 1

        if n_components < n_sf_min:
            if explained_variance_.shape[0] == n_sf_min:
                self.noise_variance_ = explained_variance_[n_components:].mean()
            else:
                resid_var_ = variances_.sum()
                resid_var_ -= explained_variance_[:n_components].sum()
                self.noise_variance_ = resid_var_ / (n_sf_min - n_components)
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = np.sqrt(
            (n_samples - 1) * self.explained_variance_)
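
A toy check of the singular-value relation used in the last two lines above, s_i = sqrt((n_samples - 1) * lambda_i) for mean-centered data:

import numpy as np

X = np.random.rand(100, 4)
Xc = X - X.mean(axis=0)
s = np.linalg.svd(Xc, compute_uv=False)
lam = s ** 2 / (X.shape[0] - 1)  # eigenvalues of the sample covariance
assert np.allclose(s, np.sqrt((X.shape[0] - 1) * lam))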