Example no. 1
def _compute_covariance(self):
    """Computes the covariance matrix for each Gaussian kernel using
    covariance_factor.
    """
    self.factor = self.covariance_factor()
    self.covariance = atleast_2d(
        stats.cov(self.dataset, rowvar=1) * self.factor * self.factor)
    self.inv_cov = linalg.inv(self.covariance)
Example no. 2
def _compute_covariance(self):
    """Computes the covariance matrix for each Gaussian kernel using
    covariance_factor.
    """
    self.factor = self.covariance_factor()
    self.covariance = atleast_2d(
        stats.cov(self.dataset, rowvar=1) * self.factor * self.factor)
    self.inv_cov = linalg.inv(self.covariance)
    self._norm_factor = sqrt(linalg.det(2 * pi * self.covariance)) * self.n
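For context, the covariance, inverse covariance, and normalization factor cached above are exactly what a Gaussian KDE needs in order to evaluate the density. Below is a minimal, self-contained numpy sketch of that evaluation, independent of the class these methods belong to; the hard-coded Scott's-rule bandwidth and the use of np.cov in place of stats.cov are assumptions for illustration, not the original module's code.

import numpy as np

def gaussian_kde_eval(dataset, points):
    """Evaluate a Gaussian KDE built from `dataset` (d x n) at `points` (d x m).

    Minimal sketch: Scott's-rule bandwidth, sample covariance scaled by
    factor**2, mirroring the cached quantities in the snippet above.
    """
    dataset = np.atleast_2d(dataset)
    points = np.atleast_2d(points)
    d, n = dataset.shape

    factor = n ** (-1.0 / (d + 4))                       # Scott's rule
    covariance = np.atleast_2d(np.cov(dataset, rowvar=1)) * factor**2
    inv_cov = np.linalg.inv(covariance)
    norm_factor = np.sqrt(np.linalg.det(2 * np.pi * covariance)) * n

    density = np.zeros(points.shape[1])
    for i in range(n):
        diff = dataset[:, i, None] - points              # (d x m) differences
        energy = np.sum(diff * (inv_cov @ diff), axis=0) / 2.0
        density += np.exp(-energy)
    return density / norm_factor

# Example: 1-D standard normal samples evaluated on a small grid
rng = np.random.default_rng(0)
samples = rng.standard_normal((1, 200))
grid = np.linspace(-3, 3, 5)[None, :]
print(gaussian_kde_eval(samples, grid))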
Example no. 3
def scree_plot(arr: np.ndarray, figsize=(18, 8)):
    """Generates a scree plot for a given data array.

    Parameters
    ----------
    arr : np.ndarray
        matrix in (samples x features) form
    figsize : tuple, optional
        figure size, by default (18, 8)
    """
    # np.linalg.eigvals (not the nonexistent np.linalg.eigs) returns the eigenvalues
    eigvals = np.real(np.linalg.eigvals(stats.cov(arr)))
    plt.figure(figsize=figsize)
    plt.title('Eigenvalue versus magnitude (Scree plot)')
    sns.barplot(x=np.arange(len(eigvals)), y=eigvals, color='blue', saturation=.3)
    plt.ylabel('Magnitude')
    plt.xlabel('Eigenvalue index')
    plt.show()
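A scree plot is usually read alongside the cumulative explained variance of the leading eigenvalues. A small numpy-only sketch of that calculation, with np.cov standing in for the custom stats.cov (an assumption about its behaviour):

import numpy as np

# Toy data: 200 samples, 5 features (samples x features)
rng = np.random.default_rng(0)
arr = rng.standard_normal((200, 5)) @ np.diag([3.0, 2.0, 1.0, 0.5, 0.1])

# Eigenvalues of the feature covariance, sorted large -> small
eigvals = np.linalg.eigvalsh(np.cov(arr, rowvar=False))[::-1]

# Fraction of total variance captured by the leading k components
explained = np.cumsum(eigvals) / np.sum(eigvals)
for k, frac in enumerate(explained, start=1):
    print(f"top {k} component(s): {frac:.1%} of variance")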
Example no. 4
    def __init__(self, dataset, bw_method=None):
        self.dataset = np.atleast_2d(dataset)
        if not self.dataset.size > 1:
            raise ValueError("`dataset` input should have multiple elements.")

        self.dim, self.num_dp = self.dataset.shape
        isString = isinstance(bw_method, str)

        if bw_method is None:
            pass
        elif (isString and bw_method == 'scott'):
            self.covariance_factor = self.scotts_factor
        elif (isString and bw_method == 'silverman'):
            self.covariance_factor = self.silverman_factor
        elif (np.isscalar(bw_method) and not isString):
            self._bw_method = 'use constant'
            self.covariance_factor = lambda: bw_method
        elif callable(bw_method):
            self._bw_method = bw_method
            self.covariance_factor = lambda: self._bw_method(self)
        else:
            raise ValueError("`bw_method` should be 'scott', 'silverman', a "
                             "scalar or a callable")

        # Computes the covariance matrix for each Gaussian kernel using
        # covariance_factor().

        self.factor = self.covariance_factor()
        # Cache covariance and inverse covariance of the data
        if not hasattr(self, 'data_inv_cov'):
            self.data_covariance = np.atleast_2d(
                stats.cov(self.dataset, rowvar=1, bias=False))
            self.data_inv_cov = linalg.inv(self.data_covariance)

        self.covariance = self.data_covariance * self.factor**2
        self.inv_cov = self.data_inv_cov / self.factor**2
        self.norm_factor = np.sqrt(linalg.det(
            2 * np.pi * self.covariance)) * self.num_dp
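The scotts_factor and silverman_factor referenced by bw_method are not shown in this snippet. A sketch of the standard rule-of-thumb formulas (the same ones scipy.stats.gaussian_kde uses), written here as plain functions of the point count and dimensionality rather than as methods of the class above:

import numpy as np

def scotts_factor(num_dp, dim):
    # Scott's rule: n ** (-1 / (d + 4))
    return num_dp ** (-1.0 / (dim + 4))

def silverman_factor(num_dp, dim):
    # Silverman's rule: (n * (d + 2) / 4) ** (-1 / (d + 4))
    return (num_dp * (dim + 2) / 4.0) ** (-1.0 / (dim + 4))

print(scotts_factor(1000, 2), silverman_factor(1000, 2))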
Example no. 5
def pca(data : np.ndarray, dim : int = 2, verbose : bool = False, class_identity=None, return_reconstruction=False) -> np.ndarray:
    """Principal components analysis to form a (dim) dimensional approximation 
    of a given dataset. 
    
    Parameters
    ----------
    data : np.ndarray
        matrix in (samples x features) form 
    dim : int, optional
        approximating dimension, must be less than the data dimension, by default 2
    verbose : bool, optional
        whether to display reconstruction diagnostics and a scree plot
    class_identity : array-like, optional
        per-sample class labels used to colour the 2D projection plot
    return_reconstruction : bool, optional
        if True, also return the rank-dim reconstruction of the data

    Returns
    -------
    np.ndarray
        dim-dimensional representation of the input data (and its
        reconstruction, if return_reconstruction is True)
    """
    if np.isnan(data).any():
        print("missing data detected. consider using pca_missing_data")
        raise NotImplementedError

    reconstruction=None
    n, p = data.shape 
    assert 0 < dim < p, "projection dimension must be a positive integer less than the number of data dimensions {}".format(p)
    
    # center the data 
    de_meaned = stats.center(data) 

    # optimize for high dimensionality data
    if p > n: 
        print('High dimensional data detected, optimizing...')

        # X @ X.T is (n x n), which we're assuming is smaller than (p x p) in this case
        X = de_meaned
        eigvals, X_eigvecs = np.linalg.eig(X @ X.T)

        # plop the eigenvalues into a diagonal matrix lambda 
        lam = np.diag(np.real(eigvals))

        # compute the eigenvecs
        # transformed_E = X @ X_eigvecs
        lam_inv = np.linalg.inv(lam)
        eigvecs = (X.T @ X_eigvecs @ lam_inv)
    
    else: 
        # generate the data covariance matrix 
        sample_covariance = stats.cov(de_meaned) 

        # extract spectrum of the covariance matrix (its set of eigenvalues and eigenvectors)
        eigvals, eigvecs = np.linalg.eig(sample_covariance)
    
    # arrange the eigenvectors so that they are ordered according to the magnitude (large->small) of their corresponding eigenvalue
    idx = eigvals.argsort()[::-1]   
    eigvals = np.real(np.array(eigvals[idx]))
    eigvecs = np.real(np.array(eigvecs[:, idx]))
    E = eigvecs[:, :dim]
    
    # compute low dimensional representation.  
    Y = np.array(data @ E)

    if verbose is True:
        # scree plot  
        plt.figure(1, figsize=(18, 8))
        plt.title('Eigenvalue versus magnitude (Scree plot)')
        sns.barplot(x=np.arange(len(eigvals)), y=eigvals, color='blue', saturation=.3)
        plt.ylabel('Magnitude') 
        plt.xlabel('Eigenvalue index')
        plt.show()

        # reconstruction 
        reconstruction = Y @ E.T
        assert reconstruction.shape == data.shape
        reconstruction_params = [data, reconstruction]
        labels = ['Original data', 'Reconstruction']

        fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(18, 8))
        for i, ax in zip(range(2), axes.flat):
            # extract given params 
            given_params = reconstruction_params[i]
            ax.set_title(labels[i])
            sns.heatmap(given_params, cmap='Blues_r', alpha=0.65, annot=False, cbar=False, xticklabels=False, yticklabels=False, ax=ax)

        fig.tight_layout()
        plt.show()

        # print("Reconstruction error: {}".format(round(np.sum(np.linalg.norm(data - reconstruction, axis=1)), 2)))

        # plot 2D projection
        to_plot = Y if dim == 2 else pca(data, dim=2, verbose=False)
        plt.figure(figsize=(10, 8))
        plt.title("2D Data Representation")
        if class_identity is not None:
            sns.scatterplot(x=to_plot[:, 0], y=to_plot[:, 1], hue=class_identity, legend='full',
                            palette=sns.color_palette('bright', n_colors=len(np.unique(class_identity))))
        else:
            sns.scatterplot(x=to_plot[:, 0], y=to_plot[:, 1])
        plt.show()
            
        # Hinton diagram of the eigenvalue matrix (slow, so only draw it for small matrices)
        if len(eigvals) < 20:
            hinton(np.diag(eigvals))
            plt.show()

    if return_reconstruction is True:
        if reconstruction is None:
            reconstruction = Y @ E.T
        return Y, reconstruction

    return Y
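A quick numerical check of the p > n shortcut used in the high-dimensional branch above: the non-zero eigenvalues of X X^T and X^T X coincide, and X^T v is an eigenvector of X^T X whenever v is an eigenvector of X X^T. This is an illustration with synthetic data, not part of the original module.

import numpy as np

rng = np.random.default_rng(0)
n, p = 10, 50                      # fewer samples than features
X = rng.standard_normal((n, p))
X = X - X.mean(axis=0)             # center, as in the function above

# Small (n x n) eigenproblem
lam_small, V_small = np.linalg.eigh(X @ X.T)

# Map each eigenvector back to feature space: if X X^T v = lam v,
# then X^T X (X^T v) = lam (X^T v)
W = X.T @ V_small                  # (p x n), columns are unnormalized eigenvectors

lam_big, V_big = np.linalg.eigh(X.T @ X)

# The n largest eigenvalues of X^T X match those of X X^T
assert np.allclose(np.sort(lam_small), np.sort(lam_big)[-n:], atol=1e-6)

# And each mapped vector is indeed an eigenvector of X^T X
for k in range(n):
    lhs = (X.T @ X) @ W[:, k]
    rhs = lam_small[k] * W[:, k]
    assert np.allclose(lhs, rhs, atol=1e-6)
print("p > n shortcut verified")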
Example no. 6
plt.savefig(fout + 'sstbudget_anom_ts.png')

# In[ ]:

T_var = T_anom.var(dim='time')
get_ipython().run_line_magic('time', 'T_var.load()')
#%time T_var.persist()

# In[ ]:

tendH_anom = tendH_anom / c_o

# In[ ]:

#tendH_anom = tendH_anom.transpose('time','face', 'k', 'j', 'i')
cov_adv = st.cov(tendH_anom, C_adv_anom)
cov_dif = st.cov(tendH_anom, C_dif_anom)
cov_forc = st.cov(tendH_anom, C_forc_anom)

# In[ ]:

cov_adv.nbytes / 1e9

# In[ ]:

get_ipython().run_line_magic('time', 'cov_adv.load()')
get_ipython().run_line_magic('time', 'cov_dif.load()')
get_ipython().run_line_magic('time', 'cov_forc.load()')

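Assuming st.cov computes a pointwise covariance along the time dimension (an assumption; st is presumably a project-specific statistics module), the same operation can be sketched with xarray's built-in xr.cov on toy data:

import numpy as np
import xarray as xr

rng = np.random.default_rng(0)
# Toy anomaly fields with the same (time, j, i) layout as the notebook variables
a = xr.DataArray(rng.standard_normal((100, 4, 4)), dims=('time', 'j', 'i'))
b = xr.DataArray(rng.standard_normal((100, 4, 4)), dims=('time', 'j', 'i'))

# Pointwise covariance along 'time'; with dask-backed inputs this stays lazy
# until .load()/.compute(), which is what the %time cov_adv.load() cells above measure
cov_ab = xr.cov(a, b, dim='time')
print(cov_ab.dims, cov_ab.shape)   # ('j', 'i') (4, 4)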
Example no. 7
def pca(data,num_components=2):
    U,s,Vh = np.linalg.svd(cov(data))
    return Vh.T[:,:num_components]
Example no. 8
def pca(data, num_components=2):
    U, s, Vh = np.linalg.svd(cov(data))
    return Vh.T[:, :num_components]
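Because a covariance matrix is symmetric positive semi-definite, its SVD coincides with its eigendecomposition, so the two lines above return the top principal directions. A hedged usage sketch, with np.cov standing in for the unqualified cov (an assumption about its expected samples-x-features layout):

import numpy as np

def pca(data, num_components=2):
    # data is (samples x features); rowvar=False matches that layout
    U, s, Vh = np.linalg.svd(np.cov(data, rowvar=False))
    return Vh.T[:, :num_components]

rng = np.random.default_rng(0)
data = rng.standard_normal((200, 5))
components = pca(data, num_components=2)        # (5 x 2) projection matrix
projected = (data - data.mean(axis=0)) @ components
print(components.shape, projected.shape)        # (5, 2) (200, 2)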