Example #1
    def test_copy(self):
        XX = np.copy(self.X)

        pca = prince.PCA(n_components=2, copy=True)
        pca.fit(XX)
        np.testing.assert_array_equal(self.X, XX)

        pca = prince.PCA(n_components=2, copy=False)
        pca.fit(XX)
        self.assertRaises(AssertionError, np.testing.assert_array_equal, self.X, XX)
    def acpReduction(self):

        pca = prince.PCA(self.data[list(self.data.columns[0:8])],
                         n_components=-1)
        pca.plot_rows(ellipse_fill=True)
        pca.plot_correlation_circle()
        """pca = PCA(n_components=2)
Example #3
 def pca_prince(self, df, cols, color_labels=None):
     """
     cols:  a list of numerical column names
     color_labels: categorical label column
     Prince's implementation of a 2D PCA with visualization
     """
     pca = prince.PCA(n_components=2,
                      n_iter=3,
                      rescale_with_mean=True,
                      rescale_with_std=True,
                      copy=True,
                      check_input=True,
                      engine='auto')
     pca = pca.fit(df[cols])
     fig, ax = plt.subplots(figsize=(10, 10))
     pca.plot_row_coordinates(df[cols],
                              ax=ax,
                              x_component=0,
                              y_component=1,
                              labels=None,
                               color_labels=df[color_labels] if color_labels is not None else None,
                              ellipse_outline=False,
                              ellipse_fill=True,
                              show_points=True)
     fig.show()
     return pca
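
A hypothetical call of the helper above; the instance name `viz`, the toy DataFrame, and its column names are assumptions, not from the source:

import pandas as pd

toy = pd.DataFrame({
    'sepal_length': [5.1, 4.9, 6.2, 5.9],
    'sepal_width':  [3.5, 3.0, 2.9, 3.0],
    'petal_length': [1.4, 1.4, 4.3, 5.1],
    'species':      ['setosa', 'setosa', 'versicolor', 'virginica'],
})

# `viz` stands in for whatever object pca_prince is defined on.
fitted = viz.pca_prince(toy,
                        cols=['sepal_length', 'sepal_width', 'petal_length'],
                        color_labels='species')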
Example #4
    def __init__(self, data, *args, **kws):
        '''
        Fit a PCA using the prince module

        Inputs:
            data    : pandas DataFrame with the variables to compute the PCAs
            ncomp   : number of PCA components. Default equals the number of 
                      variables
            niter   : The number of iterations used for computing the SVD.
            inplace : Passed to prince's 'copy' argument; if True (default),
                      the data is copied before the computations
            seed    : seed for the random state
            invert  : list of 1/-1 multipliers, one per principal component,
                      used to flip the sign (multiply by -1) of that component
        Outputs:
            PCA object with fitted model and scores
        '''
        # arguments 
        ncomp=kws.get("ncomp", data.shape[1])
        niter=kws.get('niter', 10)
        inplace=kws.get('inplace', True)
        seed=kws.get('seed', 666)
        invert=kws.get('invert', None)
        # pca
        model = prince.PCA(
            n_components=ncomp,
            n_iter=niter,
            rescale_with_mean=True,
            rescale_with_std=True,
            copy=inplace,
            check_input=True,
            engine='sklearn',
            random_state=seed
        )
        self.data=data
        self.invert=invert
        self.fit = model.fit(data)
        self.scores = self.fit.transform(data)  # row coordinates of the fitted PCA
        self.scores.columns=[f"Comp{i}" for i in
                             range(1,self.scores.shape[1]+1)]
        if invert:
            assert isinstance(invert, list), "'invert' must be a list"
            invvalues=all([x==1 or x==-1 for x in invert])
            assert invvalues, "Values in 'invert' must be either 1 or -1"
            assert len(invert)==self.scores.shape[1], "'invert' must have as "+\
            "many elements as the components of the PCAs computed: "+\
            f"{self.scores.shape[1]}"
            for i, mult in enumerate(invert):
                self.scores[f"Comp{i+1}"]=mult*self.scores[f"Comp{i+1}"]
Example #5
 def test_plot_row_principal_coordinates(self):
     pca = prince.PCA(n_components=4)
     pca.fit(self.X)
     ax = pca.plot_row_principal_coordinates(self.X)
     self.assertTrue(isinstance(ax, mpl.axes.Axes))
Example #6
 def test_fit_pandas_dataframe(self):
     pca = prince.PCA(n_components=2)
     self.assertTrue(isinstance(pca.fit(pd.DataFrame(self.X)), prince.PCA))
Example #7
 def test_fit_numpy_array(self):
     pca = prince.PCA(n_components=2)
     self.assertTrue(isinstance(pca.fit(self.X), prince.PCA))
Example #8
def dim_reduce_init(y,
                    n_clusters,
                    k,
                    r,
                    nj,
                    var_distrib,
                    use_famd=False,
                    seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space and determine 
    the init coefficients in that space
    
    y (numobs x p ndarray): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    k (1d array): The number of components of the latent Gaussian mixture layers
    r (int): The dimension of latent variables
    nj (p 1darray): For binary/count data: The maximum values that the variable can take. 
                    For ordinal data: the number of different existing categories for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y 
    use_famd (bool): Whether to use the FAMD method (True) or not (False) to initialise the
                    first continuous latent variable. Otherwise MCA is used.
    seed (None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------------------
    returns (dict): All initialisation parameters
    '''

    L = len(k)
    numobs = len(y)
    S = np.prod(k)

    #==============================================================
    # Dimension reduction performed with MCA
    #==============================================================

    if not isinstance(y, pd.DataFrame):
        raise TypeError('y should be a dataframe for prince')

    if (np.array(var_distrib) == 'ordinal').all():
        print('PCA init')

        pca = prince.PCA(n_components=r[0], n_iter=3, rescale_with_mean=True,
                         rescale_with_std=True, copy=True, check_input=True,
                         engine='auto', random_state=seed)
        z1 = pca.fit_transform(y).values

    elif use_famd:
        famd = prince.FAMD(n_components = r[0], n_iter=3, copy=True, check_input=False, \
                               engine='auto', random_state = seed)
        z1 = famd.fit_transform(y).values

    else:
        # Check input = False to remove
        mca = prince.MCA(n_components = r[0], n_iter=3, copy=True,\
                         check_input=False, engine='auto', random_state = seed)
        z1 = mca.fit_transform(y).values

    z = [z1]
    y = y.values

    #==============================================================
    # Set the shape parameters of each data type
    #==============================================================

    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',\
                               var_distrib == 'binomial')].astype(int)
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',\
                              var_distrib == 'binomial')]
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal'].astype(float).astype(int)
    nj_ord = nj[var_distrib == 'ordinal']
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical']
    nb_categ = len(nj_categ)

    # Rescale the continuous variables to unit standard deviation
    y_cont = y[:, var_distrib == 'continuous']

    # Before was np.float
    y_cont = y_cont / np.std(y_cont.astype(float), axis=0, keepdims=True)
    nb_cont = y_cont.shape[1]

    #=======================================================
    # Determining the Gaussian Parameters
    #=======================================================
    init = {}

    eta = []
    H = []
    psi = []
    paths_pred = np.zeros((numobs, L))

    for l in range(L):
        params = get_MFA_params(z[l], k[l], r[l:])
        eta.append(params['eta'][..., n_axis])
        H.append(params['H'])
        psi.append(params['psi'])
        z.append(params['z_nextl'])
        paths_pred[:, l] = params['classes']

    paths, nb_paths = np.unique(paths_pred, return_counts=True, axis=0)
    paths, nb_paths = add_missing_paths(k, paths, nb_paths)

    w_s = nb_paths / numobs
    w_s = np.where(w_s == 0, 1E-16, w_s)

    # Check all paths have been explored
    if len(paths) != S:
        raise RuntimeError(f'{S} paths were expected but only {len(paths)} '
                           'were found during initialisation')

    w_s = w_s.reshape(*k).flatten('C')

    #=============================================================
    # Enforcing identifiability constraints over the first layer
    #=============================================================

    H = diagonal_cond(H, psi)
    Ez, AT = compute_z_moments(w_s, eta, H, psi)
    eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

    init['eta'] = eta
    init['H'] = H
    init['psi'] = psi

    init['w_s'] = w_s  # Probabilities of each path through the network
    init['z'] = z

    # The clustering layer is the one used to perform the clustering
    # i.e. the layer l such that k[l] == n_clusters
    clustering_layer = np.argmax(np.array(k) == n_clusters)

    init['classes'] = paths_pred[:, clustering_layer]  # 0 To change with clustering_layer_idx

    #=======================================================
    # Determining the coefficients of the GLLVM layer
    #=======================================================

    # Determining lambda_bin coefficients.

    lambda_bin = np.zeros((nb_bin, r[0] + 1))

    for j in range(nb_bin):
        Nj = np.max(y_bin[:, j])  # The support of the jth binomial is [1, Nj]

        if Nj == 1:  # If the variable is Bernoulli not binomial
            yj = y_bin[:, j]
            z_new = z[0]
        else:  # If not, need to convert Binomial output to Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], z[0])

        lr = LogisticRegression()

        if j < r[0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate(
                [lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])

    ## Identifiability of bin coefficients
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT[0][0]

    # Determining lambda_ord coefficients
    lambda_ord = []

    for j in range(nb_ord):
        Nj = len(np.unique(
            y_ord[:, j], axis=0))  # The support of the jth ordinal is [1, Nj]
        yj = y_ord[:, j]

        ol = OrderedLogit()
        ol.fit(z[0], yj)

        ## Identifiability of ordinal coefficients
        beta_j = (ol.beta_.reshape(1, r[0]) @ AT[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining the coefficients of the continuous variables
    lambda_cont = np.zeros((nb_cont, r[0] + 1))

    for j in range(nb_cont):
        yj = y_cont[:, j]
        linr = LinearRegression()

        if j < r[0] - 1:
            linr.fit(z[0][:, :j + 1], yj)
            lambda_cont[j, :j + 2] = np.concatenate([[linr.intercept_],
                                                     linr.coef_])
        else:
            linr.fit(z[0], yj)
            lambda_cont[j] = np.concatenate([[linr.intercept_], linr.coef_])

    ## Identifiability of continuous coefficients
    lambda_cont[:, 1:] = lambda_cont[:, 1:] @ AT[0][0]

    # Determining lambda_categ coefficients
    lambda_categ = []

    for j in range(nb_categ):
        yj = y_categ[:, j]

        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(z[0], yj)

        ## Identifiability of categ coefficients
        beta_j = lr.coef_ @ AT[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_cont'] = lambda_cont
    init['lambda_categ'] = lambda_categ

    return init
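
The branching above (PCA when every variable is ordinal, FAMD for mixed data when use_famd is set, MCA otherwise) can be sketched on its own; a hedged standalone version with made-up ordinal data, where choose_reducer and the toy frame are assumptions:

import numpy as np
import pandas as pd
import prince

def choose_reducer(var_distrib, r0, use_famd=False, seed=None):
    # Mirrors the selection logic of dim_reduce_init
    if (np.array(var_distrib) == 'ordinal').all():
        return prince.PCA(n_components=r0, n_iter=3, random_state=seed)
    if use_famd:
        return prince.FAMD(n_components=r0, n_iter=3, random_state=seed)
    return prince.MCA(n_components=r0, n_iter=3, random_state=seed)

y = pd.DataFrame({'q1': [0, 1, 2, 1, 0], 'q2': [3, 2, 1, 0, 2]})
reducer = choose_reducer(['ordinal', 'ordinal'], r0=2, seed=0)
z1 = reducer.fit_transform(y).values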
Example #9
# Transform a few numerical features to categorical because of their meaning
comb['MSSubClass'] = comb['MSSubClass'].astype(str)
comb['MoSold'] = comb['MoSold'].astype(str)
#%%
""" Dimensionality reduction """
# ATTENTION: TIME CONSUMING
# DO NOT USE TOGETHER WITH FEATURE IMPORTANCE ANALYSIS
dim_red = False
if dim_red:
    import prince
    # Here you can choose between PCA and FAMD
    pca = True
    if pca:
        # One-hot encoding
        dummies = pd.get_dummies(comb)
        pca = prince.PCA(n_components=50)
        pca = pca.fit(dummies)
        expl = (pca.explained_inertia_)
        cum = (np.cumsum(expl))[-1]
        print("Explained variance " + str(cum))
        dummies = pca.transform(dummies)
    else:
        famd = prince.FAMD(n_components=50)
        famd = famd.fit(comb)
        expl = (famd.explained_inertia_)
        cum = (np.cumsum(expl))[-1]
        print("Explained variance " + str(cum))
        comb = famd.transform(comb)
        # One-hot encoding
        dummies = pd.get_dummies(comb)
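
A common follow-up to the explained-variance print above is to pick the smallest number of components reaching a target share of inertia; a small sketch reusing the expl list from either branch (the 95% threshold is an assumption):

import numpy as np

cum_inertia = np.cumsum(np.asarray(expl))
n_keep = min(int(np.searchsorted(cum_inertia, 0.95)) + 1, len(cum_inertia))
print("Components for ~95% inertia:", n_keep)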
Example #10
import numpy as np
import Data_util
import prince
from sklearn import model_selection
from sklearn.cross_decomposition import CCA

data = Data_util.read_data("data/adult.data")
training_data, training_labels = Data_util.class2vect(data)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    training_data, training_labels, train_size=0.7, test_size=0.3)

pca = prince.PCA(n_components=70,
                 n_iter=3,
                 copy=True,
                 rescale_with_mean=True,
                 rescale_with_std=True,
                 engine='auto',
                 random_state=42)

pca = pca.fit(X_train)

print([100 * ei for ei in pca.explained_inertia_])
print(sum(pca.explained_inertia_))
print(pca.row_coordinates(X_test[:5]))
print(pca)
feature_names = data.columns.str.startswith("var_")
predictors = data[data.columns[feature_names]]
labels = data["Target_Practice"]

ix_training = data.train == 1
training_data = predictors[ix_training]
training_labels = labels[ix_training]

ix_testing = data.train == 0
testing_data = predictors[ix_testing]
testing_labels = labels[ix_testing]

sns.displot(training_data.values.flatten(), bins="sqrt", kde=True)

pca = prince.PCA(n_components=2, as_array=False).fit(training_data)
pca.plot_row_coordinates(training_data, color_labels=training_labels)
pca.column_correlations(training_data).plot.scatter(x=0,
                                                    y=1)  # weird column name

#%% Roshan Sharma model

mdl_data = { # problem with JSON dump => cast to python native type
    'N': ix_training.sum().tolist(),
    'N2': ix_testing.sum().tolist(),
    'K': feature_names.sum().tolist(),
    'y': training_labels.values.tolist(),
    'X': training_data.values.tolist(),
    'new_X': testing_data.values.tolist(),
}
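
As the inline comment notes, numpy arrays and numpy scalars are not JSON serializable, which is why every value above is converted with .tolist(); a quick check with the standard library:

import json

payload = json.dumps(mdl_data)  # succeeds because all values are now native Python types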
Pokedex_Types = pd.concat([Pokedex_Types, dfOneHot], axis=1)

dfOneHot = pd.DataFrame(
    t2_ohe_array,
    columns=["type2_" + str(int(i)) for i in range(t2_ohe_array.shape[1])])
Pokedex_Types = pd.concat([Pokedex_Types, dfOneHot], axis=1)

Pokedex_Types_PCA = Pokedex_Types.drop(Pokedex_Types.columns[[0, 1, 2, 3, 4]],
                                       axis=1)

mca = prince.MCA(n_components=37,
                 n_iter=100,
                 copy=False,
                 engine='auto',
                 random_state=42)
mca = mca.fit(Pokedex_Types_PCA)
print(np.sum(mca.explained_inertia_))

Types_MCA = is_Legendary[['type1', 'type2']]
mca2 = mca.fit(Types_MCA)
print(np.sum(mca2.explained_inertia_))

pca2 = prince.PCA(
    n_components=9,
    engine='sklearn',
    rescale_with_mean=False,
    rescale_with_std=False,
)
pca3 = pca2.fit(Pokedex_Types_PCA)
print(np.sum(pca3.explained_inertia_))
# Estimation: compute the principal components
C = pca.fit(X_quant).transform(X_quant)

# Percentage of variance explained by each component
print('Percentage of variance explained by each component: %s' %
      str(pca.explained_variance_ratio_))

# Decay of the explained variance
plt.plot(pca.explained_variance_ratio_)

# Graphical display
plt.boxplot(C[:, 0:5])
plt.scatter(C[:, 0], C[:, 1], c=target_name, label=[0, 1])

# Correlation circle
cercle = prince.PCA(X_quant, n_components=2)
cercle.plot_correlation_circle()

# Visualise the correlation matrix
corr = X_quant.corr()
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr,
            mask=np.zeros_like(corr, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True,
            ax=ax)

# Study of the time variable "steps"
distinct_step = fichier_credit[temps].unique()

# Amounts exchanged over time
Example #14
import os
import sys

sys.path.insert(1, os.path.join(sys.path[0], '..'))

import prince
from sklearn import datasets

X, y = datasets.load_iris(return_X_y=True)

pca = prince.PCA(rescale_with_mean=True, rescale_with_std=True,
                 n_components=2).fit(X)

print('Eigenvalues')
print(pca.eigenvalues_)
print(pca.explained_inertia_)
print('---')

print('U')
print(pca.U_[:5])
print('---')

print('V')
print(pca.V_)
print('---')

print('s')
print(pca.s_)
print('---')

print('Row coords')
Example #15
def Preprocess(data_frame,
               target=None,
               method='FAMD',
               samples=None,
               mapper=None,
               num_components=3,
               scaler=None,
               encode_method='Binary',
               target_encoder=None,
               data_encoder=None,
               data_columns_dict=None,
               target_column_dict=None,
               groups=None,
               normalization='l2'):

    # If no target supplied get as target the last column of df
    if not target: target = data_frame.columns.values.tolist()[-1]
    ''' TO DO: Fix PCA. '''
    if method == 'PCA':
        print('Dummy is not functioning properly.')
        method = 'MFA'

    normalization = normalization.lower()
    if normalization not in ['l1', 'l2', 'max', 'standard', None]:
        print('Not a valid normalization method; falling back to None')
        normalization = None

    if samples is not None:

        # Sample the data set and split it into training and testing sets.
        train_data = data_frame.loc[samples.iloc[:, :-1].values.flatten(), :]
        test_data = data_frame.loc[samples.iloc[:, -1].values.flatten(), :]
        train_target = train_data[target].copy()
        test_target = test_data[target].copy()
        train_data = train_data.drop(columns=[target])
        test_data = test_data.drop(columns=[target])

        # Encode the data sets
        train_data, data_encoder, data_columns_dict = Fit_Encode(
            train_data, method=encode_method)
        test_data, _, _ = Fit_Encode(test_data,
                                     mappings=data_encoder,
                                     columns_dict=data_columns_dict,
                                     method=encode_method)
        #print('Test','\n',train_data.iloc[0])
        train_target, target_encoder, target_column_dict = Fit_Encode(
            train_target, method=encode_method)
        test_target, _, _ = Fit_Encode(test_target,
                                       mappings=target_encoder,
                                       columns_dict=target_column_dict,
                                       method=encode_method)

    else:  # If no samples are supplied we process the entire data set as a whole.
        test_data = data_frame.copy()
        test_target = test_data[target].copy()
        test_data = test_data.drop(columns=[target])
        test_data, test_data_encoder, test_columns_dict = Fit_Encode(
            test_data,
            mappings=data_encoder,
            columns_dict=data_columns_dict,
            method=encode_method)
        print('Test Data Encoded')
        test_target, test_target_encoder, _ = Fit_Encode(
            test_target,
            mappings=target_encoder,
            columns_dict=target_column_dict,
            method=encode_method)
        print('Test targets encoded')

        # Drop the income column from data sets and get normalized vectors

    if method == 'MFA':

        if not groups:
            groups = {}
            for key in data_columns_dict.keys():
                names = ['_' + s for s in data_columns_dict[key]]
                column_headers = [x + y for x, y in it.product([key], names)]
                groups[key] = column_headers

        if not mapper:  # Create the MFA mapper.
            print('No mapper found')
            ''' Consider passing **kwargs in Preprocess func. to pass in mappers. '''
            mfa = pr.MFA(
                groups=groups,
                n_components=num_components,
                n_iter=100,
                #rescale_with_mean = True, # Does not work. Can use sklearn StandardScaler.
                #rescale_with_std = True,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=None)

        print('Fitting MFA')
        if samples is not None:

            # Vectors for training/test set
            if not mapper:  # fit only when no pre-fitted mapper was supplied
                mapper = mfa.fit(train_data)
            vecs_train = pd.DataFrame(mapper.row_coordinates(train_data))
            vecs_test = pd.DataFrame(mapper.transform(test_data))

            vecs_train, scaler = Normalization(vecs_train, normalization,
                                               scaler)
            vecs_test, scaler = Normalization(vecs_test, normalization, scaler)

            return vecs_train, train_target, vecs_test, test_target, data_columns_dict, target_column_dict, data_encoder, target_encoder, groups, target, mapper, scaler

        else:
            # Get the vectors created for the training set and normalise
            vecs_test = pd.DataFrame(mapper.transform(test_data))

            vecs_test, scaler = Normalization(vecs_test, normalization, scaler)
            ''' Consider returning a single dictionary with all parameters. Each case has 
                different number of returned variables.'''

            return vecs_test, test_target, test_data_encoder, test_target_encoder, mapper, target, scaler

    elif method == 'PCA':

        if not mapper:

            mapper = pr.PCA(n_components=num_components,
                            n_iter=100,
                            rescale_with_mean=True,
                            rescale_with_std=True,
                            copy=True,
                            check_input=True,
                            engine='auto',
                            random_state=None)

        if samples is not None:

            pca_train = mapper.fit(train_data)
            vecs_train = pd.DataFrame(pca_train.row_coordinates(train_data))
            vecs_test = pd.DataFrame(mapper.row_coordinates(test_data))

            if normalization in ['l1', 'l2', 'max']:
                scaler = None
                vecs_train = pd.DataFrame(preprocessing.normalize(
                    vecs_train, norm=normalization, axis=1),
                                          columns=vecs_train.columns)

                vecs_test = pd.DataFrame(preprocessing.normalize(
                    vecs_test, norm=normalization, axis=1),
                                         columns=vecs_test.columns)

            elif normalization == 'standard':
                scaler = preprocessing.StandardScaler()
                vecs_train = pd.DataFrame(scaler.fit_transform(vecs_train),
                                          columns=vecs_train.columns)
                vecs_test = pd.DataFrame(scaler.transform(vecs_test),
                                         columns=vecs_test.columns)

            return vecs_train, train_target, vecs_test, test_target, target_encoder, data_encoder, mapper, target, scaler

        else:

            test_data, data_encoder = encode_categorical(
                test_data[target].copy(),
                encode_method=encode_method,
                encoder=data_encoder)

            pca_test = mapper.fit(test_data)
            vecs_test = pd.DataFrame(pca_test.row_coordinates(test_data))

            if normalization in ['l1', 'l2', 'max']:
                scaler = None
                vecs_test = pd.DataFrame(preprocessing.normalize(
                    vecs_test, norm=normalization, axis=1),
                                         columns=vecs_test.columns)

            elif normalization == 'standard':
                scaler = preprocessing.StandardScaler()
                vecs_test = pd.DataFrame(scaler.fit_transform(vecs_test),
                                         columns=vecs_test.columns)

            return vecs_test, test_target, mapper, target
Example #16
 def test_explained_inertia_(self):
     pca = prince.PCA(n_components=4)
     pca.fit(self.X)
     self.assertTrue(np.isclose(sum(pca.explained_inertia_), 1))
Example #17
import pandas as pd
import prince
import matplotlib.pyplot as plt

# Load the train and test data
# df = pd.read_csv('data/datalab_persona_run1_with_scale_cont.csv')
df = pd.read_csv('data/iris.csv')

pca = prince.PCA(df, n_components=-1)

# Set the axes you want to examine below, i.e. which component pair you are interested in - (0, 1)

components = (0, 1)

pca.plot_rows(axes=components, color_by='class', ellipse_fill=True)
pca.plot_correlation_circle(axes=components)
pca.plot_cumulative_inertia()
pca.plot_inertia()

plt.show()