def test_copy(self):
    XX = np.copy(self.X)

    pca = prince.PCA(n_components=2, copy=True)
    pca.fit(XX)
    np.testing.assert_array_equal(self.X, XX)

    pca = prince.PCA(n_components=2, copy=False)
    pca.fit(XX)
    self.assertRaises(AssertionError, np.testing.assert_array_equal, self.X, XX)
def acpReduction(self):
    # Older prince API: the data frame is passed directly to the constructor
    # and n_components=-1 keeps every component.
    pca = prince.PCA(self.data[list(self.data.columns[0:8])], n_components=-1)
    pca.plot_rows(ellipse_fill=True)
    pca.plot_correlation_circle()
    # pca = PCA(n_components=2)
def pca_prince(self, df, cols, color_labels=None):
    """
    Prince's implementation of 2D PCA with visualization.

    cols: a list of numerical column names
    color_labels: name of a categorical label column used to colour the points
    """
    pca = prince.PCA(n_components=2,
                     n_iter=3,
                     rescale_with_mean=True,
                     rescale_with_std=True,
                     copy=True,
                     check_input=True,
                     engine='auto')
    pca = pca.fit(df[cols])

    fig, ax = plt.subplots(figsize=(10, 10))
    pca.plot_row_coordinates(df[cols],
                             ax=ax,
                             x_component=0,
                             y_component=1,
                             labels=None,
                             color_labels=df[color_labels] if color_labels is not None else None,
                             ellipse_outline=False,
                             ellipse_fill=True,
                             show_points=True)
    fig.show()
    return pca
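# A minimal, standalone sketch of the same prince calls that pca_prince makes,
# assuming prince ~0.7 and using scikit-learn's iris data purely as illustrative
# input (neither is part of the original code).
import matplotlib.pyplot as plt
import prince
from sklearn import datasets

iris = datasets.load_iris(as_frame=True)
df = iris.frame
num_cols = [c for c in df.columns if c != "target"]

pca = prince.PCA(n_components=2, n_iter=3, rescale_with_mean=True,
                 rescale_with_std=True, engine='auto')
pca = pca.fit(df[num_cols])

fig, ax = plt.subplots(figsize=(10, 10))
pca.plot_row_coordinates(df[num_cols], ax=ax, x_component=0, y_component=1,
                         color_labels=df["target"], ellipse_fill=True)
plt.show()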
def __init__(self, data, *args, **kws):
    '''
    Fit a PCA using the prince module.

    Inputs:
        data    : pandas DataFrame with the variables to compute the PCAs
        ncomp   : number of PCA components. Default equals the number of variables
        niter   : number of iterations used for computing the SVD
        inplace : whether to perform the computations inplace or not. Default is True
        seed    : seed for the random state
        invert  : list with indexes of the principal components to invert the axis
                  (multiply by -1)

    Outputs:
        PCA object with fitted model and scores
    '''
    # arguments
    ncomp = kws.get("ncomp", data.shape[1])
    niter = kws.get('niter', 10)
    inplace = kws.get('inplace', True)
    seed = kws.get('seed', 666)
    invert = kws.get('invert', None)

    # pca
    model = prince.PCA(
        n_components=ncomp,
        n_iter=niter,
        rescale_with_mean=True,
        rescale_with_std=True,
        copy=inplace,  # prince's `copy` flag: True means prince works on a copy of the data
        check_input=True,
        engine='sklearn',
        random_state=seed
    )
    self.data = data
    self.invert = invert
    self.fit = model.fit(data)
    self.scores = self.fit.fit_transform(data)
    self.scores.columns = [f"Comp{i}" for i in range(1, self.scores.shape[1] + 1)]

    if invert:
        assert isinstance(invert, list), "'invert' must be a list"
        invvalues = all([x == 1 or x == -1 for x in invert])
        assert invvalues, "Values in 'invert' must be either 1 or -1"
        assert len(invert) == self.scores.shape[1], (
            "'invert' must have as many elements as the components of the "
            f"PCAs computed: {self.scores.shape[1]}")
        for i, mult in enumerate(invert):
            self.scores[f"Comp{i+1}"] = mult * self.scores[f"Comp{i+1}"]
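# Standalone sketch of what the wrapper above does with its `invert` option,
# assuming prince ~0.7; the random toy data and the invert vector are
# illustrative only, not part of the original code.
import numpy as np
import pandas as pd
import prince

rng = np.random.default_rng(666)
toy = pd.DataFrame(rng.normal(size=(100, 3)), columns=["a", "b", "c"])

model = prince.PCA(n_components=2, n_iter=10, rescale_with_mean=True,
                   rescale_with_std=True, engine='sklearn', random_state=666)
scores = model.fit(toy).transform(toy)
scores.columns = [f"Comp{i}" for i in range(1, scores.shape[1] + 1)]

invert = [1, -1]  # keep the first axis, flip the sign of the second
for i, mult in enumerate(invert):
    scores[f"Comp{i + 1}"] = mult * scores[f"Comp{i + 1}"]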
def test_plot_row_principal_coordinates(self):
    pca = prince.PCA(n_components=4)
    pca.fit(self.X)
    ax = pca.plot_row_principal_coordinates(self.X)
    self.assertTrue(isinstance(ax, mpl.axes.Axes))
def test_fit_pandas_dataframe(self):
    pca = prince.PCA(n_components=2)
    self.assertTrue(isinstance(pca.fit(pd.DataFrame(self.X)), prince.PCA))
def test_fit_numpy_array(self):
    pca = prince.PCA(n_components=2)
    self.assertTrue(isinstance(pca.fit(self.X), prince.PCA))
def dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, use_famd=False, seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space and
    determine the init coefficients in that space.

    y (numobs x p ndarray): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    k (1d array): The number of components of the latent Gaussian mixture layers
    r (int): The dimension of latent variables
    nj (p 1darray): For binary/count data: the maximum values that the variable can take.
                    For ordinal data: the number of different existing categories for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y
    use_famd (Bool): Whether to use the FAMD method (True) or not (False) to initiate the
                     first continuous latent variable. Otherwise MCA is used.
    seed (None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------------------
    returns (dict): All initialisation parameters
    '''
    L = len(k)
    numobs = len(y)
    S = np.prod(k)

    #==============================================================
    # Dimension reduction performed with MCA
    #==============================================================

    if type(y) != pd.core.frame.DataFrame:
        raise TypeError('y should be a dataframe for prince')

    if (np.array(var_distrib) == 'ordinal').all():
        print('PCA init')
        pca = prince.PCA(n_components=r[0], n_iter=3, rescale_with_mean=True,
                         rescale_with_std=True, copy=True, check_input=True,
                         engine='auto', random_state=seed)
        z1 = pca.fit_transform(y).values
    elif use_famd:
        famd = prince.FAMD(n_components=r[0], n_iter=3, copy=True, check_input=False,
                           engine='auto', random_state=seed)
        z1 = famd.fit_transform(y).values
    else:
        # check_input=False to remove
        mca = prince.MCA(n_components=r[0], n_iter=3, copy=True,
                         check_input=False, engine='auto', random_state=seed)
        z1 = mca.fit_transform(y).values

    z = [z1]
    y = y.values

    #==============================================================
    # Set the shape parameters of each data type
    #==============================================================

    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',
                               var_distrib == 'binomial')].astype(int)
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',
                              var_distrib == 'binomial')]
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal'].astype(float).astype(int)
    nj_ord = nj[var_distrib == 'ordinal']
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical']
    nb_categ = len(nj_categ)

    # Set y_cont standard error to 1
    y_cont = y[:, var_distrib == 'continuous']
    # Before was np.float
    y_cont = y_cont / np.std(y_cont.astype(float), axis=0, keepdims=True)
    nb_cont = y_cont.shape[1]

    #=======================================================
    # Determining the Gaussian Parameters
    #=======================================================
    init = {}

    eta = []
    H = []
    psi = []
    paths_pred = np.zeros((numobs, L))

    for l in range(L):
        params = get_MFA_params(z[l], k[l], r[l:])
        eta.append(params['eta'][..., n_axis])
        H.append(params['H'])
        psi.append(params['psi'])
        z.append(params['z_nextl'])
        paths_pred[:, l] = params['classes']

    paths, nb_paths = np.unique(paths_pred, return_counts=True, axis=0)
    paths, nb_paths = add_missing_paths(k, paths, nb_paths)

    w_s = nb_paths / numobs
    w_s = np.where(w_s == 0, 1E-16, w_s)

    # Check that all paths have been explored
    if len(paths) != S:
        raise RuntimeError(f'Real path length is {S} while the initial number '
                           f'of paths was only {len(paths)}')

    w_s = w_s.reshape(*k).flatten('C')

    #=============================================================
    # Enforcing identifiability constraints over the first layer
    #=============================================================
    H = diagonal_cond(H, psi)
    Ez, AT = compute_z_moments(w_s, eta, H, psi)
    eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

    init['eta'] = eta
    init['H'] = H
    init['psi'] = psi

    init['w_s'] = w_s  # Probabilities of each path through the network
    init['z'] = z

    # The clustering layer is the one used to perform the clustering,
    # i.e. the layer l such that k[l] == n_clusters
    clustering_layer = np.argmax(np.array(k) == n_clusters)

    init['classes'] = paths_pred[:, clustering_layer]  # 0 To change with clustering_layer_idx

    #=======================================================
    # Determining the coefficients of the GLLVM layer
    #=======================================================

    # Determining lambda_bin coefficients
    lambda_bin = np.zeros((nb_bin, r[0] + 1))

    for j in range(nb_bin):
        Nj = np.max(y_bin[:, j])  # The support of the jth binomial is [1, Nj]

        if Nj == 1:  # If the variable is Bernoulli, not binomial
            yj = y_bin[:, j]
            z_new = z[0]
        else:  # If not, need to convert binomial output to Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], z[0])

        lr = LogisticRegression()

        if j < r[0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate([lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])

    ## Identifiability of bin coefficients
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT[0][0]

    # Determining lambda_ord coefficients
    lambda_ord = []

    for j in range(nb_ord):
        Nj = len(np.unique(y_ord[:, j], axis=0))  # The support of the jth ordinal is [1, Nj]
        yj = y_ord[:, j]

        ol = OrderedLogit()
        ol.fit(z[0], yj)

        ## Identifiability of ordinal coefficients
        beta_j = (ol.beta_.reshape(1, r[0]) @ AT[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining the coefficients of the continuous variables
    lambda_cont = np.zeros((nb_cont, r[0] + 1))

    for j in range(nb_cont):
        yj = y_cont[:, j]
        linr = LinearRegression()

        if j < r[0] - 1:
            linr.fit(z[0][:, :j + 1], yj)
            lambda_cont[j, :j + 2] = np.concatenate([[linr.intercept_], linr.coef_])
        else:
            linr.fit(z[0], yj)
            lambda_cont[j] = np.concatenate([[linr.intercept_], linr.coef_])

    ## Identifiability of continuous coefficients
    lambda_cont[:, 1:] = lambda_cont[:, 1:] @ AT[0][0]

    # Determining lambda_categ coefficients
    lambda_categ = []

    for j in range(nb_categ):
        yj = y_categ[:, j]

        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(z[0], yj)

        ## Identifiability of categ coefficients
        beta_j = lr.coef_ @ AT[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_cont'] = lambda_cont
    init['lambda_categ'] = lambda_categ

    return init
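# Standalone sketch of the MCA branch that produces the first latent layer z1
# above, assuming prince ~0.7; the purely categorical toy dataframe is
# illustrative only.
import pandas as pd
import prince

y_toy = pd.DataFrame({
    "colour": ["red", "blue", "red", "green", "blue", "red"],
    "size":   ["S",   "M",    "L",   "M",     "S",    "L"],
})

mca = prince.MCA(n_components=2, n_iter=3, copy=True,
                 check_input=False, engine='auto', random_state=0)
z1 = mca.fit_transform(y_toy).values  # continuous embedding of shape (numobs, r[0])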
# Transform a few numerical features to categorical because of their meaning
comb['MSSubClass'] = comb['MSSubClass'].astype(str)
comb['MoSold'] = comb['MoSold'].astype(str)

#%%
""" Dimensionality reduction """
# ATTENTION: TIME CONSUMING
# DO NOT USE TOGETHER WITH FEATURE IMPORTANCE ANALYSIS
dim_red = False

if dim_red:
    import prince

    # Here you can choose between PCA and FAMD
    pca = True

    if pca:
        # One-hot encoding
        dummies = pd.get_dummies(comb)
        pca = prince.PCA(n_components=50)
        pca = pca.fit(dummies)
        expl = pca.explained_inertia_
        cum = np.cumsum(expl)[-1]
        print("Explained variance " + str(cum))
        dummies = pca.transform(dummies)
    else:
        famd = prince.FAMD(n_components=50)
        famd = famd.fit(comb)
        expl = famd.explained_inertia_
        cum = np.cumsum(expl)[-1]
        print("Explained variance " + str(cum))
        comb = famd.transform(comb)

# One-hot encoding
dummies = pd.get_dummies(comb)
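# Small standalone sketch of the "explained variance" check used above,
# assuming a prince version (< 0.8) that still exposes explained_inertia_;
# the random toy data is illustrative only.
import numpy as np
import pandas as pd
import prince

rng = np.random.default_rng(0)
toy = pd.DataFrame(rng.normal(size=(200, 10)))

pca = prince.PCA(n_components=5).fit(toy)
cum = np.cumsum(pca.explained_inertia_)[-1]  # share of total inertia kept by 5 components
print("Explained variance " + str(cum))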
import numpy as np
import Data_util
import prince
from sklearn import model_selection
from sklearn.cross_decomposition import CCA

data = Data_util.read_data("data/adult.data")
training_data, training_labels = Data_util.class2vect(data)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    training_data, training_labels, train_size=0.7, test_size=0.3)

pca = prince.PCA(n_components=70,
                 n_iter=3,
                 copy=True,
                 rescale_with_mean=True,
                 rescale_with_std=True,
                 engine='auto',
                 random_state=42)
pca = pca.fit(X_train)

print([100 * ei for ei in pca.explained_inertia_])
print(sum(pca.explained_inertia_))
print(pca.row_coordinates(X_test[:5]))
print(pca)
feature_names = data.columns.str.startswith("var_")
predictors = data[data.columns[feature_names]]
labels = data["Target_Practice"]

ix_training = data.train == 1
training_data = predictors[ix_training]
training_labels = labels[ix_training]

ix_testing = data.train == 0
testing_data = predictors[ix_testing]
testing_labels = labels[ix_testing]

sns.displot(training_data.values.flatten(), bins="sqrt", kde=True)

pca = prince.PCA(n_components=2, as_array=False).fit(training_data)
pca.plot_row_coordinates(training_data, color_labels=training_labels)
pca.column_correlations(training_data).plot.scatter(x=0, y=1)  # weird column name

#%% Roshan Sharma model
mdl_data = {  # problem with JSON dump => cast to python native type
    'N': ix_training.sum().tolist(),
    'N2': ix_testing.sum().tolist(),
    'K': feature_names.sum().tolist(),
    'y': training_labels.values.tolist(),
    'X': training_data.values.tolist(),
    'new_X': testing_data.values.tolist(),
}
Pokedex_Types = pd.concat([Pokedex_Types, dfOneHot], axis=1)

dfOneHot = pd.DataFrame(
    t2_ohe_array,
    columns=["type2_" + str(int(i)) for i in range(t2_ohe_array.shape[1])])
Pokedex_Types = pd.concat([Pokedex_Types, dfOneHot], axis=1)

Pokedex_Types_PCA = Pokedex_Types.drop(Pokedex_Types.columns[[0, 1, 2, 3, 4]], axis=1)

mca = prince.MCA(n_components=37, n_iter=100, copy=False, engine='auto', random_state=42)
mca = mca.fit(Pokedex_Types_PCA)
print(np.sum(mca.explained_inertia_))

Types_MCA = is_Legendary[['type1', 'type2']]
mca2 = mca.fit(Types_MCA)
print(np.sum(mca2.explained_inertia_))

pca2 = prince.PCA(
    n_components=9,
    engine='sklearn',
    rescale_with_mean=False,
    rescale_with_std=False,
)
pca3 = pca2.fit(Pokedex_Types_PCA)
print(np.sum(pca3.explained_inertia_))
# Estimation: compute the principal components
C = pca.fit(X_quant).transform(X_quant)

# Percentage of variance explained by each component
print('Percentage of variance explained by each component: %s' %
      str(pca.explained_variance_ratio_))

# Decay of the explained variance
plt.plot(pca.explained_variance_ratio_)

# Graphical display
plt.boxplot(C[:, 0:5])
plt.scatter(C[:, 0], C[:, 1], c=target_name, label=[0, 1])

# Correlation circle (older prince API: the data frame is passed to the constructor)
cercle = prince.PCA(X_quant, n_components=2)
cercle.plot_correlation_circle()

# Visualisation of the correlation matrix
corr = X_quant.corr()
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr,
            mask=np.zeros_like(corr, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True,
            ax=ax)

# Study of the time variable "steps"
distinct_step = fichier_credit[temps].unique()

# Amounts exchanged over time
import os
import sys

sys.path.insert(1, os.path.join(sys.path[0], '..'))

import prince
from sklearn import datasets

X, y = datasets.load_iris(return_X_y=True)

pca = prince.PCA(rescale_with_mean=True, rescale_with_std=True, n_components=2).fit(X)

print('Eigenvalues')
print(pca.eigenvalues_)
print(pca.explained_inertia_)
print('---')
print('U')
print(pca.U_[:5])
print('---')
print('V')
print(pca.V_)
print('---')
print('s')
print(pca.s_)
print('---')
print('Row coords')
def Preprocess(data_frame,
               target=None,
               method='FAMD',
               samples=None,
               mapper=None,
               num_components=3,
               scaler=None,
               encode_method='Binary',
               target_encoder=None,
               data_encoder=None,
               data_columns_dict=None,
               target_column_dict=None,
               groups=None,
               normalization='l2'):

    # If no target is supplied, use the last column of the data frame as target
    if not target:
        target = data_frame.columns.values.tolist()[-1]

    '''
    TO DO: Fix PCA.
    '''
    if method == 'PCA':
        print('Dummy is not functioning properly.')
        method = 'MFA'

    normalization = normalization.lower()
    if normalization not in ['l1', 'l2', 'max', 'standard', None]:
        print('Not a valid normalization method; changing to None')
        normalization = None

    if samples is not None:
        # Sample the data set, split into training and testing sets.
        train_data = data_frame.loc[samples.iloc[:, :-1].values.flatten(), :]
        test_data = data_frame.loc[samples.iloc[:, -1].values.flatten(), :]
        train_target = train_data[target].copy()
        test_target = test_data[target].copy()
        train_data = train_data.drop(columns=[target])
        test_data = test_data.drop(columns=[target])

        # Encode the data sets
        train_data, data_encoder, data_columns_dict = Fit_Encode(
            train_data, method=encode_method)
        test_data, _, _ = Fit_Encode(test_data,
                                     mappings=data_encoder,
                                     columns_dict=data_columns_dict,
                                     method=encode_method)
        #print('Test', '\n', train_data.iloc[0])
        train_target, target_encoder, target_column_dict = Fit_Encode(
            train_target, method=encode_method)
        test_target, _, _ = Fit_Encode(test_target,
                                       mappings=target_encoder,
                                       columns_dict=target_column_dict,
                                       method=encode_method)
    else:
        # If no samples are supplied, process the entire data set as a whole.
        test_data = data_frame.copy()
        test_target = test_data[target].copy()
        test_data = test_data.drop(columns=[target])
        test_data, test_data_encoder, test_columns_dict = Fit_Encode(
            test_data,
            mappings=data_encoder,
            columns_dict=data_columns_dict,
            method=encode_method)
        print('Test Data Encoded')
        test_target, test_target_encoder, _ = Fit_Encode(
            test_target,
            mappings=target_encoder,
            columns_dict=target_column_dict,
            method=encode_method)
        print('Test targets encoded')

    # Drop the income column from the data sets and get normalized vectors
    if method == 'MFA':
        if not groups:
            groups = {}
            for key in data_columns_dict.keys():
                names = ['_' + s for s in data_columns_dict[key]]
                column_headers = [x + y for x, y in it.product([key], names)]
                groups[key] = column_headers

        if not mapper:
            # Create the MFA mapper.
            print('No mapper found')
            '''
            Consider passing **kwargs in Preprocess func. to pass in mappers.
            '''
            mfa = pr.MFA(
                groups=groups,
                n_components=num_components,
                n_iter=100,
                #rescale_with_mean=True,  # Does not work. Can use sklearn StandardScaler.
                #rescale_with_std=True,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=None)
            print('Fitting MFA')

        if samples is not None:
            # Vectors for training/test set
            mapper = mfa.fit(train_data)
            vecs_train = pd.DataFrame(mapper.row_coordinates(train_data))
            vecs_test = pd.DataFrame(mapper.transform(test_data))
            vecs_train, scaler = Normalization(vecs_train, normalization, scaler)
            vecs_test, scaler = Normalization(vecs_test, normalization, scaler)
            return (vecs_train, train_target, vecs_test, test_target,
                    data_columns_dict, target_column_dict, data_encoder,
                    target_encoder, groups, target, mapper, scaler)
        else:
            # Get the vectors created for the test set and normalise
            vecs_test = pd.DataFrame(mapper.transform(test_data))
            vecs_test, scaler = Normalization(vecs_test, normalization, scaler)
            '''
            Consider returning a single dictionary with all parameters.
            Each case has a different number of returned variables.
            '''
            return (vecs_test, test_target, test_data_encoder,
                    test_target_encoder, mapper, target, scaler)

    elif method == 'PCA':
        if not mapper:
            mapper = pr.PCA(n_components=num_components,
                            n_iter=100,
                            rescale_with_mean=True,
                            rescale_with_std=True,
                            copy=True,
                            check_input=True,
                            engine='auto',
                            random_state=None)

        if samples is not None:
            pca_train = mapper.fit(train_data)
            vecs_train = pd.DataFrame(pca_train.row_coordinates(train_data))
            pca_test = mapper.transform(test_data)
            vecs_test = pd.DataFrame(pca_test.row_coordinates(test_data))

            if normalization in ['l1', 'l2', 'max']:
                scaler = None
                vecs_train = pd.DataFrame(preprocessing.normalize(
                    vecs_train, norm=normalization, axis=1),
                                          columns=vecs_train.columns)
                vecs_test = pd.DataFrame(preprocessing.normalize(
                    vecs_test, norm=normalization, axis=1),
                                         columns=vecs_test.columns)
            elif normalization == 'standard':
                scaler = preprocessing.StandardScaler()
                vecs_train = pd.DataFrame(scaler.fit_transform(vecs_train),
                                          columns=vecs_train.columns)
                vecs_test = pd.DataFrame(scaler.fit_transform(vecs_test),
                                         columns=vecs_test.columns)

            return (vecs_train, train_target, vecs_test, test_target,
                    target_encoder, data_encoder, mapper, target, scaler)
        else:
            test_data, data_encoder = encode_categorical(
                test_data[target].copy(),
                encode_method=encode_method,
                encoder=data_encoder)
            pca_test = mapper.fit(test_data)
            vecs_test = pd.DataFrame(pca_test.row_coordinates(test_data))

            if normalization in ['l1', 'l2', 'max']:
                scaler = None
                vecs_test = pd.DataFrame(preprocessing.normalize(
                    vecs_test, norm=normalization, axis=1),
                                         columns=vecs_test.columns)
            elif normalization == 'standard':
                scaler = preprocessing.StandardScaler()
                vecs_test = pd.DataFrame(scaler.fit_transform(vecs_test),
                                         columns=vecs_test.columns)

            return vecs_test, test_target, mapper, target
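# Standalone sketch of the MFA call made in the 'MFA' branch of Preprocess,
# assuming prince ~0.7; the toy one-hot dataframe and the group definitions
# are illustrative only, not part of the original code.
import pandas as pd
import prince as pr

toy = pd.DataFrame({
    "colour_red":  [1, 0, 1, 0, 1, 0],
    "colour_blue": [0, 1, 0, 1, 0, 1],
    "size_S":      [1, 1, 0, 0, 1, 0],
    "size_L":      [0, 0, 1, 1, 0, 1],
})
groups = {"colour": ["colour_red", "colour_blue"],
          "size":   ["size_S", "size_L"]}

mfa = pr.MFA(groups=groups, n_components=2, n_iter=10, engine='auto', random_state=None)
mapper = mfa.fit(toy)
vecs = pd.DataFrame(mapper.row_coordinates(toy))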
def test_explained_inertia_(self):
    pca = prince.PCA(n_components=4)
    pca.fit(self.X)
    self.assertTrue(np.isclose(sum(pca.explained_inertia_), 1))
import pandas as pd
import prince
import matplotlib.pyplot as plt

# Generate train and test data
# df = pd.read_csv('data/datalab_persona_run1_with_scale_cont.csv')
df = pd.read_csv('data/iris.csv')

# Older prince API: the data frame is passed directly to the constructor
# and n_components=-1 keeps every component.
pca = prince.PCA(df, n_components=-1)

# Set the axes you want to examine below, i.e. which component pair you are interested in - (0, 1)
components = (0, 1)

pca.plot_rows(axes=components, color_by='class', ellipse_fill=True)
pca.plot_correlation_circle(axes=components)
pca.plot_cumulative_inertia()
pca.plot_inertia()

plt.show()