def apply_mca_df_modalities_contributions(df, nb_factors):
    """
    Compute the contribution of each modality to the first MCA factors.

    An MCA is fitted on ``df`` with the Benzecri correction switched on, and
    the column contributions of the first ``nb_factors`` factors are
    collected into a table, multiplied by 1000 and rounded to one decimal.
    A high value means the modality contributes strongly to that axis.

    :param df: disjunctive (dummy-coded) table; level 0 of its column index
        is read as the name of the original variable
    :param nb_factors: number of factors to report
    :returns: DataFrame whose rows are ``('contributions', 1..nb_factors)``
        and whose columns are ``df.columns``
    """
    # K = number of distinct level-0 column labels, i.e. original variables.
    ncols = len(df.columns.get_level_values(0).unique())

    # Per the original author's notes: with the Benzecri correction the
    # eigenvalues below 1/K are dropped and the remaining ones are rescaled.
    mca_ben = mca.MCA(df, ncols=ncols, benzecri=True)

    table_modalities_mca_contribution = pd.DataFrame(
        columns=df.columns,
        index=pd.MultiIndex.from_product(
            [['contributions'], range(1, nb_factors + 1)]))

    # Ask for exactly nb_factors factors so the values line up with the
    # nb_factors rows created above (this used to be hard-coded to N=10).
    table_modalities_mca_contribution.loc['contributions', :] = (
        mca_ben.cont_c(N=nb_factors).T * 1000)

    return np.round(table_modalities_mca_contribution.astype(float), 1)
def run_mca(X):
    """
    Fit a Multiple Correspondence Analysis (MCA) on the input array.

    :param X: array for which MCA is to be performed
    :returns: tuple ``(model, scores)``: the fitted ``mca.MCA`` instance and
        the result of calling ``fs_r(1)`` on it
    """
    model = mca.MCA(X)
    scores = model.fs_r(1)
    return model, scores
def apply_mca(df, benzecri):
    """
    Build the MCA object for a disjunctive (dummy-coded) table.

    The number of original variables handed to the library is the number of
    distinct labels found at level 0 of ``df``'s column index.

    :param df: disjunctive table with the variable name at column level 0
    :param benzecri: forwarded unchanged to ``mca.MCA`` as its ``benzecri``
        argument
    :returns: the ``mca.MCA`` instance
    """
    variable_names = df.columns.get_level_values(0).unique()
    return mca.MCA(df, ncols=len(variable_names), benzecri=benzecri)
def get_data(df, lable, processing='standardization'):
    """
    Assemble a feature matrix from ``df`` with the label as last column.

    ``df`` is cast to int64 and split by column position: the first nine
    columns form the continuous part, the remaining ones the discrete part.
    The continuous part is optionally rescaled; the discrete part is
    replaced by the result of ``fs_r_sup(..., 18)`` of an MCA fitted on it.

    :param df: input table (first nine columns continuous, rest discrete)
    :param lable: label column; its ``.values`` are appended as last column
    :param processing: ``'scaler'`` for min-max scaling,
        ``'standardization'`` for standard scaling; any other value leaves
        the continuous part unscaled
    :returns: 2-D array ``[continuous | discrete | label]``
    """
    as_int = df.astype('int64')
    continuous = as_int.drop(df.columns[9:], axis=1)
    discrete = as_int.drop(df.columns[0:9], axis=1)
    labels = lable.values

    # Pick the scaler first, apply it in one place.
    scaler = None
    if processing == 'scaler':
        scaler = preprocessing.MinMaxScaler()
    elif processing == 'standardization':
        scaler = preprocessing.StandardScaler()
    if scaler is not None:
        continuous = scaler.fit_transform(continuous)

    mca_counts = mca.MCA(discrete)
    discrete = mca_counts.fs_r_sup(discrete, 18)

    features = np.concatenate((continuous, discrete), axis=1)
    return np.append(features, labels[:, None], axis=1)
def apply_mca_df_modalities(df, df_label_disj_, nb_factors):
    """
    Project the modalities onto the first ``nb_factors`` MCA factors.

    The MCA is fitted on ``df`` with the Benzecri correction. The three
    label modalities are not part of the fit: they are projected afterwards
    as supplementary columns and added under the ``'label'`` variable.

    :param df: disjunctive table with the variable name at column level 0
    :param df_label_disj_: disjunctive table of the label; its first three
        projected entries are stored as 'pas de risque', 'a risque' and
        'psychose', in that order
    :param nb_factors: number of factors to keep
    :returns: DataFrame with rows ``('Factor', 1..nb_factors)``, one column
        per modality plus the three label columns, rounded to 2 decimals
    """
    fs = 'Factor'
    label_modalities = ('pas de risque', 'a risque', 'psychose')

    # K = number of original variables (distinct level-0 column labels).
    variable_count = len(df.columns.get_level_values(0).unique())
    mca_ben = mca.MCA(df, ncols=variable_count, benzecri=True)

    factor_rows = pd.MultiIndex.from_product(
        [[fs], range(1, nb_factors + 1)])
    table = pd.DataFrame(columns=df.columns, index=factor_rows)

    # Coordinates of the active modalities on the first nb_factors factors.
    table.loc[fs, :] = mca_ben.fs_c(N=nb_factors).T

    # Supplementary projection of the label modalities.
    label_scores = mca_ben.fs_c_sup(df_label_disj_, N=nb_factors)
    for position, modality in enumerate(label_modalities):
        table.loc[fs, ('label', modality)] = label_scores[position]

    return np.round(table.astype(float), 2)
# Collect the clusters and medoids of the k-medoids run set up before this
# fragment (kmedoids_instance10 is defined outside the visible code).
clusters10 = kmedoids_instance10.get_clusters()
medoids10 = kmedoids_instance10.get_medoids()
# Second k-medoids run on the precomputed distance matrix dist5, using the
# same initial medoids.
kmedoids_instance5 = kmedoids(dist5, initial_medoids, data_type='distance_matrix')
kmedoids_instance5.process()
clusters5 = kmedoids_instance5.get_clusters()
medoids5 = kmedoids_instance5.get_medoids()
# NEW CODE - CHANGED 13/11/2019
##########################################################################
## Reduce dimensions using MCA algorithm
# Dummy-code the 'categories' column and fit an MCA on the dummy table.
dum = pd.get_dummies(exp_rest['categories'])
# ncols is given the number of dummy columns here.
num_col = len(dum.columns)
mca_ben = mca.MCA(dum, ncols=num_col)
teste = (mca_ben.fs_r())
# First two row factor scores, transposed (one row per factor).
factor = mca_ben.fs_r(N=2).T
# NOTE(review): `teste` is the value returned by fs_r(); reading `.L` from it
# looks like it was meant to be `mca_ben.L` -- confirm, this may raise.
teste.L
# Transposed back: one row per observation, one column per factor.
factort = factor.T
factort[:, 0]
# Store the two factor scores as new columns of exp_rest.
exp_rest['Fac1'] = factort[:, 0].tolist()
exp_rest['Fac2'] = factort[:, 1].tolist()
# NOTE(review): this rebinds the name `mca`, which is used above as
# `mca.MCA(...)`; any later `mca.MCA` call would hit this object instead.
mca = prince.MCA(n_components=2, n_iter=3, copy=True, check_input=True, engine='auto', random_state=42)
def CA(dataframe):
    """
    Run an MCA on ``dataframe`` and plot its levels on the first two factors.

    ``dataframe`` is dummy-coded with ``pd.get_dummies``. Every resulting
    column name is split on ``'_'`` and must have at least three parts
    (an ``IndexError`` is raised otherwise): part 0 becomes ``Variable``,
    part 1 becomes ``hue`` and parts 1 and 2 form ``SKU(name)``.

    Each distinct ``hue`` value (in sorted order) is mapped to a short
    letter: the first two get ``'F'`` and ``'M'``, the following ones
    ``'A'``, ``'B'``, ... The scatter plot is coloured by ``hue`` and every
    point is annotated with ``<letter>_<part 2>``.

    :param dataframe: categorical table to analyse
    :returns: DataFrame with columns ``Factor1``, ``Factor2``,
        ``levelnames``, ``Variable``, ``hue``, ``SKU(name)``, ``letter``
        and ``SKU(letter)`` (one row per dummy column)
    """
    import mca
    import matplotlib.pyplot as plt
    import seaborn as sns

    df_dummies = pd.get_dummies(dataframe)
    # NOTE(review): ncols is fixed to 2 regardless of how many variables
    # `dataframe` has -- confirm this is intended.
    mca_ben = mca.MCA(df_dummies, ncols=2)
    fs = mca_ben.fs_c(N=2).T  # fs: factor scores, one row per factor

    # Split every level name once and build the text columns directly,
    # rather than writing strings cell by cell into float placeholders.
    parts = [name.split('_') for name in df_dummies.columns]
    plotdata = pd.DataFrame({
        'Factor1': fs[0],
        'Factor2': fs[1],
        'levelnames': df_dummies.columns,
        'Variable': [p[0] for p in parts],
        'hue': [p[1] for p in parts],
        'SKU(name)': [p[1] + '_' + p[2] for p in parts],
    })

    sns.set(color_codes=True)
    # Font settings kept from the original (SimHei font, ASCII minus sign).
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    # Sorted distinct hue values; the first two are treated specially.
    hue_levels = sorted(plotdata['hue'].unique())
    small_name_num = len(hue_levels) - 2
    list_letter = ['F', 'M'] + [
        chr(65 + offset) for offset in range(small_name_num)
    ]
    smallname_to_letter = dict(zip(hue_levels, list_letter))

    plotdata['letter'] = plotdata['hue'].map(smallname_to_letter)
    plotdata['SKU(letter)'] = [
        letter + '_' + p[2] for letter, p in zip(plotdata['letter'], parts)
    ]

    # Triangles for the first two hue levels, circles for the others.
    sns.lmplot(x="Factor1",
               y="Factor2",
               hue="hue",
               data=plotdata,
               fit_reg=False,
               markers=["^", "^"] + ["o"] * small_name_num,
               palette="Set1")

    labels = plotdata['SKU(letter)']
    for label, x, y in zip(labels, plotdata.Factor1, plotdata.Factor2):
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(-5, 5),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     bbox=dict(boxstyle='round,pad=0.5',
                               fc='yellow',
                               alpha=0.5),
                     fontsize=5,
                     arrowprops=dict(arrowstyle='->',
                                     connectionstyle='arc3,rad=0'))
    return plotdata
plt.grid(True) plt.xlabel('Number of clusters') plt.ylabel('Percentage of enterprises in the largest cluster (%)') plt.title('Percentage of enterprises in the largest cluster') fig.savefig(''.join([ 'C:/Users/Jairo F Gudiño R/Desktop/Balance Sheet Commonality/', 'LargestCluster', '.pdf' ])) # Gamma Value: 0.49999999999999989 # Additional Analysis # # MCA & PCA Analysis # F = 20 x_dummy = mca.dummy(df_norm.iloc[:, [-3, -2, -1]]) mca_ben = mca.MCA(x_dummy, ncols=3) explained_variance = mca_ben.expl_var(greenacre=False, N=F) * 100 explained_variance.sum() # MCA Explained Variance # MCAcolumns = [("F" + str(i + 1)) for i in range(F)] fig, Graph = plt.subplots() Graph =, explained_variance, align='center', alpha=0.5) plt.xticks(np.arange(len(MCAcolumns)), MCAcolumns) plt.ylabel('Percentage') plt.title('Explained Variance by Factor (%): Multiple Correspondence Analysis') fig.savefig(''.join([
row = [ os.path.basename(x) for x in np.array(data.loc[:, 0:0].values[1:]).flatten() ] bod = data.loc[1:, 1:].astype(float).values adata = pandas.DataFrame(data=bod, index=row, columns=col) return adata else: adata = pandas.read_table(csvFile, skiprows=0, index_col=0, sep=separator) return adata df = DataFrame(csvFile) mca_ben = mca.MCA(df, ncols=df.shape[1], benzecri=False) result_row = pandas.DataFrame(mca_ben.fs_r(N=2)) result_row.index = list(df.index) result_row.to_csv(sys.stdout, sep='\t', encoding='utf-8', header=False) print('') result_col = pandas.DataFrame(mca_ben.fs_c(N=2)) result_col.index = list(df.columns) result_col.to_csv(sys.stdout, sep='\t', encoding='utf-8', header=False)
# Hyper-parameter grid handed to tuning_RDF (defined outside this fragment).
parameters = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 5, 10, 12, 18]
}
feature_importance_data_frame = tuning_RDF("tuning_accuracy", "accuracy",
                                           parameters, "Prétraitement_basique",
                                           "Prediction_label_unique")
import mca
# MCA on the selected columns, with the library's default `benzecri` setting.
mca_ben = mca.MCA(df[[
    "CODE", "CODE_PARENT", "DIAMETREARBREAUNMETRE", "ESPECE",
    "FREQUENTATIONCIBLE", "GENRE_BOTA", "NOTEDIAGNOSTIC",
    "PRIORITEDERENOUVELLEMENT", "SOUS_CATEGORIE", "SOUS_CATEGORIE_DESC",
    "STADEDEDEVELOPPEMENT", "STADEDEVELOPPEMENTDIAG", "TROTTOIR", "VIGUEUR",
    "DEFAUT"
]])
# Same column selection again, this time with benzecri=False.
mca_ind = mca.MCA(df[[
    "CODE", "CODE_PARENT", "DIAMETREARBREAUNMETRE", "ESPECE",
    "FREQUENTATIONCIBLE", "GENRE_BOTA", "NOTEDIAGNOSTIC",
    "PRIORITEDERENOUVELLEMENT", "SOUS_CATEGORIE", "SOUS_CATEGORIE_DESC",
    "STADEDEDEVELOPPEMENT", "STADEDEVELOPPEMENTDIAG", "TROTTOIR", "VIGUEUR",
    "DEFAUT"
]], benzecri=False)
# The result of this call is not stored (it only shows in an interactive
# session).
mca_ind.expl_var(greenacre=False)
#### MULTI-LABEL PREDICTION ####
## First predictions with simple preprocessing ##
# Dummy-code each column separately (values cast to str first), prefixing the
# dummy columns with the source column name. X_trump, X_mnuchin and X_pelosi
# are built before this fragment.
X_mcconnell = pd.get_dummies(X['mcconnell'].astype(str), prefix='mcconnell')
X_cuomo = pd.get_dummies(X['cuomo'].astype(str), prefix='cuomo')
# Variable is spelled X_newson while the column and prefix are 'newsom'.
X_newson = pd.get_dummies(X['newsom'].astype(str), prefix='newsom')
X_biden = pd.get_dummies(X['biden'].astype(str), prefix='biden')
X_pence = pd.get_dummies(X['pence'].astype(str), prefix='pence')
X_cdc = pd.get_dummies(X['cdc'].astype(str), prefix='cdc')
# Replace X with the column-wise concatenation of the nine dummy tables.
X = pd.concat([
    X_trump, X_mnuchin, X_pelosi, X_mcconnell, X_cuomo, X_newson, X_biden,
    X_pence, X_cdc
], axis=1, sort=False)
# ncols is the number of dummy columns of the concatenated table.
# NOTE(review): confirm the library does not expect the number of original
# variables (nine tables are concatenated above) here instead.
ncols = len(X.columns)
mca_X = mca.MCA(X, ncols=ncols)
print(mca_X.L)
print(sum(mca_X.L))
# Integers 1..100: the number of factors N to evaluate.
N_eig_all = np.linspace(1, 100, 100, dtype=int)
Expl_var_bn = []
Expl_var_bnga = []
# For each N, sum the explained variance of the first N factors, once with
# greenacre=False and once with greenacre=True.
for N_eig in N_eig_all:
    Expl_var_bn.append(np.sum(mca_X.expl_var(greenacre=False, N=N_eig)))
    Expl_var_bnga.append(np.sum(mca_X.expl_var(greenacre=True, N=N_eig)))
sns.set()
plt.figure(figsize=(8, 5))
# Curve of the greenacre=False sums.
plt.plot(N_eig_all, Expl_var_bn, label='Benzecri correction')
import numpy as np
import pandas as pd
import mca

df = pd.read_csv('data/datalab_persona_run1_with_scale_cat.csv')
# Two-column target: the 'FKSmoker' values and -(value - 1), i.e. 1 - value.
# Presumably FKSmoker is 0/1 so the second column is its complement --
# TODO confirm.
target = df['FKSmoker'].values
target = np.array([target, -(target-1)]).T
# Remove the target column from the feature table.
df.drop(['FKSmoker'], inplace=True, axis=1)
# Every column except these five is dummy-coded; the five listed columns are
# left as they are.
cols = [x for x in df.columns.values if x not in ['Age Next at DOC', 'Height', 'Weight', 'Annual Salary', 'Travel %']]
df = pd.get_dummies(df, columns=cols)
X = df.values
# ncols is the column count AFTER dummy coding (dummy columns plus the five
# untouched ones).
# NOTE(review): confirm this is the count the library expects.
ncols = len(df.columns.values)
# Same data fitted twice: library-default benzecri, then benzecri=False.
mca_ben = mca.MCA(X, ncols=ncols)
mca_ind = mca.MCA(X, ncols=ncols, benzecri=False)
# Builds len(df) rows of len(df.columns) entries; every entry is
# str(one_answer) -- the loop indices i and j are not used in the value.
# `records` is not read again within this fragment.
records = []
for i in range(0, len(df)):
    records.append([str(one_answer) for j in range(0, len(df.columns))])
#apply apriori
# The same `apriori` name is called in two ways: once unpacked into
# (itemsets, rules), once kept as a single result.
# NOTE(review): confirm which apriori implementation is imported.
itemsets, rules = apriori(one_answer, min_support=0.2, min_confidence=1)
association_rules = apriori(one_answer, min_support=0.045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)
# NOTE(review): the loop iterates `association_rules` again after it was
# consumed by list() above; if apriori returns a one-shot iterator this loop
# sees nothing -- `association_results` may have been intended. Confirm.
for item in association_rules:
    # first index of the inner list
    # Contains base item and add item
    pair = item[0]
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])
    #second index of the inner list
    print("Support: " + str(item[1]))
    #third index of the list located at 0th
    #of the third index of the inner list
    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")
#Multiple Correspondence Analysis
import mca
# Same table fitted twice: library-default benzecri, then benzecri=False.
# ncols is the number of columns of one_answer.
mca_ben = mca.MCA(one_answer, ncols=len(one_answer.columns))
mca_ind = mca.MCA(one_answer, ncols=len(one_answer.columns), benzecri=False)
def apply_mca_df_patient_time(list_df_, index_period, nb_factors=10,
                              benzecri=False):
    """
    Fit an MCA on one period and project the patients of the other periods.

    The table at position ``index_period`` of ``list_df_`` is used to fit
    the MCA; its patients are the active rows. The patients of every other
    table are projected as supplementary rows. All coordinates go into one
    table (factors as rows, patients as columns), which is finally cut into
    five consecutive slices of ``nb_patients`` columns each.

    Sign convention, as implemented: supplementary coordinates are stored
    negated and every slice is negated again on output. The returned active
    patients therefore carry the opposite sign of ``fs_r`` and the
    supplementary ones the sign returned by ``fs_r_sup``.

    :param list_df_: list of disjunctive tables, one per period; it is
        deep-copied and never modified
    :param index_period: position in ``list_df_`` of the period used for the
        fit (the original notes say an int between 0 and 4)
    :param nb_factors: number of factors to keep
    :param benzecri: forwarded to ``mca.MCA``
    :returns: list of five DataFrames, rounded to 2 decimals
    """
    # Work on a copy: pop() below must not shorten the caller's list.
    list_df = deepcopy(list_df_)
    df = list_df.pop(index_period)

    # K = number of original variables (distinct level-0 column labels).
    ncols = len(df.columns.get_level_values(0).unique())
    mca_ben = mca.MCA(df, ncols=ncols, benzecri=benzecri)

    fs = 'Factor'
    table_patients_mca = pd.DataFrame(
        columns=df.index,
        index=pd.MultiIndex.from_product([[fs], range(1, nb_factors + 1)]))
    nb_patients = df.shape[0]

    # Active patients: coordinates on the first nb_factors factors.
    table_patients_mca.loc[fs, :] = mca_ben.fs_r(N=nb_factors).T

    if index_period == 0:
        # The fit period is the first one: remaining tables are periods
        # 1, 2, ... and each patient is projected individually.
        for t, df_ in enumerate(list_df):
            for i in df_.index:
                # The index label i is used as a row position here.
                temp_array = np.array(df_.iloc[i])
                table_patients_mca.loc[
                    fs, str(i) + '_t' + str(t + 1)] = -mca_ben.fs_r_sup(
                        pd.DataFrame([temp_array]), N=nb_factors)[0]
    else:
        # Tag the active patients with the fit period.
        table_patients_mca.columns = [
            str(name) + '_t' + str(index_period)
            for name in table_patients_mca.columns
        ]
        for t, df_ in enumerate(list_df):
            if len(df_.index) == 0:
                continue  # nothing to project for this period
            # Project the whole period once; this used to be recomputed
            # for every single patient inside the inner loop.
            sup_scores = mca_ben.fs_r_sup(df_, N=nb_factors)
            # Patients of the first remaining period get no suffix.
            suffix = '' if t == 0 else '_t' + str(t)
            for i in df_.index:
                table_patients_mca.loc[fs, str(i) + suffix] = -sup_scores[i]

    # Original author's note: the periods are not in the right time order.
    # The selection below re-applies the existing column order.
    cols = table_patients_mca.columns
    columns_patients_period = cols[:nb_patients]
    columns_other_periods = cols[nb_patients:]
    new_cols = (columns_patients_period.to_list() +
                columns_other_periods.to_list())
    table_patients_mca = table_patients_mca[new_cols]
    table_patients_mca = np.round(table_patients_mca.astype(float), 2)

    # Split into 5 tables, one per block of nb_patients columns.
    # NOTE(review): 5 is hard-coded; confirm it always equals len(list_df_).
    list_tables_patients = []
    for k in range(5):
        temp_df = table_patients_mca.iloc[:, k * nb_patients:(k + 1) *
                                          nb_patients]
        list_tables_patients.append(-temp_df)
    return list_tables_patients
# ---- # loading csv for 布置図 df = pd.read_csv(filePath, index_col=0, header=0) data = pd.read_csv(filePath, index_col=0) # loading sample score csv for cluster #df = pd.read_csv(scorefilePath,index_col=0) # ---- # category name rlabels = df.index # sample naem clabels = df.columns # mca model MCAmodel = mca.MCA(data, benzecri=False, TOL=1e-8) # ---- # row score (category) rows = pd.DataFrame(MCAmodel.fs_r(N=3)) print("カテゴリスコア:\n") print(rows) # columns score (sample) cols = pd.DataFrame(MCAmodel.fs_c(N=3)) print("サンプルスコア:\n") print(cols) print("----\n") # ---- # ----
# Notebook export: the bare expressions below only display a value when run
# in an interactive cell; in a plain script they have no visible effect.
df_dummies.shape
df_dummies.head(3)

# In[33]:

df_dummies

# In[38]:

# Fit the MCA on the dummy-coded table with the Benzecri correction enabled.
mca_ind = mca.MCA(df_dummies, benzecri=True)
mca_ind

# In[39]:

# Number of entries in the model's L attribute.
len(mca_ind.L)
# One factor per level
inertias=mca_ind.L
# Eigenvalues / principal inertias of each of the factors
inertias
# Factors of each observation

# In[40]:
1, 1, ] } index = ('Black Oil Sunflower', 'Striped Sunflower', 'Hulled Sunflower', 'Millet White/Red', 'Milo Seed', 'Nyjer Seed (Thistle)', 'Shelled Peanuts', 'Safflower Seed', 'Corn Products') data = pd.DataFrame(data=data, index=index) #EXPLAINED VARIANCE DOES NOT SUM TO ONE #data = data.transpose() #print ("dummies\n") #print (pd.get_dummies(data, columns = list(data))) data_dummies = pd.get_dummies(data, columns=list(data)) mca_ben = mca.MCA(data_dummies, ncols=len(data_dummies.keys())) #print (mca_ben.fs_r(1)) #print (np.cumsum(mca_ben.expl_var())) #print (mca_ben.L) #print (mca_ben.inertia) print(len(mca_ben.fs_r(1)[1])) plt.figure() i = 0 for name in list(data.index): plt.text(mca_ben.fs_r(1)[0][i], mca_ben.fs_r(1)[1][i], name) i += 1 plt.scatter(mca_ben.fs_r(1)[0], mca_ben.fs_r(1)[1])
import mca
import pandas as pd
import numpy as np
import sys
import os

# Make the parent directory importable so that `preprocessing` resolves.
sys.path.append(os.path.abspath('..'))
from preprocessing import shroom_dealer

df = shroom_dealer.get_data_frame()

# MCA on four categorical columns, dummy-coded by the library via `cols`,
# with the library's default `benzecri` setting.
# Fixed: the call used `mca.mca(...)`; the class used everywhere else is
# `mca.MCA`.
mca_ben = mca.MCA(df,
                  cols=["gill-color", "stalk-surface-above-ring",
                        "ring-type", "spore-print-color"],
                  ncols=5)
# Same analysis with benzecri=False.
# Fixed: a comma was missing between `ncols=5` and `benzecri=False`, which
# made the whole file a SyntaxError.
# NOTE(review): ncols=5 is passed while four columns are listed -- confirm.
mca_ind = mca.MCA(df,
                  cols=["gill-color", "stalk-surface-above-ring",
                        "ring-type", "spore-print-color"],
                  ncols=5,
                  benzecri=False)