Example #1
def apply_mca_df_modalities_contributions(df, nb_factors):
    """ This function calculates the contributions of the modalities for the factors of the mca. If the value of the
    projection is high, this means the modality has a high contribution to this axis.

    """

    # table_modalities_mca is the dataframe where the categories are projected onto the factor
    # the benzecri coeff is used

    ncols = len(
        df.columns.get_level_values(0).unique()
    )  # number of variables in data_mca (here 46), maybe there is a faster way to calculate it
    # benzecri correction is applied, the eigenvalues below 1/K are dropped
    # (where K is the number of variables, here 46 (sexe, age, ...)).
    # A coefficient with a factor K/(K-1) is also applied to the remaining variables.
    mca_ben = mca.MCA(df, ncols=ncols, benzecri=True)

    table_modalities_mca_contribution = pd.DataFrame(
        columns=df.columns,
        index=pd.MultiIndex.from_product([['contributions'],
                                          range(1, nb_factors + 1)]))
    # print(table_modalities_mca.shape, mca_ben.fs_c(N=20).T.shape, mca_ben.fs_c(N=20).T)
    table_modalities_mca_contribution.loc['contributions', :] = mca_ben.cont_c(
        N=nb_factors).T * 1000  # contribution of each modality to each factor, scaled to per mille

    table_modalities_mca_contribution = np.round(
        table_modalities_mca_contribution.astype(float), 1)

    return table_modalities_mca_contribution
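# Usage sketch (not part of the original snippet): `df_disj` is assumed to be a disjunctive
# (one-hot encoded) DataFrame with a (variable, modality) column MultiIndex, as expected above.
contributions = apply_mca_df_modalities_contributions(df_disj, nb_factors=10)
print(contributions.loc[('contributions', 1)].sort_values(ascending=False).head())  # top contributors to factor 1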
def run_mca(X):
    '''
    Perform Multiple Correspondence Analysis (MCA) on input array.

    :param X: array for which MCA is to be performed
    :returns: MCA instance and transformed array
    '''
    mca_ben = mca.MCA(X)
    X = mca_ben.fs_r(1)
    return mca_ben, X
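# Usage sketch (assumption: `X_disj` is a disjunctive / one-hot encoded table, as mca.MCA expects).
mca_model, X_reduced = run_mca(X_disj)
print(X_reduced.shape)  # observations x retained factors
print(mca_model.L)      # eigenvalues (principal inertias) of the factors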
Example #3
def apply_mca(df, benzecri):
    """
    This function creates the object MCA : it applies a multiple analysis components on a disjunctive array
    A MCA will try to create new features by combining the former ones in order to have the fewer new features keeping the maximum information (test chi2)
    The correction of benzecri is
    """
    # number of variables in data_mca (here 46), maybe there is a faster way to calculate it
    ncols = len(df.columns.get_level_values(0).unique())
    mca_ = mca.MCA(df, ncols=ncols, benzecri=benzecri)
    return mca_
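# Usage sketch (assumption: `df_disj` is the disjunctive table described in the docstring).
mca_model = apply_mca(df_disj, benzecri=True)
print(mca_model.L)                                # corrected eigenvalues
print(mca_model.expl_var(greenacre=False, N=10))  # share of inertia explained by the first factors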
def get_data(df, label, processing='standardization'):

    X = df.astype('int64')
    X_continue = X.drop(df.columns[9:], axis=1)   # continuous features: the first 9 columns
    X_discret = X.drop(df.columns[0:9], axis=1)   # categorical features: the remaining columns
    #X = np.array(df)
    label = label.values
    if processing == 'scaler':
        X_continue = preprocessing.MinMaxScaler().fit_transform(X_continue)
    elif processing == 'standardization':
        X_continue = preprocessing.StandardScaler().fit_transform(X_continue)
    mca_counts = mca.MCA(X_discret)
    X_discret = mca_counts.fs_r_sup(X_discret, 18)  # project the categorical part onto 18 MCA factors
    data = np.append(np.concatenate((X_continue, X_discret), axis=1), label[:, None], axis=1)
    return data
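# Usage sketch (assumptions: `df` has its first 9 columns continuous and the rest categorical,
# and `label` is a pandas Series, as get_data expects).
data = get_data(df, label, processing='standardization')
print(data.shape)  # scaled continuous columns + 18 MCA factors + the label as the last column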
Example #5
def apply_mca_df_modalities(df, df_label_disj_, nb_factors):
    """
    This function calculates the projection of the modalities onto the factors of the mca. If the value of the
    projection is high, this means the modality has a high contribution to this axis.

    """

    # table_modalities_mca is the dataframe where the categories are projected onto the factor
    # the benzecri coeff is used

    ncols = len(
        df.columns.get_level_values(0).unique()
    )  # number of variables in data_mca (here 46), maybe there is a faster way to calculate it
    # benzecri correction is applied, the eigenvalues below 1/K are dropped (where K is the number of variables,
    # here 46 (sexe, age, ...)). A coefficient with a factor K/(K-1) is also applied to the remaining variables.
    mca_ben = mca.MCA(df, ncols=ncols, benzecri=True)

    fs = 'Factor'

    table_modalities_mca = pd.DataFrame(columns=df.columns,
                                        index=pd.MultiIndex.from_product(
                                            [[fs],
                                             range(1, nb_factors + 1)]))
    # print(table_modalities_mca.shape, mca_ben.fs_c(N=20).T.shape, mca_ben.fs_c(N=20).T)
    table_modalities_mca.loc[fs, :] = mca_ben.fs_c(
        N=nb_factors).T  # keep the first nb_factors factors
    # projection of the modalities of the label ('a risque', etc) onto the factors
    fs_c_sup = mca_ben.fs_c_sup(df_label_disj_, N=nb_factors)

    table_modalities_mca.loc[fs, ('label', 'pas de risque')] = fs_c_sup[0]
    table_modalities_mca.loc[fs, ('label', 'a risque')] = fs_c_sup[1]
    table_modalities_mca.loc[fs, ('label', 'psychose')] = fs_c_sup[2]

    table_modalities_mca = np.round(table_modalities_mca.astype(float), 2)

    return table_modalities_mca
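# Usage sketch (assumptions: `df_disj` is the disjunctive table and `df_label_disj` the
# one-hot encoded label with the three modalities referenced above).
table_modalities = apply_mca_df_modalities(df_disj, df_label_disj, nb_factors=10)
print(table_modalities.loc[('Factor', 1)].sort_values().head())  # modalities at the negative end of factor 1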
Example #6
clusters10 = kmedoids_instance10.get_clusters()
medoids10 = kmedoids_instance10.get_medoids()

kmedoids_instance5 = kmedoids(dist5,
                              initial_medoids,
                              data_type='distance_matrix')
kmedoids_instance5.process()
clusters5 = kmedoids_instance5.get_clusters()
medoids5 = kmedoids_instance5.get_medoids()

# NEW CODE - CHANGED 13/11/2019
##############################################################################
## Reduce dimensions using MCA algorithm
dum = pd.get_dummies(exp_rest['categories'])
num_col = len(dum.columns)
mca_ben = mca.MCA(dum, ncols=num_col)
teste = mca_ben.fs_r()        # row factor scores
factor = mca_ben.fs_r(N=2).T  # keep the first two factors
mca_ben.L                     # eigenvalues (principal inertias)

factort = factor.T
exp_rest['Fac1'] = factort[:, 0].tolist()
exp_rest['Fac2'] = factort[:, 1].tolist()

mca_prince = prince.MCA(n_components=2,
                        n_iter=3,
                        copy=True,
                        check_input=True,
                        engine='auto',
                        random_state=42)  # distinct name so the imported `mca` module is not shadowed
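# Fitting sketch (assumption: prince exposes a scikit-learn style fit/transform API; the
# 'categories' column comes from the snippet above).
mca_prince = mca_prince.fit(exp_rest[['categories']])
coords = mca_prince.transform(exp_rest[['categories']])  # per-row coordinates on the 2 components
print(coords.head())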
def CA(dataframe):

    df_dummies = pd.get_dummies(dataframe)

    import mca
    mca_ben = mca.MCA(df_dummies, ncols=2)
    fs = mca_ben.fs_c(N=2).T  # fs: Factor score

    plotdata = pd.DataFrame({
        'Factor1': fs[0],
        'Factor2': fs[1],
        'levelnames': df_dummies.columns
    })

    plotdata.insert(3, 'Variable', np.empty(len(plotdata)))
    plotdata.insert(4, 'hue', np.empty(len(plotdata)))
    plotdata.insert(5, 'SKU(name)', np.empty(len(plotdata)))
    # alternatively: plotdata_new = pd.DataFrame({'Factor1': plotdata.Factor1, 'Factor2': plotdata.Factor2, 'levelnames': plotdata.levelnames, 'Variable': np.empty(len(plotdata))})

    k = 0
    for index, row in plotdata.iterrows():
        plotdata.loc[k, ['Variable']] = row['levelnames'].split('_')[0]
        plotdata.loc[k, ['hue']] = row['levelnames'].split('_')[1]
        plotdata.loc[k, ['SKU(name)']] = row['levelnames'].split(
            '_')[1] + '_' + row['levelnames'].split('_')[2]
        k = k + 1

    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.set(color_codes=True)

    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    small_name_num = (plotdata['hue'].groupby(
        plotdata['hue']).count().size) - 2

    ii = 65
    list_letter = []
    for num in range(0, small_name_num):
        list_letter = list_letter + [chr(ii)]
        ii = ii + 1

    list_letter = ['F', 'M'] + list_letter
    smallname_to_letter = dict(
        zip(plotdata['hue'].groupby(plotdata['hue']).count().index,
            list_letter))

    plotdata['letter'] = plotdata['hue'].map(smallname_to_letter)

    plotdata.insert(7, 'SKU(letter)', np.empty(len(plotdata)))

    k = 0
    for index, row in plotdata.iterrows():
        plotdata.loc[k, ['SKU(letter)']] = row['letter'] + '_' + row[
            'levelnames'].split('_')[2]
        k = k + 1

    sns.lmplot(x="Factor1",
               y="Factor2",
               hue="hue",
               data=plotdata,
               fit_reg=False,
               markers=["^", "^"] + ["o"] * small_name_num,
               palette="Set1")
    labels = plotdata['SKU(letter)']
    for label, x, y in zip(labels, plotdata.Factor1, plotdata.Factor2):
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(-5, 5),
                     textcoords='offset points',
                     ha='right',
                     va='bottom',
                     bbox=dict(boxstyle='round,pad=0.5',
                               fc='yellow',
                               alpha=0.5),
                     fontsize=5,
                     arrowprops=dict(arrowstyle='->',
                                     connectionstyle='arc3,rad=0'))
    plt.show()
    return plotdata
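# Usage sketch (assumption: `df_raw` is a DataFrame of categorical columns whose dummy-encoded
# column names split into at least three '_'-separated parts, as the label-splitting code above relies on).
plotdata = CA(df_raw)
print(plotdata.head())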
Example #8
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of enterprises in the largest cluster (%)')
plt.title('Percentage of enterprises in the largest cluster')
plt.show()
fig.savefig(''.join([
    'C:/Users/Jairo F Gudiño R/Desktop/Balance Sheet Commonality/',
    'LargestCluster', '.pdf'
]))
# Gamma Value: 0.49999999999999989

# Additional Analysis #
# MCA & PCA Analysis #
F = 20
x_dummy = mca.dummy(df_norm.iloc[:, [-3, -2, -1]])
mca_ben = mca.MCA(x_dummy, ncols=3)
explained_variance = mca_ben.expl_var(greenacre=False, N=F) * 100
explained_variance.sum()

# MCA Explained Variance #
MCAcolumns = [("F" + str(i + 1)) for i in range(F)]
fig, Graph = plt.subplots()
Graph = plt.bar(np.arange(len(MCAcolumns)),
                explained_variance,
                align='center',
                alpha=0.5)
plt.xticks(np.arange(len(MCAcolumns)), MCAcolumns)
plt.ylabel('Percentage')
plt.title('Explained Variance by Factor (%): Multiple Correspondence Analysis')
plt.show()
fig.savefig(''.join([
        row = [
            os.path.basename(x)
            for x in np.array(data.loc[:, 0:0].values[1:]).flatten()
        ]
        bod = data.loc[1:, 1:].astype(float).values
        adata = pandas.DataFrame(data=bod, index=row, columns=col)
        return adata
    else:
        adata = pandas.read_table(csvFile,
                                  skiprows=0,
                                  index_col=0,
                                  sep=separator)
        return adata


df = DataFrame(csvFile)

mca_ben = mca.MCA(df, ncols=df.shape[1], benzecri=False)

result_row = pandas.DataFrame(mca_ben.fs_r(N=2))
result_row.index = list(df.index)

result_row.to_csv(sys.stdout, sep='\t', encoding='utf-8', header=False)

print('')

result_col = pandas.DataFrame(mca_ben.fs_c(N=2))
result_col.index = list(df.columns)

result_col.to_csv(sys.stdout, sep='\t', encoding='utf-8', header=False)
Example #10
parameters = {
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 5, 10, 12, 18]
}
feature_importance_data_frame = tuning_RDF("tuning_accuracy", "accuracy",
                                           parameters, "Prétraitement_basique",
                                           "Prediction_label_unique")

import mca

cols_mca = [
    "CODE", "CODE_PARENT", "DIAMETREARBREAUNMETRE", "ESPECE",
    "FREQUENTATIONCIBLE", "GENRE_BOTA", "NOTEDIAGNOSTIC",
    "PRIORITEDERENOUVELLEMENT", "SOUS_CATEGORIE", "SOUS_CATEGORIE_DESC",
    "STADEDEDEVELOPPEMENT", "STADEDEVELOPPEMENTDIAG", "TROTTOIR", "VIGUEUR",
    "DEFAUT"
]
mca_ben = mca.MCA(df[cols_mca])                   # with the Benzécri correction (default)
mca_ind = mca.MCA(df[cols_mca], benzecri=False)   # indicator-matrix MCA, no correction
mca_ind.expl_var(greenacre=False)
#### MULTI-LABEL PREDICTION ####

## First predictions with simple preprocessing ##
X_mcconnell = pd.get_dummies(X['mcconnell'].astype(str), prefix='mcconnell')
X_cuomo = pd.get_dummies(X['cuomo'].astype(str), prefix='cuomo')
X_newson = pd.get_dummies(X['newsom'].astype(str), prefix='newsom')
X_biden = pd.get_dummies(X['biden'].astype(str), prefix='biden')
X_pence = pd.get_dummies(X['pence'].astype(str), prefix='pence')
X_cdc = pd.get_dummies(X['cdc'].astype(str), prefix='cdc')

X = pd.concat([
    X_trump, X_mnuchin, X_pelosi, X_mcconnell, X_cuomo, X_newson, X_biden,
    X_pence, X_cdc
],
              axis=1,
              sort=False)

ncols = len(X.columns)
mca_X = mca.MCA(X, ncols=ncols)

print(mca_X.L)
print(sum(mca_X.L))

N_eig_all = np.linspace(1, 100, 100, dtype=int)

Expl_var_bn = []
Expl_var_bnga = []
for N_eig in N_eig_all:
    Expl_var_bn.append(np.sum(mca_X.expl_var(greenacre=False, N=N_eig)))
    Expl_var_bnga.append(np.sum(mca_X.expl_var(greenacre=True, N=N_eig)))

sns.set()
plt.figure(figsize=(8, 5))
plt.plot(N_eig_all, Expl_var_bn, label='Benzecri correction')
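# Likely continuation (sketch): plot the Greenacre series computed above alongside the
# Benzécri one, then label the figure.
plt.plot(N_eig_all, Expl_var_bnga, label='Greenacre correction')
plt.xlabel('Number of factors N')
plt.ylabel('Cumulative explained inertia')
plt.legend()
plt.show()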
Example #12
import numpy as np
import pandas as pd
import mca

df = pd.read_csv('data/datalab_persona_run1_with_scale_cat.csv')

target = df['FKSmoker'].values

target = np.array([target, -(target-1)]).T

df.drop(['FKSmoker'], inplace=True, axis=1)

cols = [x for x in df.columns.values if
        x not in ['Age Next at DOC', 'Height', 'Weight', 'Annual Salary', 'Travel %']]

df = pd.get_dummies(df, columns=cols)

X = df.values
ncols = len(df.columns.values)

mca_ben = mca.MCA(X, ncols=ncols)
mca_ind = mca.MCA(X, ncols=ncols, benzecri=False)
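# Comparison sketch (not in the original snippet): inertia explained by the first 10 factors
# with the Benzécri correction vs. the plain indicator-matrix MCA.
print(mca_ben.expl_var(greenacre=False, N=10).sum())
print(mca_ind.expl_var(greenacre=False, N=10).sum())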
Example #13
records = []
for i in range(0, len(df)):
    # one transaction per row: every cell value converted to a string
    records.append([str(df.values[i, j]) for j in range(0, len(df.columns))])
#apply apriori
itemsets, rules = apriori(one_answer, min_support=0.2, min_confidence=1)

association_rules = apriori(records, min_support=0.045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)
for item in association_results:

    # first index of the inner list
    # Contains base item and add item
    pair = item[0] 
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])

    #second index of the inner list
    print("Support: " + str(item[1]))

    #third index of the list located at 0th
    #of the third index of the inner list

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================")

#Multiple Correspondence Analysis
import mca
mca_ben = mca.MCA(one_answer, ncols=len(one_answer.columns))
mca_ind = mca.MCA(one_answer, ncols=len(one_answer.columns), benzecri=False) 
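# Sketch (not from the original snippet): respondent coordinates on the first two factors,
# with and without the Benzécri correction, reusing the objects created above.
print(mca_ben.fs_r(N=2)[:5])
print(mca_ind.fs_r(N=2)[:5])
print(mca_ben.expl_var(greenacre=True, N=2))  # Greenacre-adjusted share of explained inertia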
Example #14
def apply_mca_df_patient_time(list_df_,
                              index_period,
                              nb_factors=10,
                              benzecri=False):
    """ This function takes a list of df (disjunctive arrays), the index period (int between 0 and 4) and the nb of factors.

    It will apply the mca without the benzecri coeff

    It returns
    """
    list_df = deepcopy(
        list_df_)  # because the list is poped so it avoids an empty list
    df = list_df.pop(index_period)
    # number of variables in data_mca (here 46), maybe there is a faster way to calculate it
    ncols = len(df.columns.get_level_values(0).unique())
    mca_ben = mca.MCA(df, ncols=ncols,
                      benzecri=benzecri)  # benzecri correction can be applied
    fs = 'Factor'  # label used for the factor rows
    table_patients_mca = pd.DataFrame(columns=df.index,
                                      index=pd.MultiIndex.from_product(
                                          [[fs],
                                           range(1, nb_factors + 1)]))
    nb_patients = df.shape[0]
    table_patients_mca.loc[fs, :] = mca_ben.fs_r(
        N=nb_factors).T  # add the first nb_factors factors to table_patients_mca
    # note: a sign inversion is applied below to the supplementary projections

    if index_period == 0:
        for t, df_ in enumerate(list_df):
            for i in df_.index:
                temp_array = np.array(df_.iloc[i])
                # print(mca_ben.fs_r_sup(pd.DataFrame([temp_array]), N=nb_factors))

                table_patients_mca.loc[fs, str(i) + '_t' +
                                       str(t + 1)] = -mca_ben.fs_r_sup(
                                           pd.DataFrame([temp_array]),
                                           N=nb_factors)[0]

    if index_period != 0:
        # rename the columns
        table_patients_mca.columns = [
            str(table_patients_mca.columns[i]) + '_t' + str(index_period)
            for i in range(table_patients_mca.shape[1])
        ]  # rename the columns
        for t, df_ in enumerate(list_df):
            for i in df_.index:
                if t == 0:  # for the first period, the patient name has no suffix
                    table_patients_mca.loc[fs, str(i)] = -mca_ben.fs_r_sup(
                        df_, N=nb_factors)[i]
                elif t != 0:
                    table_patients_mca.loc[fs, str(i) + '_t' +
                                           str(t)] = -mca_ben.fs_r_sup(
                                               df_, N=nb_factors)[i]

        # the columns are not yet in chronological order; reorder them below
        cols = table_patients_mca.columns

        columns_patients_period = cols[:nb_patients]
        columns_other_periods = cols[nb_patients:]
        new_cols = columns_patients_period.to_list() + columns_other_periods.to_list()
        table_patients_mca = table_patients_mca[new_cols]

    table_patients_mca = np.round(table_patients_mca.astype(float), 2)

    # split into five DataFrames, one per period
    list_tables_patients = []
    for k in range(5):
        temp_df = table_patients_mca.iloc[:, k * nb_patients:(k + 1) *
                                          nb_patients]
        list_tables_patients.append(-temp_df)

    return list_tables_patients
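# Usage sketch (assumption: `list_df_disj` is the project's list of five per-period disjunctive DataFrames).
tables_per_period = apply_mca_df_patient_time(list_df_disj, index_period=0, nb_factors=10)
for k, table in enumerate(tables_per_period):
    print('period', k, table.shape)  # (nb_factors, number of patients) for each period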
Example #15
# ----
# loading csv for the layout plot (布置図)
df = pd.read_csv(filePath, index_col=0, header=0)
data = pd.read_csv(filePath, index_col=0)
# alternatively, load a sample-score csv for clustering:
#df = pd.read_csv(scorefilePath,index_col=0)

# ----

# category names (rows)
rlabels = df.index
# sample names (columns)
clabels = df.columns

# mca model
MCAmodel = mca.MCA(data, benzecri=False, TOL=1e-8)

# ----
# row score (category)
rows = pd.DataFrame(MCAmodel.fs_r(N=3))
print("カテゴリスコア:\n")
print(rows)

# columns score (sample)
cols = pd.DataFrame(MCAmodel.fs_c(N=3))
print("サンプルスコア:\n")
print(cols)
print("----\n")
# ----

# ----
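# Plotting sketch (not part of the original snippet): a simple map of the first two factors,
# reusing the `rows` (category) and `cols` (sample) score tables printed above.
import matplotlib.pyplot as plt
plt.scatter(rows[0], rows[1], marker='o', label='categories')
plt.scatter(cols[0], cols[1], marker='^', label='samples')
for label, x, y in zip(rlabels, rows[0], rows[1]):
    plt.annotate(label, xy=(x, y))
plt.legend()
plt.show()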
Example #16

df_dummies.shape
df_dummies.head(3)


# In[33]:


df_dummies


# In[38]:


mca_ind = mca.MCA(df_dummies, benzecri=True)
mca_ind


# In[39]:


len(mca_ind.L)  # one factor per categorical level
inertias = mca_ind.L  # eigenvalues / principal inertias of each factor
inertias


# Factors of each observation

# In[40]:
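
# Row factor scores, one row per observation and one column per factor
# (sketch, reusing the fs_r call from the other examples)
mca_ind.fs_r(N=2)[:10]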
Example #17
        1,
        1,
    ]
}
index = ('Black Oil Sunflower', 'Striped Sunflower', 'Hulled Sunflower',
         'Millet White/Red', 'Milo Seed', 'Nyjer Seed (Thistle)',
         'Shelled Peanuts', 'Safflower Seed', 'Corn Products')
data = pd.DataFrame(data=data, index=index)

#EXPLAINED VARIANCE DOES NOT SUM TO ONE
#data = data.transpose()
#print ("dummies\n")
#print (pd.get_dummies(data, columns = list(data)))
data_dummies = pd.get_dummies(data, columns=list(data))

mca_ben = mca.MCA(data_dummies, ncols=len(data_dummies.keys()))
#print (mca_ben.fs_r(1))
#print (np.cumsum(mca_ben.expl_var()))
#print (mca_ben.L)
#print (mca_ben.inertia)

print(len(mca_ben.fs_r(1)[1]))

plt.figure()
row_scores = mca_ben.fs_r(1)  # row factor scores: one row per seed type
for i, name in enumerate(list(data.index)):
    plt.text(row_scores[i][0], row_scores[i][1], name)
plt.scatter(row_scores[:, 0], row_scores[:, 1])
plt.show(block=False)
import mca
import pandas as pd
import numpy as np

import sys
import os
sys.path.append(os.path.abspath('..'))

from preprocessing import shroom_dealer

df = shroom_dealer.get_data_frame()

cols_of_interest = ["gill-color", "stalk-surface-above-ring", "ring-type", "spore-print-color"]
mca_ben = mca.MCA(df, cols=cols_of_interest, ncols=5)
mca_ind = mca.MCA(df, cols=cols_of_interest, ncols=5, benzecri=False)
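# Quick inspection sketch (not from the original script), using the same mca API as the examples above.
print(mca_ben.fs_r(N=2)[:5])                   # factor scores of the first five mushrooms
print(mca_ben.fs_c(N=2)[:5])                   # factor scores of the first five modalities
print(mca_ben.expl_var(greenacre=False, N=2))  # inertia explained by the first two factors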