Python FAMD Examples, prince.FAMD Python Examples

Example #1

0

Show file

File: FeatureEngineering.py Project: mathdoug/Project_ML2

    def dimension_reduction_famd(self, df):
        """Project ``df`` onto FAMD components, plot them and return the result.

        The estimator is created and fitted only when ``self.my_FAMD`` is
        ``None``; on later calls the stored estimator is reused as-is.  The
        ``"Y"`` column is left out of the projection and joined back onto the
        component coordinates afterwards.

        Args:
            df: DataFrame holding the feature columns plus a ``"Y"`` column.

        Returns:
            DataFrame with columns ``FAMD_1`` .. ``FAMD_<n>`` (``n`` being
            ``df.shape[1]``) followed by ``"Y"``.
        """
        # The component count is the full width of ``df``, "Y" included.
        width = df.shape[1]
        component_names = [
            "FAMD_" + str(position + 1) for position in range(width)
        ]

        if self.my_FAMD is not None:
            # An estimator already exists: just report what is about to be
            # transformed, no refit.
            print("NAOOOOOOOO")
            print(df.drop("Y", axis=1).shape)
            print(self.my_FAMD)
        else:
            self.my_FAMD = prince.FAMD(n_components=width,
                                       n_iter=10,
                                       copy=True,
                                       check_input=True,
                                       engine='auto',
                                       random_state=42)
            self.my_FAMD = self.my_FAMD.fit(df.drop("Y", axis=1))
            print(self.my_FAMD.explained_inertia_)

        projected = self.my_FAMD.transform(df.drop("Y", axis=1))
        projected.columns = component_names
        df = projected.join(df["Y"])

        print("Depois FAMD:")
        print(df.head())

        # Scatter of the 3rd against the 4th component, one colour per class.
        plt.figure(figsize=(12, 12))
        for class_value, colour in ((0, 'red'), (1, 'blue')):
            members = df[df["Y"] == class_value]
            plt.scatter(members["FAMD_3"],
                        members["FAMD_4"],
                        color=colour,
                        alpha=0.5,
                        label=str(class_value))
        plt.title("FAMD")
        plt.ylabel('Les coordonnees de Y')
        plt.xlabel('Les coordonnees de X')
        plt.legend()
        plt.show()

        # Distribution of the 3rd component, class 1 drawn first, then class 0.
        plt.figure(figsize=(12, 12))
        for class_value in (1, 0):
            sns.distplot(df[df["Y"] == class_value]["FAMD_3"])
        plt.show()

        return df

Example #2

0

Show file

def dimension_reduction(X, n_components):
    """Fit a FAMD model on ``X`` and return the transformed data.

    Args:
        X: Data handed directly to ``prince.FAMD(...).fit_transform``.
        n_components: Number of components requested from FAMD.

    Returns:
        The ``fit_transform`` result; its column count is printed first.
    """
    print("Applying FAMD...")
    reducer = prince.FAMD(n_components=n_components,
                          n_iter=3,
                          copy=True,
                          check_input=True,
                          engine='auto',
                          random_state=42)
    reduced = reducer.fit_transform(X)
    # Disabled plotting code, kept for reference:
    # ax = famd.plot_row_coordinates(X,ax=None,figsize=(6, 6),x_component=0,
    #     y_component=1,ellipse_outline=True,ellipse_fill=True,show_points=False)
    # ax.get_figure().savefig('famd_row_coordinates.svg')
    feature_count = len(reduced.columns)
    print("Number of features after dimension reduction: " +
          str(feature_count))
    return reduced

Example #3

0

Show file

def dim_reduct(df):
    """Reduce ``df`` to its FAMD row coordinates.

    A 10-component ``prince.FAMD`` model is fitted on ``df``, the sum of its
    ``explained_inertia_`` values is printed, and the row coordinates of
    ``df`` under that model are returned.

    Args:
        df: Input DataFrame.

    Returns:
        The result of ``row_coordinates(df)`` on the fitted model.
    """
    settings = dict(n_components=10,
                    n_iter=10,
                    copy=True,
                    engine='auto',
                    random_state=42)
    model = prince.FAMD(**settings).fit(df)
    total_inertia = sum(model.explained_inertia_)
    print(total_inertia)
    return model.row_coordinates(df)

Example #4

0

Show file

def getFAMDData(X, dataType):
    """Return the FAMD-transformed values for a data set.

    ``X`` is first passed through ``createFAMDDataSets``.  The FAMD model
    uses 6 components when ``dataType`` is ``'Adult'`` and 8 otherwise.

    Args:
        X: Raw input handed to ``createFAMDDataSets``.
        dataType: Data-set name; selects the component count.

    Returns:
        ``.values`` of the transformed data.
    """
    prepared = createFAMDDataSets(X, dataType)
    component_count = 6 if dataType == 'Adult' else 8

    famd = prince.FAMD(n_components=component_count,
                       n_iter=3,
                       copy=True,
                       check_input=True,
                       engine='auto',
                       random_state=42)
    famd.fit(prepared)
    return famd.transform(prepared).values

Example #5

0

Show file

def dim_reduct(df):
    """Apply FAMD dimensionality reduction to ``df``.

    Fits a ``prince.FAMD`` model with 10 components on ``df``, prints the
    total of ``explained_inertia_`` and returns the row coordinates.

    Args:
        df: Input DataFrame.

    Returns:
        Row coordinates of ``df`` as produced by the fitted model.
    """
    reducer = prince.FAMD(n_components=10,
                          n_iter=10,
                          copy=True,
                          engine='auto',
                          random_state=42)
    fitted = reducer.fit(df)
    print(sum(fitted.explained_inertia_))
    return fitted.row_coordinates(df)

Example #6

0

Show file

def FAMD(num_components):
    """Append FAMD coordinates to a filled copy of the module-level ``DATA``.

    Missing values in ``DATA`` are replaced by the string ``'None'``; the
    columns named in ``CATEGORICAL_COLS_GOWER + NUMERICAL_COLS_GOWER`` are
    fed to FAMD and the resulting coordinates are concatenated column-wise.

    Args:
        num_components: Number of FAMD components to compute.

    Returns:
        The filled frame with the coordinate columns appended; coordinate
        columns labelled ``0 .. num_components - 1`` are renamed to
        ``"Component 1"``, ``"Component 2"``, ...
    """
    model = prince.FAMD(n_components=num_components,
                        n_iter=3,
                        copy=True,
                        check_input=True,
                        engine='auto',
                        random_state=0)

    filled = DATA.fillna('None')
    selected = filled[CATEGORICAL_COLS_GOWER + NUMERICAL_COLS_GOWER]

    coordinates = model.fit(selected).row_coordinates(selected)

    # Map positional column labels to human-readable, 1-based names.
    new_names = {}
    for position in range(num_components):
        new_names[position] = "Component " + str(position + 1)
    coordinates = coordinates.rename(columns=new_names)

    return pd.concat([filled, coordinates], axis=1)

Example #7

0

Show file

File: Factor Analysis Of Facebook Live Data.py Project: nezihaksu/Projects

    def preprocessing_famd(self):
        """Clean ``self.df``, fit a 2-component FAMD and plot row coordinates.

        Columns that are entirely NaN and the ``status_id`` column are
        dropped in place, so the object stored in ``self.df`` is modified.
        The share of null cells is measured next, then ``status_published``
        becomes the index of the frame that is fitted and plotted.

        Returns:
            Tuple ``(rows, columns, percentage_empty)``: the shape of the
            indexed frame and the fraction (0-1) of null cells measured
            before indexing.
        """
        frame = self.df
        # Both drops operate on the very object held in self.df.
        frame.dropna(axis=1, how='all', inplace=True)
        frame.drop(['status_id'], axis=1, inplace=True)

        # Fraction of cells that are missing.
        null_mask = frame.isnull().values
        percentage_empty = float(np.count_nonzero(null_mask)) / float(
            null_mask.size)

        # Use the publication timestamp column as the index.
        frame = frame.set_index('status_published')

        model = prince.FAMD(n_components=2,
                            n_iter=3,
                            copy=True,
                            check_input=True,
                            engine='auto',
                            random_state=42)
        model = model.fit(frame)

        # Let pandas display every column.
        pd.set_option('display.max_columns', None)

        type_labels = [
            'status_type {}'.format(kind) for kind in frame['status_type']
        ]
        model.plot_row_coordinates(frame,
                                   ax=None,
                                   figsize=(3, 3),
                                   x_component=0,
                                   y_component=1,
                                   labels=frame.index,
                                   color_labels=type_labels,
                                   ellipse_outline=False,
                                   ellipse_fill=True,
                                   show_points=True)
        plt.show()

        row_count, column_count = frame.shape[0], frame.shape[1]
        return row_count, column_count, percentage_empty

Example #8

0

Show file

def chartFAMD(X, y, dataType):
    """Plot FAMD explained inertia for ``X`` and save the chart as a PNG.

    The model is fitted with ``X.shape[1] - 2`` components.  Per-component
    inertia is drawn on the left axis and its cumulative sum on a twin
    right axis.  The figure is written to ``'{path}{dataType} FAMD.png'``,
    where ``path`` is a name defined outside this function.

    Args:
        X: Data to fit.
        y: Not used by this function.
        dataType: Data-set name used in the plot title and the file name.
    """
    chart_title = '{0} FAMD'.format(dataType)
    component_axis = range(1, X.shape[1] - 1)

    famd = prince.FAMD(n_components=len(component_axis),
                       n_iter=3,
                       copy=True,
                       check_input=True,
                       engine='auto',
                       random_state=42)
    famd.fit(X)
    print(famd.row_coordinates(X))

    # The result is not used; the call is retained so the function performs
    # exactly the same work (and raises the same errors) as before.
    famd.transform(X)

    inertia = famd.explained_inertia_
    cumulative = np.cumsum(inertia)

    fig, ax = plt.subplots()
    plt.title('FAMD - {0} data set'.format(dataType))
    plt.legend(loc='best')

    ax.plot(component_axis, inertia, label='Variance')
    ax.set_ylabel('Eigenvalue')
    ax.set_xlabel('Components')

    twin = ax.twinx()
    twin.plot(component_axis,
              cumulative,
              linestyle='--',
              label='Cumulative Variance',
              color='orange')
    twin.set_ylabel('Explained Variance')

    fig.legend(loc="upper right",
               bbox_to_anchor=(1, 1),
               bbox_transform=ax.transAxes)
    fig.tight_layout()

    # plt.show()
    plt.savefig('{0}{1}.png'.format(path, chart_title))
    plt.clf()

Example #9

0

Show file

def Preprocess(data_frame, target=None, method='FAMD', samples=None,
               mapper=None, num_components=3, scaler=None):
    """Convert a mixed-type data frame into numeric vectors and one-hot targets.

    Args:
        data_frame: Source DataFrame containing features and the target
            column.  It is not modified by this function.
        target: Name of the target column.  When falsy, the last column of
            ``data_frame`` is used.
        method: ``'FAMD'``, ``'PCA'`` or ``'Dummy'``.
            * ``'FAMD'``: ``pr.FAMD`` row coordinates, L2-normalised per row.
            * ``'PCA'``: ``pr.PCA`` row coordinates of the dummy-encoded data.
            * ``'Dummy'``: dummy-encoded data passed through ``scaler``.
        samples: Optional DataFrame of row labels.  All columns but the last
            select the training rows, the last column selects the test rows
            (both via ``data_frame.loc``).  When ``None`` the whole frame is
            processed as a single "test" set.
        mapper: Optional pre-built FAMD/PCA estimator; created when falsy.
        num_components: Component count for a newly created mapper.
        scaler: Optional scaler for ``'Dummy'``; a ``StandardScaler`` is
            created when falsy.

    Returns:
        With ``samples``:
            ``(vecs_train, train_target, vecs_test, test_target, fitted, target)``
        Without ``samples``:
            ``(vecs_test, test_target, fitted, target)``
        where ``fitted`` is the mapper (FAMD/PCA) or the scaler (Dummy).
        For any other ``method`` the function falls through and returns
        ``None``.
    """
    # If no target is supplied, use the last column of the data frame.
    if not target:
        target = data_frame.columns.values.tolist()[-1]

    # The 'Dummy' method has a known limitation (see the note in its branch);
    # warn but carry on.
    if method == 'Dummy':
        print('Dummy is not functionning proberly.')

    if method == 'FAMD':

        if not mapper:
            # TODO: consider accepting **kwargs so callers can configure this.
            mapper = pr.FAMD(
                n_components=num_components,
                n_iter=100,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=None,
            )

        if samples is not None:
            # Split into training and testing rows by the labels in `samples`.
            train_data = data_frame.loc[samples.iloc[:, :-1].values.flatten(), :]
            test_data = data_frame.loc[samples.iloc[:, -1].values.flatten(), :]

            # One-hot encode the target column for each split.
            train_target = pd.get_dummies(train_data[target]).astype('float64')
            test_target = pd.get_dummies(test_data[target]).astype('float64')

            # Remove the target column from the feature frames.
            train_data = train_data.drop(columns=[target])
            test_data = test_data.drop(columns=[target])

            # Vectors for the training set.
            famd_train = mapper.fit(train_data)
            vecs_train = pd.DataFrame(famd_train.row_coordinates(train_data))

            # NOTE(review): the same mapper is refitted on the test rows, so
            # train and test vectors presumably come from different fits and
            # the returned mapper reflects the test fit. Kept as-is because
            # projecting test rows with the training fit may fail when the
            # category sets differ - confirm the intended behaviour.
            famd_test = mapper.fit(test_data)
            vecs_test = pd.DataFrame(famd_test.row_coordinates(test_data))

            # Normalise each row vector (l1, l2 or max norms are possible).
            vecs_train = pd.DataFrame(
                preprocessing.normalize(vecs_train, norm='l2', axis=1),
                columns=vecs_train.columns)
            vecs_test = pd.DataFrame(
                preprocessing.normalize(vecs_test, norm='l2', axis=1),
                columns=vecs_test.columns)

            # TODO: consider returning a dictionary; each case returns a
            # different number of values.
            return vecs_train, train_target, vecs_test, test_target, mapper, target

        # No samples supplied: process the entire data set as one.
        test_data = data_frame.copy()
        test_target = pd.get_dummies(test_data[target]).astype('float64')

        test_data = test_data.drop(columns=[target])
        famd_test = mapper.fit(test_data)
        vecs_test = pd.DataFrame(famd_test.row_coordinates(test_data))
        vecs_test = pd.DataFrame(
            preprocessing.normalize(vecs_test, norm='l2', axis=1),
            columns=vecs_test.columns)

        return vecs_test, test_target, mapper, target

    elif method == 'PCA':
        # PCA only works with numerical data, so non-numeric features are
        # dummy-encoded below.
        if not mapper:
            mapper = pr.PCA(
                n_components=num_components,
                n_iter=100,
                rescale_with_mean=True,
                rescale_with_std=True,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=None,
            )

        # Get all labels first.
        labels = pd.get_dummies(data_frame[target]).astype('float64')

        # Drop the target on a new frame (the caller's frame stays intact) and
        # turn every class of every feature into a 1./0. dummy feature.
        data_frame = pd.get_dummies(
            data_frame.drop(columns=[target])).astype('float64')

        if samples is not None:
            train_rows = samples.iloc[:, :-1].values.flatten()
            test_rows = samples.iloc[:, -1].values.flatten()

            train_data = data_frame.loc[train_rows, :]
            test_data = data_frame.loc[test_rows, :]

            train_target = labels.loc[train_rows, :]
            test_target = labels.loc[test_rows, :]

            pca_train = mapper.fit(train_data)
            vecs_train = pd.DataFrame(pca_train.row_coordinates(train_data))
            # NOTE(review): mapper is refitted on the test rows here as well;
            # see the FAMD branch.
            pca_test = mapper.fit(test_data)
            vecs_test = pd.DataFrame(pca_test.row_coordinates(test_data))

            return vecs_train, train_target, vecs_test, test_target, mapper, target

        test_data = data_frame.copy()
        test_target = labels

        pca_test = mapper.fit(test_data)
        vecs_test = pd.DataFrame(pca_test.row_coordinates(test_data))

        return vecs_test, test_target, mapper, target

    elif method == 'Dummy':

        labels = pd.get_dummies(data_frame[target]).astype('float64')
        # Non-mutating drop: the caller's frame keeps its target column.
        data_frame = pd.get_dummies(
            data_frame.drop(columns=[target])).astype('float64')

        if samples is not None:
            train_rows = samples.iloc[:, :-1].values.flatten()
            test_rows = samples.iloc[:, -1].values.flatten()

            train_data = data_frame.loc[train_rows, :]
            test_data = data_frame.loc[test_rows, :]

            train_target = labels.loc[train_rows, :]
            test_target = labels.loc[test_rows, :]

            if not scaler:
                scaler = preprocessing.StandardScaler()
                vecs_train = pd.DataFrame(scaler.fit_transform(train_data),
                                          columns=train_data.columns)
                vecs_test = pd.DataFrame(scaler.transform(test_data),
                                         columns=test_data.columns)
            else:
                # NOTE: needs fixing. A supplied scaler fitted on data of a
                # different dimensionality cannot be reused. If the test data
                # lacks some classes of a feature, or contains new ones, the
                # dummy frame has different dimensions. The same issue causes
                # problems with TensorFlow placeholders. On failure the scaler
                # is refitted on the training data.
                try:
                    vecs_train = pd.DataFrame(scaler.transform(train_data),
                                              columns=train_data.columns)
                    vecs_test = pd.DataFrame(scaler.transform(test_data),
                                             columns=test_data.columns)
                except Exception:
                    vecs_train = pd.DataFrame(scaler.fit_transform(train_data),
                                              columns=train_data.columns)
                    vecs_test = pd.DataFrame(scaler.transform(test_data),
                                             columns=test_data.columns)

            return vecs_train, train_target, vecs_test, test_target, scaler, target

        test_data = data_frame.copy()
        test_target = labels

        if not scaler:
            scaler = preprocessing.StandardScaler()
            # Only test_data exists in this branch, so use its columns.
            vecs_test = pd.DataFrame(scaler.fit_transform(test_data),
                                     columns=test_data.columns)
        else:
            try:
                vecs_test = pd.DataFrame(scaler.transform(test_data),
                                         columns=test_data.columns)
            except Exception:
                # Supplied scaler does not fit this data: refit it.
                vecs_test = pd.DataFrame(scaler.fit_transform(test_data),
                                         columns=test_data.columns)

        return vecs_test, test_target, scaler, target

Example #10

0

Show file

# Preview the frame that is passed to FAMD further down.
id_cluster_df.head()

# + hidden=true
# One-hot encoded version of the frame. The FAMD fit below works on
# id_cluster_df directly, not on this encoded frame.
id_dummies_df = pd.get_dummies(id_cluster_df)
id_dummies_df.head()

# + hidden=true
id_dummies_df.shape

# + hidden=true
import prince

# FAMD (Factor Analysis of Mixed Data) estimator with 39 components.
famd = prince.FAMD(
    n_components=39,
    n_iter=10,
    copy=True,
    check_input=True,
    engine='auto',  ## Can be "auto", 'sklearn', 'fbpca'
    random_state=42)

## Fit FAMD object to data
## NOTE(review): no column is dropped here - confirm that the target
## variable ("Churn") was already removed from id_cluster_df upstream.
famd = famd.fit(id_cluster_df)

# Coordinates of every row of id_cluster_df in the fitted component space.
famd_data = famd.row_coordinates(id_cluster_df)

# + hidden=true
# Sum of the per-component explained inertia values.
np.sum(famd.explained_inertia_)

# + hidden=true
# Disabled alternative: standard-scale the one-hot encoded frame.
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(id_dummies_df)

Example #11

0

Show file

# Remove the 'income' target column from the test features.
test_data = test_data.drop(columns=['income'])

# Encode the 'income' column with encode_categorical_S (defined elsewhere),
# then pick the encoded labels for the training and test rows by index.
# Only `labels` is used below; the second return value is not referenced
# in this fragment.
to_encode = data_frame['income'].copy()
labels, fetures = encode_categorical_S(to_encode)
train_target = pd.DataFrame(labels.loc[train_data.index.values],
                            columns=['income'])
test_target = pd.DataFrame(labels.loc[test_data.index.values],
                           columns=['income'])

#%%
# Use FAMD (Factor Analysis for Mixed Data) to reduce the dimensions of the
# data set and convert categorical data to numeric form.
''' Rename test to valid'''
famd = pr.FAMD(n_components=5,
               n_iter=10,
               copy=True,
               check_input=True,
               engine='auto',
               random_state=None)

# NOTE(review): fit() is called twice on the same `famd` object, so the test
# coordinates come from a model fitted on the test rows rather than from the
# training fit (famd_train and famd_test are presumably the same object if
# fit() returns self) - confirm this is intended.
famd_train = famd.fit(train_data)
vecs_train = famd_train.row_coordinates(train_data)
famd_test = famd.fit(test_data)
vecs_test = famd_test.row_coordinates(test_data)

# Standardise the component vectors: the scaler is fitted on the training
# vectors only and then applied to the test vectors.
scaler = preprocessing.StandardScaler()
vecs_train = pd.DataFrame(scaler.fit_transform(vecs_train),
                          columns=vecs_train.columns)
vecs_test = pd.DataFrame(scaler.transform(vecs_test),
                         columns=vecs_test.columns)

#%% Model

Example #12

0

Show file

def main():
    st.title("Hepatocellular Carcinoma Clustering Model")
    url = "https://raw.githubusercontent.com/taylor-m/hcc_clustering/main/hcc_data/hcc-data.csv"
    raw = pd.read_csv(url)
    cols = [
        "Gender",
        "Symptoms",
        "Alcohol",
        "HBsAg",
        "HBeAg",
        "HBcAb",
        "HCVAb",
        "Cirrhosis",
        "Endemic",
        "Smoking",
        "Diabetes",
        "Obesity",
        "Hemochro",
        "AHT",
        "CRI",
        "HIV",
        "NASH",
        "Varices",
        "Spleno",
        "PHT",
        "PVT",
        "Metastasis",
        "Hallmark",
        "Age",
        "Grams_day",
        "Packs_year",
        "PS",
        "Encephalopathy",
        "Ascites",
        "INR",
        "AFP",
        "Hemoglobin",
        "MCV",
        "Leucocytes",
        "Platelets",
        "Albumin",
        "Total_Bil",
        "ALT",
        "AST",
        "GGT",
        "ALP",
        "TP",
        "Creatinine",
        "Nodule",
        "Major_Dim",
        "Dir_Bil",
        "Iron",
        "Sat",
        "Ferritin",
        "Class",
    ]
    st.set_option('deprecation.showPyplotGlobalUse', False)

    #=========================================================================
    # DATA FUNCTIONS
    def load_data():
        """Read the CSV at ``url``, impute missing values and return a DataFrame.

        ``"?"`` cells become NaN, all columns are imputed with ``KNNImputer``
        and the categorical columns are rounded back to integers.
        """
        frame = pd.read_csv(url, names=cols)

        # "?" cells are replaced by NaN, which the imputer is told to fill.
        frame[frame == "?"] = np.nan

        # KNN imputation returns a plain array, so the frame is rebuilt with
        # the original column names.
        filled = KNNImputer(missing_values=np.nan).fit_transform(frame)
        frame = pd.DataFrame(filled, columns=cols)

        integer_columns = [
            "Gender",
            "Symptoms",
            "Alcohol",
            "HBsAg",
            "HBeAg",
            "HBcAb",
            "HCVAb",
            "Cirrhosis",
            "Endemic",
            "Smoking",
            "Diabetes",
            "Obesity",
            "Hemochro",
            "AHT",
            "CRI",
            "HIV",
            "NASH",
            "Varices",
            "Spleno",
            "PHT",
            "PVT",
            "Metastasis",
            "Hallmark",
            "Class",
            "PS",
            "Encephalopathy",
            "Ascites",
            "Nodule",
        ]

        # Imputed entries in these columns may be fractional; round them to
        # whole numbers and store them as ints.
        for name in integer_columns:
            frame[name] = frame[name].round().astype(int)
        return frame

    def scale_df(df):
        """Return ``df`` passed through ``StandardScaler``, keeping its labels."""
        # fit_transform yields an array; the original column names and index
        # are re-attached.
        standardized = StandardScaler().fit_transform(df)
        return pd.DataFrame(standardized, columns=df.columns, index=df.index)

    def create_dmat(scaled):
        """Return the square matrix of pairwise cosine distances between rows."""
        # pdist gives the condensed form; squareform expands it to n x n.
        return squareform(pdist(scaled, metric="cosine"))

    def kmed_cluster(df, k):
        """Cluster ``df`` with k-medoids on a cosine distance matrix.

        The ``"Class"`` column is excluded from the features, which are
        scaled with ``scale_df`` and turned into a distance matrix with
        ``create_dmat``. The cluster label of every row is written to a new
        ``"kmed"`` column on ``df`` itself (the argument is modified).

        Args:
            df: DataFrame containing a ``"Class"`` column.
            k: Number of initial medoids to draw. The relabelling below only
                covers the labels "0".."4".

        Returns:
            Tuple ``(df, group_df, clusters, dmat)``: the labelled frame, a
            styled per-cluster mean table, the clusters reported by the
            k-medoids object and the distance matrix.
        """
        # Features only: leave the "Class" column out of the clustering input.
        df2 = df.drop(columns="Class")

        X_scaled = scale_df(df2)
        dmat = create_dmat(X_scaled)

        # Draw k random row indices of the distance matrix as initial medoids.
        # The global NumPy seed is fixed first so the draw is repeatable.
        # NOTE(review): randint samples with replacement, so duplicate initial
        # medoids are possible - confirm this is acceptable.
        np.random.seed(42)
        n_rows = dmat.shape[0]
        init_medoids = np.random.randint(0, n_rows, k)

        # init_medoids
        kmed = kmedoids(dmat,
                        initial_index_medoids=init_medoids,
                        data_type="distance_matrix")
        kmed.process()
        clusters = kmed.get_clusters()
        # medoid_idxs is not used further in this function.
        medoid_idxs = kmed.get_medoids()
        # medoid_idxs

        # Cluster label for every row, stored on the caller's frame.
        labels = kmed.predict(dmat)
        df["kmed"] = labels
        # print(df.kmed.value_counts())
        # group_df = df.groupby("kmed").mean().sort_values("Class").style.background_gradient()

        # casting kmed clusters to strings
        df.kmed = df.kmed.astype(str)

        # reordering cluster numbers by mortality rate
        # The old labels are strings while the new values are ints, so a row
        # that has already been renumbered is not matched again by the later
        # string comparisons (int 4 != "4"). Statement order and the str/int
        # distinction therefore both matter here.
        df.loc[(df.kmed == "3"), "kmed"] = 4
        df.loc[(df.kmed == "0"), "kmed"] = 2
        df.loc[(df.kmed == "1"), "kmed"] = 3
        df.loc[(df.kmed == "4"), "kmed"] = 1
        df.loc[(df.kmed == "2"), "kmed"] = 0

        # Per-cluster column means, styled with a background gradient.
        group_df = df.groupby("kmed").mean().style.background_gradient()
        # counts = df.kmed.value_counts().index.sort_values(ascending=False)
        # group_df["count"] = counts
        return df, group_df, clusters, dmat

    #=========================================================================
    # load data and impute missing values
    df = load_data()
    # run K-medoid cluster model w/ K = 5
    df, group_df, clusters, dmat = kmed_cluster(df, 5)
    st.sidebar.title("Model")
    #=========================================================================
    cols = [
        "Gender",
        "Symptoms",
        "Alcohol",
        "HBsAg",
        "HBeAg",
        "HBcAb",
        "HCVAb",
        "Cirrhosis",
        "Endemic",
        "Smoking",
        "Diabetes",
        "Obesity",
        "Hemochro",
        "AHT",
        "CRI",
        "HIV",
        "NASH",
        "Varices",
        "Spleno",
        "PHT",
        "PVT",
        "Metastasis",
        "Hallmark",
        "Age",
        "Grams_day",
        "Packs_year",
        "PS",
        "Encephalopathy",
        "Ascites",
        "INR",
        "AFP",
        "Hemoglobin",
        "MCV",
        "Leucocytes",
        "Platelets",
        "Albumin",
        "Total_Bil",
        "ALT",
        "AST",
        "GGT",
        "ALP",
        "TP",
        "Creatinine",
        "Nodule",
        "Major_Dim",
        "Dir_Bil",
        "Iron",
        "Sat",
        "Ferritin",
        "Class",
    ]
    bin_cols = [
        "Gender",
        "Symptoms",
        "Alcohol",
        "HBsAg",
        "HBeAg",
        "HBcAb",
        "HCVAb",
        "Cirrhosis",
        "Endemic",
        "Smoking",
        "Diabetes",
        "Obesity",
        "Hemochro",
        "AHT",
        "CRI",
        "HIV",
        "NASH",
        "Varices",
        "Spleno",
        "PHT",
        "PVT",
        "Metastasis",
        "Hallmark",
        "Class",
    ]
    yes_no = [
        "Symptoms",
        "Alcohol",
        "HBsAg",
        "HBeAg",
        "HBcAb",
        "HCVAb",
        "Cirrhosis",
        "Endemic",
        "Smoking",
        "Diabetes",
        "Obesity",
        "Hemochro",
        "AHT",
        "CRI",
        "HIV",
        "NASH",
        "Varices",
        "Spleno",
        "PHT",
        "PVT",
        "Metastasis",
        "Hallmark",
    ]
    num_cols = [
        "Age",
        "Grams_day",
        "Packs_year",
        "INR",
        "AFP",
        "Hemoglobin",
        "MCV",
        "Leucocytes",
        "Platelets",
        "Albumin",
        "Total_Bil",
        "ALT",
        "AST",
        "GGT",
        "ALP",
        "TP",
        "Creatinine",
        "Major_Dim",
        "Dir_Bil",
        "Iron",
        "Sat",
        "Ferritin",
    ]
    cat_cols = ["PS", "Encephalopathy", "Ascites", "Nodule"]
    #=========================================================================
    analysis_vars = [
        "Iron",
        "Ferritin",
        "Dir_Bil",
        "Nodule",
        "GGT",
        "ALP",
        "Total_Bil",
        "Albumin",
        "Platelets",
        "Hemoglobin",
        "Gender",
        "Symptoms",
        "Alcohol",
        "HBsAg",
        "HBeAg",
        "HBcAb",
        "HCVAb",
        "Cirrhosis",
        "Endemic",
        "Smoking",
        "Diabetes",
        "Obesity",
        "Hemochro",
        "AHT",
        "CRI",
        "HIV",
        "NASH",
        "Varices",
        "Spleno",
        "PHT",
        "PVT",
        "Metastasis",
        "Hallmark",
        "Age",
        "Grams_day",
        "Packs_year",
        "PS",
        "Encephalopathy",
        "Ascites",
        "INR",
        "AFP",
        "MCV",
        "Leucocytes",
        "ALT",
        "AST",
        "TP",
        "Creatinine",
        "Major_Dim",
        "Sat",
        "Class",
    ]
    #=========================================================================
    num_vars = [
        "Age",
        "Grams_day",
        "Packs_year",
        "INR",
        "AFP",
        "Hemoglobin",
        "MCV",
        "Leucocytes",
        "Platelets",
        "Albumin",
        "Total_Bil",
        "ALT",
        "AST",
        "GGT",
        "ALP",
        "TP",
        "Creatinine",
        "Major_Dim",
        "Dir_Bil",
        "Iron",
        "Sat",
        "Ferritin",
        "Nodule",
        "PS",
        "Encephalopathy",
        "Ascites",
    ]
    #=========================================================================
    cat_vars = [
        "Gender",
        "Symptoms",
        "Alcohol",
        "HBsAg",
        "HBeAg",
        "HBcAb",
        "HCVAb",
        "Cirrhosis",
        "Endemic",
        "Smoking",
        "Diabetes",
        "Obesity",
        "Hemochro",
        "AHT",
        "CRI",
        "HIV",
        "NASH",
        "Varices",
        "Spleno",
        "PHT",
        "PVT",
        "Metastasis",
        "Hallmark",
        "Class",
    ]
    #=========================================================================
    var_dict = {
        "Gender": ["0=female|1=male"],
        "HBsAg": [
            "Hepatitis B surface Antigen\nHBsAg is the surface antigen of the hepatitis B virus. It indicates current hepatitis B infection.[10] An antigen is a protein that stimulates an immune system response, causing your body to produce antibodies to fight invaders. In hepatitis B, it's common to test for the hepatitis B surface antigen (HBsAg) and hepatitis B core antigen (HBcAg). These antigens are attached to the inside and the outside of the virus.[11]"
        ],
        "HBeAg": [
            "Hepatitis B e Antigen\nHBeAg stands for hepatitis B e-antigen. This antigen is a protein from the hepatitis B virus that circulates in infected blood when the virus is actively replicating. The presence of HBeAg suggests that the person is infectious and is able to spread the virus to other people.[9]"
        ],
        "HBcAb": [
            "Hep B core Antibody\nTotal hepatitis B core antibody (anti-HBc): Appears at the onset of symptoms in acute hepatitis B and persists for life. The presence of anti-HBc indicates previous or ongoing infection with hepatitis B virus in an undefined time frame. IgM antibody to hepatitis B core antigen (IgM anti-HBc)[8]"
        ],
        "HCVAb": [
            "Hep C Virus Antibody\nWhat is hepatitis C virus antibody? A reactive or positive antibody test means you have been infected with the hepatitis C virus at some point in time. Once people have been infected, they will always have antibodies in their blood. This is true if they have cleared the virus, have been cured, or still have the virus in their blood.[7]"
        ],
        "Endemic": ["Endemic Countries"],
        "Hemochro": [
            "Hemochromatosis\nHemochromatosis is the abnormal accumulation of iron in parenchymal organs, leading to organ toxicity. It is the most common autosomal recessive genetic disorder and the most common cause of severe iron overload.\nClinical manifestations of hemochromatosis include the following:\n\tLiver disease (hepatomegaly, 13%; cirrhosis, 13%, usually late in the disease)[6]"
        ],
        "AHT": ["Arterial Hypertension"],
        "CRI": [
            "Chronic Renal Insufficiency\nRenal insufficiency is poor function of the kidneys that may be due to a reduction in blood-flow to the kidneys caused by renal artery disease."
        ],
        "HIV": ["Human Immunodeficiency Virus"],
        "NASH": [
            "Nonalcoholic Steatohepatitis\nThe term nonalcoholic steatohepatitis (NASH) was first coined by Dr. Ludwig 3 decades ago to describe a unique entity characterized by fatty changes with lobular hepatitis in the absence of a history of alcoholism\nAt present, nonalcoholic fatty liver disease (NAFLD) has become the most common liver disease in the United States and, indeed, worldwide."
        ],
        "Varices": [
            "Esophageal Varices\nEsophageal varices are enlarged veins in the lower esophagus. They're often due to obstructed blood flow through the portal vein, which carries blood from the intestine and spleen to the liver. Esophageal varices are abnormal, enlarged veins in the tube that connects the throat and stomach (esophagus)[5]"
        ],
        "Spleno": [
            "Splenomegaly\nSplenomegaly is a condition that occurs when your spleen becomes enlarged. It's also commonly referred to as enlarged spleen or spleen enlargement. The spleen is a part of your lymphatic system. It helps the immune system by storing white blood cells and helping in the creation of antibodies.[3]\nMany conditions — including infections, liver disease and some cancers — can cause an enlarged spleen.[4]"
        ],
        "PHT": [
            "Portal Hypertension\nPortal hypertension is an increase in the pressure within the portal vein, which carries blood from the digestive organs to the liver. The most common cause is cirrhosis of the liver, but thrombosis (clotting) might also be the cause.[2]"
        ],
        "PVT": [
            "Portal Vein Thrombosis\nPortal vein thrombosis is blockage or narrowing of the portal vein (the blood vessel that brings blood to the liver from the intestines) by a blood clot.\nFluid accumulation in the abdomen (called ascites) is not common. But it may develop when people also have liver congestion (backup of blood in the liver) or liver damage, such as severe scarring of the liver (cirrhosis), or when large amounts of fluids are given intravenously to treat massive bleeding from ruptured varicose veins in the esophagus or stomach. If portal vein thrombosis develops in people with cirrhosis, their condition deteriorates.[1]"
        ],
        "Metastasis": ["Liver Metastasis"],
        "Hallmark": ["Radiological Hallmark"],
        "Age": ["Age at diagnosis"],
        "Grams_day": ["Grams of Alcohol per day"],
        "Packs_year": ["Packs of cigarets per year"],
        "PS": [
            "Performance Status:\n0=Active\n1=Restricted\n2=Ambulatory\n3=Selfcare\n4=Disabled\n5=Dead"
        ],
        "Encephalopathy":
        ["Encephalopathy degree:\n1=None\n2=Grade I/II\n3=Grade III/IV"],
        "Ascites": ["Ascites degree:\n1=None\n2=Mild\n3=Moderate to Severe"],
        "INR": [
            "International Normalised Ratio. This blood test looks to see how well your blood clots.\n\nThe international normalized ratio (INR) is a standardized number that's figured out in the lab. If you take blood thinners, also called anti-clotting medicines or anticoagulants, it may be important to check your INR. The INR is found using the results of the prothrombin time (PT) test. This measures the time it takes for your blood to clot. The INR is an international standard for the PT.",
        ],
        "AFP": [
            "Alpha-Fetoprotein; An AFP tumor marker test is a blood test that measures the levels of AFP in adults. Tumor markers are substances made by cancer cells or by normal cells in response to cancer in the body. High levels of AFP can be a sign of liver cancer or cancer of the ovaries or testicles, as well as noncancerous liver diseases such as cirrhosis and hepatitis.\n\nHigh AFP levels don't always mean cancer, and normal levels don't always rule out cancer. So an AFP tumor marker test is not usually used by itself to screen for or diagnose cancer. But it can help diagnose cancer when used with other tests. The test may also be used to help monitor the effectiveness of cancer treatment and to see if cancer has returned after you've finished treatment.",
        ],
        "Hemoglobin": [
            "Hemoglobin is the protein molecule in red blood cells that carries oxygen from the lungs to the body's tissues and returns carbon dioxide from the tissues back to the lungs. Higher than normal hemoglobin levels can be seen in people living at high altitudes and in people who smoke and infrequently with certain tumors.",
        ],
        "MCV": [
            "Mean Corpuscular Volume; An MCV blood test measures the average size of your red blood cells. Larger than normal RBCs may indicate liver disease.",
        ],
        "Leucocytes": [
            "white blood cells",
        ],
        "Platelets": [
            "Platelets, or thrombocytes, are small, colorless cell fragments in our blood that form clots and stop or prevent bleeding.",
        ],
        "Albumin": [
            "Albumin is a protein made by your liver. Albumin helps keep fluid in your bloodstream so it doesn't leak into other tissues. It is also carries various substances throughout your body, including hormones, vitamins, and enzymes. Low albumin levels can indicate a problem with your liver or kidneys.",
        ],
        "Total_Bil": [
            "Total Bilirubin; This is a blood test that measures the amount of a substance called bilirubin. This test is used to find out how well your liver is working. It is often part of a panel of tests that measure liver function. A small amount of bilirubin in your blood is normal, but a high level may be a sign of liver disease.",
        ],
        "ALT": [
            "Alanine aminotransferase (ALT) is an enzyme found mostly in the cells of the liver and kidney. Much smaller amounts of it are also found in the heart and muscles. Normally, ALT levels in blood are low, but when the liver is damaged, ALT is released into the blood and the level increases.",
        ],
        "AST": [
            "Aspartate aminotransferase (AST) is an enzyme found in cells throughout the body but mostly in the heart and liver and, to a lesser extent, in the kidneys and muscles. In healthy individuals, levels of AST in the blood are low. When liver or muscle cells are injured, they release AST into the blood.",
        ],
        "GGT": [
            "Gamma glutamyl transferase (GGT) is an enzyme found in cell membranes of many tissues mainly in the liver, kidney, and pancreas. [1] It is also found in other tissues including intestine, spleen, heart, brain, and seminal vesicles. The highest concentration is in the kidney, but the liver is considered the source of normal enzyme activity.",
        ],
        "ALP": [
            "Alkaline phosphatase; The alkaline phosphatase test (ALP) is used to help detect liver disease or bone disorders. It is often ordered along with other tests, such as a gamma-glutamyl transferase (GGT) test and/or as part of a liver panel. In conditions affecting the liver, damaged liver cells release increased amounts of ALP into the blood.",
        ],
        "TP": [
            "The total protein test measures the total amount of two classes of proteins found in the fluid portion of your blood. These are albumin and globulin. Proteins are important parts of all cells and tissues. Albumin helps prevent fluid from leaking out of blood vessels. Low levels can be indicative of liver disease.",
        ],
        "Creatinine": [
            "Creatinine is critically important in assessing renal function because it has several interesting properties. In blood, it is a marker of glomerular filtration rate;",
        ],
        "Nodule": [
            "Number of Nodules",
        ],
        "Major_Dim": [
            "Major dimension of nodule",
        ],
        "Dir_Bil": [
            "Direct Bilirubin; Bilirubin is a tetrapyrrole and a breakdown product of heme catabolism. Most bilirubin (70%-90%) is derived from hemoglobin degradation and, to a lesser extent, from other hemo proteins. In the serum, bilirubin is usually measured as both direct bilirubin (DBil) and total-value bilirubin",
        ],
        "Iron": [
            "The amount of circulating iron bound to transferrin is reflected by the serum iron level.",
        ],
        "Sat": [
            "Oxygen Saturation",
        ],
        "Ferritin": [
            "Ferritin is the cellular storage protein for iron. It is present in small concentrations in blood, and the serum ferritin concentration normally correlates well with total-body iron stores, making its measurement important in the diagnosis of disorders of iron metabolism.",
        ],
        "Class": ["1=lives\n0=dies\n@ 1 year"],
    }
    #=========================================================================
    # Short human-readable label (with unit where one applies) per variable.
    # Keys use the same spellings as var_dict, lab_values and analysis_dict
    # ("Leucocytes", "Nodule", "Major_Dim", "Dir_Bil") so that a lookup by
    # column name works across all of these tables.
    plot_dict = {
        "Gender": "0=female|1=male",
        "HBsAg": "Hepatitis B surface Antigen",
        "HBeAg": "Hepatitis B e Antigen",
        "HBcAb": "Hep B core Antibody",
        "HCVAb": "Hep C Virus Antibody",
        "Endemic": "Endemic Countries",
        "Hemochro": "Hemochromatosis",
        "AHT": "Arterial Hypertension",
        "CRI": "Chronic Renal Insufficiency",
        "HIV": "Human Immunodeficiency Virus",
        "NASH": "Nonalcoholic Steatohepatitis",
        "Varices": "Esophageal Varices",
        "Spleno": "Splenomegaly",
        "PHT": "Portal Hypertension",
        "PVT": "Portal Vein Thrombosis",
        "Metastasis": "Liver Metastasis",
        "Hallmark": "Radiological Hallmark",
        "Age": "Age at diagnosis",
        "Grams_day": "Grams of Alcohol per day",
        "Packs_year": "Packs of cigarets per year",
        "PS":
        "Performance Status:\n0=Active\n1=Restricted\n2=Ambulatory\n3=Selfcare\n4=Disabled\n5=Dead",
        "Encephalopathy":
        "Encephalopathy degree:\n1=None\n2=Grade I/II\n3=Grade III/IV",
        "Ascites": "Ascites degree:\n1=None\n2=Mild\n3=Moderate to Severe",
        "INR": "International Normalised Ratio",
        "AFP": "Alpha-Fetoprotein (ng/mL)",
        "Hemoglobin": "(g/dL)",
        "MCV": "Mean Corpuscular Volume (fl)",
        "Leucocytes": "(G/L)",
        "Platelets": "(G/L)",
        "Albumin": "(mg/dL)",
        "Total_Bil": "Total Bilirubin (mg/dL)",
        "ALT": "Alanine transaminase (U/L)",
        "AST": "Aspartate transaminase (U/L)",
        "GGT": "Gamma glutamyl transferase (U/L)",
        "ALP": "Alkaline phosphatase (U/L)",
        "TP": "Total Proteins (g/dL)",
        "Creatinine": "(mg/dL)",
        "Nodule": "Number of Nodules",
        "Major_Dim": "Major dimension of nodule (cm)",
        "Dir_Bil": "Direct Bilirubin (mg/dL)",
        "Iron": "(mcg/dL)",
        "Sat": "Oxygen Saturation (%)",
        "Ferritin": "(ng/mL)",
        "Class": "1=lives\n0=dies\n@ 1 year",
    }
    #=========================================================================
    # Columns handled as categorical. The "Plot Clusters" option of the
    # Cluster Analysis view casts each of these columns to str before the
    # frame is handed to prince.FAMD.
    cats = [
        "Gender",
        "Symptoms",
        "Alcohol",
        "HBsAg",
        "HBeAg",
        "HBcAb",
        "HCVAb",
        "Cirrhosis",
        "Endemic",
        "Smoking",
        "Diabetes",
        "Obesity",
        "Hemochro",
        "AHT",
        "CRI",
        "HIV",
        "NASH",
        "Varices",
        "Spleno",
        "PHT",
        "PVT",
        "Metastasis",
        "Hallmark",
        "Class",
        "PS",
        "Encephalopathy",
        "Ascites",
        "Nodule",
    ]
    #=========================================================================
    # Reference band per lab variable: [lower bound, upper bound, unit].
    # The box/violin helpers shade the band between the two bounds and put
    # the unit into the y-axis title.
    lab_values = {
        "INR": [0, 1.1, ""],
        "AFP": [0, 10, "ng/mL"],
        "Hemoglobin": [12, 18, "g/dL"],
        "MCV": [80, 100, "fl"],
        "Leucocytes": [4, 11, "G/L"],
        "Platelets": [150000, 450000, "G/L"],
        "Albumin": [3.4, 5.4, "mg/dL"],
        "Total_Bil": [0, 1, "mg/dL"],
        "ALT": [29, 33, "U/L"],
        "AST": [0, 35, "U/L"],
        "GGT": [5, 40, "U/L"],
        "ALP": [44, 147, "U/L"],
        "TP": [6, 8.3, "g/dL"],
        "Creatinine": [0.5, 1.2, "mg/dL"],
        "Dir_Bil": [0.1, 0.3, "mg/dL"],
        "Iron": [60, 180, "mcg/dL"],
        "Sat": [95, 100, "%"],
        "Ferritin": [10, 300, "ng/mL"],
    }

    #=========================================================================
    # CLUSTERING FUNCTIONS
    def kmed_predict(df, X, k=5):
        """Cluster ``df`` together with the new rows ``X`` using k-medoids.

        The rows of ``X`` are placed after the rows of ``df``, the combined
        frame is scaled with ``scale_df`` and converted to a distance matrix
        with ``create_dmat``, and k-medoids is run on that matrix starting
        from ``k`` randomly drawn (seeded) medoid indices.

        Args:
            df: DataFrame holding the existing samples.
            X: DataFrame with the sample(s) to add; it is concatenated to
                ``df`` row-wise, keeping its own index.
            k: Number of initial medoids. The relabelling table below only
                covers the labels "0"-"4" (the default ``k=5``); any other
                label is kept in its string form.

        Returns:
            A new DataFrame (rows of ``df`` followed by rows of ``X``) with an
            added ``kmed`` column holding the relabelled cluster of every row.
            The frames passed in are not modified.
        """
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent and also returns a new frame.
        df = pd.concat([df, X])
        dmat = create_dmat(scale_df(df))

        # Fixed seed so the initial medoid draw is reproducible.
        np.random.seed(42)
        init_medoids = np.random.randint(0, dmat.shape[0], k)

        kmed = kmedoids(dmat,
                        initial_index_medoids=init_medoids,
                        data_type="distance_matrix")
        kmed.process()
        labels = kmed.predict(dmat)

        # Reordering cluster numbers by mortality rate (original author's
        # ordering). A single lookup replaces the former chain of in-place
        # assignments, which only worked because the new values were ints
        # written into a column of strings.
        relabel = {"3": 4, "0": 2, "1": 3, "4": 1, "2": 0}
        df["kmed"] = np.array(
            [relabel.get(str(label), str(label)) for label in labels],
            dtype=object)
        return df

    # PLOTTING FUNCTIONS
    def plot_boxplot(var):
        """Draw a box plot of column ``var`` grouped by risk cluster.

        When ``var`` has an entry in ``lab_values`` the y-axis title carries
        the unit and the band between the two stored bounds is shaded green.
        The figure is rendered with ``st.plotly_chart``.
        """
        box_trace = go.Box(
            y=df[var],
            x=df.kmed,
            boxpoints=False,  # no data points
        )
        fig = go.Figure()
        fig.add_trace(box_trace)

        layout = {
            "title": f"{var} Values of Risk Clusters",
            "xaxis_title": "Risk Clusters",
            "yaxis_title": f"{var}",
        }
        reference = lab_values.get(var)
        if reference is not None:
            layout["yaxis_title"] = f"{var} values ({reference[2]})"
        fig.update_layout(**layout)

        # add min and max range band for lab values
        if reference is not None:
            fig.add_hrect(y0=reference[0],
                          y1=reference[1],
                          line_width=0,
                          fillcolor="green",
                          opacity=0.2)
        st.plotly_chart(fig)

    def plot_violin(var):
        """Draw a violin plot (with mean line) of ``var`` per risk cluster.

        For variables listed in ``lab_values`` the unit is appended to the
        y-axis title and the band between the stored bounds is shaded green.
        The figure is rendered with ``st.plotly_chart``.
        """
        fig = go.Figure()
        violin_trace = go.Violin(y=df[var], x=df.kmed)
        fig.add_trace(violin_trace)
        fig.update_traces(meanline_visible=True)

        has_range = var in lab_values
        if has_range:
            y_title = f"{var} values ({lab_values[var][2]})"
        else:
            y_title = f"{var}"
        fig.update_layout(title=f"{var} Values of Risk Clusters",
                          xaxis_title="Risk Clusters",
                          yaxis_title=y_title)

        # add min and max range band for lab values
        if has_range:
            bounds = lab_values[var]
            fig.add_hrect(y0=bounds[0],
                          y1=bounds[1],
                          line_width=0,
                          fillcolor="green",
                          opacity=0.2)
        st.plotly_chart(fig)

    def plot_barplot(var):
        """Draw a bar chart of column ``var`` across the risk clusters.

        Three cases are distinguished:
        * ``var`` is in ``lab_values`` or ``cat_cols``: one stacked trace per
          distinct value of the column;
        * ``var == "Gender"``: one trace for female and one for male;
        * anything else is treated as a 0/1 column with "No"/"Yes" traces.

        The figure is rendered with ``st.plotly_chart``.
        """
        fig = go.Figure(data=[])
        if var in lab_values or var in cat_cols:
            for val in df[var].unique():
                # Filter x and y with the same mask so each bar value is
                # paired with the cluster of the row it came from.
                subset = df[df[var] == val]
                fig.add_trace(
                    go.Bar(name=f"{var} = {val}",
                           x=subset.kmed,
                           y=subset[var]))
            fig.update_layout(barmode='stack')
        elif var == "Gender":
            # Dataset coding is 0=female, 1=male.
            fig.add_trace(
                go.Bar(name="female", x=df.kmed, y=(df["Gender"] == 0)))
            fig.add_trace(
                go.Bar(name="male", x=df.kmed, y=(df["Gender"] == 1)))
        else:
            # Binary columns are coded 1=yes, 0=no (the predictor form in
            # this file encodes "yes" as 1).
            fig.add_trace(go.Bar(name="No", x=df.kmed, y=(df[var] == 0)))
            fig.add_trace(go.Bar(name="Yes", x=df.kmed, y=(df[var] == 1)))
        fig.update_layout(title=f"{var} by Risk Cluster",
                          xaxis_title="Risk Cluster",
                          yaxis_title=f"{var}")
        st.plotly_chart(fig)

    def plot_hist(var, cluster_num):
        """Render a histogram of ``var`` for the rows whose ``kmed`` equals ``cluster_num``."""
        in_cluster = df.kmed == cluster_num
        st.plotly_chart(px.histogram(df[in_cluster], x=var))

    #=========================================================================
    option = st.sidebar.selectbox(
        "Model Options",
        ("Objective", "Data", "Cluster Analysis", "Cluster Predict", "Source"))
    #=========================================================================
    if option == "Data":
        st.subheader("Dataset")
        if st.sidebar.checkbox("full data", False):
            st.write(df)
        else:
            st.write(df.head(10))
        st.write(f"Number of samples: {df.shape[0]}")
        # st.write()
        st.subheader("Variables")
        st.write("""
            \n1. Gender 
            \n\t\t0=female|1=male
            \n2. Symptoms
            \n3. Alcohol
            \n4. HBsAg - Hep B surface Antigen
            \n5. HBeAg - Hep B e Antigen
            \n6. HBcAb - Hep B core Antibody
            \n7. HCVAb - Hep C Virus Antibody
            \n8. Cirrhosis
            \n9. Endemic Countries
            \n10. Smoking
            \n11. Diabetes
            \n12. Obesity
            \n13. Hemochromatosis
            \n14. AHT - Arterial Hypertension
            \n15. CRI - Chronic Renal Insufficiency
            \n16. HIV - Human Immunodeficiency Virus
            \n17. NASH - Nonalcoholic Steatohepatitis
            \n18. Varices - Esophageal Varices
            \n19. Spleno - Splenomegaly
            \n20. PHT - Portal Hypertension
            \n21. PVT - Portal Vein Thrombosis
            \n22. Metastasis - Liver Metastasis
            \n23. Hallmark - Radiological Hallmark
            \n24. Age - Age at diagnosis
            \n25. Grams/day - Grams of Alcohol per day
            \n26. Packs/year - Packs of cigarets per year
            \n27. PS - Performance Status 
                    \n\t\t[0=Active;1=Restricted;2=Ambulatory;3=Selfcare;4=Disabled;5=Dead]
            \n28. Encephalopathy - Encephalopathy degree
                    \n\t\t[1=None;2=Grade I/II; 3=Grade III/IV]
            \n29. Ascites - Ascites degree
                    \n\t\t[1=None;2=Mild;3=Moderate to Severe]
            \n30. INR - International Normalised Ratio
            \n31. AFP - Alpha-Fetoprotein (ng/mL)
            \n32. Hemoglobin (g/dL)
            \n33. MCV - Mean Corpuscular Volume (fl)
            \n34. Leukocytes(G/L)	
            \n35. Platelets	(G/L)
            \n36. Albumin (mg/dL)
            \n37. Total Bilirubin(mg/dL)
            \n38. ALT - Alanine transaminase (U/L)
            \n39. AST - Aspartate transaminase (U/L)
            \n40. GGT - Gamma glutamyl transferase (U/L)
            \n41. ALP - Alkaline phosphatase (U/L)
            \n42. TP - Total Proteins (g/dL)
            \n43. Creatinine (mg/dL)
            \n44. Nodules - Number of Nodules
            \n45. Major Dim - Major dimension of nodule (cm)
            \n46. Dir Bil - Direct Bilirubin (mg/dL)
            \n47. Iron	(mcg/dL)
            \n48. Sat - Oxygen Saturation (%)
            \n49. Ferritin (ng/mL)
            \n50. Class (1=lives;0=dies) @ 1 year
            """)
    #=========================================================================
    if option == "Source":
        st.write("""
            Data Set Name: 
            \nHepatocellular Carcinoma Dataset (HCC dataset)

            \n\nAbstract: 
            \nHepatocellular Carcinoma dataset (HCC dataset) was collected at a University Hospital in Portugal. It contains real clinical data of 165 patients diagnosed with HCC.

            \n\nDonors:
            \nMiriam Seoane Santos ([email protected]) and Pedro Henriques Abreu ([email protected]), Department of Informatics Engineering, Faculty of Sciences and Technology, University of Coimbra
            . Armando Carvalho ([email protected]) and Adélia Simão ([email protected]), Internal Medicine Service, Hospital and University Centre of Coimbra

            \n\nData Type: Multivariate
            \nTask: Classification, Regression, Clustering, Casual Discovery
            \nAttribute Type: Categorical, Integer and Real

            \n\nArea: Life Sciences
            \n\nFormat Type: Matrix
            \n\nMissing values: Yes

            \n\nInstances and Attributes:
            \nNumber of Instances (records in your data set): 165
            \nNumber of attributes (fields within each record): 49

            \n\nRelevant Information:
            \nHCC dataset was obtained at a University Hospital in Portugal and contais several demographic, risk factors, laboratory and overall survival features of 165 real patients diagnosed with HCC. The dataset contains 49 features selected according to the EASL-EORTC (European Association for the Study of the Liver - European Organisation for Research and Treatment of Cancer) Clinical Practice Guidelines, which are the current state-of-the-art on the management of HCC.

            \n\nThis is an heterogeneous dataset, with 23 quantitative variables, and 26 qualitative variables. Overall, missing data represents 10.22% of the whole dataset and only eight patients have complete information in all fields (4.85%). The target variables is the survival at 1 year, and was encoded as a binary variable: 0 (dies) and 1 (lives). A certain degree of class-imbalance is also present (63 cases labeled as “dies” and 102 as “lives”).

            \n\nA detailed description of the HCC dataset (feature’s type/scale, range, mean/mode and missing data percentages) is provided in Santos et al. “A new cluster-based oversampling method for improving survival prediction of hepatocellular carcinoma patients”, Journal of biomedical informatics, 58, 49-59, 2015.
            """)
    #=========================================================================
    analysis_dict = {
        "Ferritin": "- ~12-112% higher mean levels",
        "Iron":
        "- significantly higher iron level distribution in high risk cluster\n- lower iron level distributions in medium risk clusters",
        "Dir_Bil":
        "- highest levels in samples in risk cluster 4\n- cluster 3 shows slightly higher levels\n- clusters 0-2 have low distributions",
        "Major_Dim": "indistinct",
        "Gender": "",
        "HBsAg": "",
        "HBeAg": "",
        "HBcAb":
        "- the highest risk group has a higher ratio of samples without the HBcAb than the others",
        "HCVAb":
        "- the highest risk group has a higher ratio of samples without the HBcAb than the others",
        "Endemic": "",
        "Hemochro": "",
        "AHT": "",
        "CRI": "",
        "HIV": "",
        "NASH": "",
        "Varices": "",
        "Spleno": "",
        "PHT": "",
        "PVT": "",
        "Metastasis": "",
        "Hallmark": "",
        "Age": "",
        "Grams_day": "",
        "Packs_year": "",
        "PS": "",
        "Encephalopathy":
        "risk clusters 0,1 have even distributions throughout; 2-4 heavy grouping around 1",
        "Ascites":
        "risk clusters 0,1 have even distributions throughout; 2-4 heavy grouping around 1",
        "INR":
        "risk cluster 4 has a distribution between the others; 2,3 being lower, 0,1 being higher",
        "AFP": "cluster 4, 2 have different distributions from other clusters",
        "Hemoglobin": "- elevated in high risk groups",
        "MCV": "inconclusive",
        "Leucocytes": "- same relative pattern as platelet var",
        "Platelets":
        "- clusters 0,3 are low/high risk clusters respectively yet have similar platelet levels\n- cluster 2 has highest platelet levels",
        "Albumin": "- elevated levels in high risk groups",
        "Total_Bil":
        "- risk clusters 2-4 all show bottom heavy total bilirubin levels\n- clusters 2 has similar levels yet is a lower risk group",
        "ALT": "inconclusive",
        "AST": "inconclusive",
        "GGT": "- lower risk groups have bottom heavy distributions",
        "ALP":
        "- higher risk groups have higher median levels & generally higher distributions",
        "TP": "inconclusive",
        "Creatinine": "indistinct",
        "Nodule": "- higher risk cluster, bottom heavy distribution",
    }
    #=========================================================================
    interest_vars = [
        "Ferritin",
        "Dir_Bil",
        "GGT",
        "ALP",
        "HBcAb",
        "HCVAb",
        "Smoking",
        "AHT",
        "Metastasis",
    ]
    #=========================================================================
    if option == "Cluster Analysis":
        # Cluster Analysis page: optional overview widgets in the sidebar,
        # then per-variable plots of the risk clusters.
        st.subheader("Cluster Analysis")

        # options for overall chart options
        st.sidebar.subheader("Chart Options:")
        if st.sidebar.checkbox("View Dataframe", False):
            st.write(df)
        if st.sidebar.checkbox("Gradient", False):
            # Explicit st.write instead of a bare expression, so rendering
            # does not depend on Streamlit's "magic" being active.
            st.write(group_df)
        if st.sidebar.checkbox("Counts", False):
            st.write(df.kmed.value_counts())
        if st.sidebar.checkbox("Plot Clusters", False):
            # Cast the categorical columns to str before fitting
            # (presumably so FAMD handles them as qualitative variables).
            df_copy = df.copy()
            for cat in cats:
                df_copy[cat] = df_copy[cat].astype(str)
            famd = prince.FAMD().fit(df_copy)

            # Hand the figure to Streamlit explicitly; calling st.pyplot()
            # without a figure relies on matplotlib's global state and is
            # deprecated.
            ax = famd.plot_row_coordinates(df_copy,
                                           color_labels=df_copy.kmed)
            st.pyplot(ax.get_figure())
        if st.sidebar.checkbox("Interest Vars", False):
            st.write(interest_vars)

        # option for var cluster visualization
        plot_var = st.sidebar.selectbox("Variable", analysis_vars)
        # option for viewing the analysis notes stored for the variable
        notes = st.sidebar.checkbox("Cluster Notes")

        if plot_var not in cat_vars:
            st.sidebar.header("Plot Type:")
            box = st.sidebar.checkbox("Boxplot")
            violin = st.sidebar.checkbox("Violin Plot")
            hist = st.sidebar.checkbox("Histogram")
            if hist:
                cluster_num = st.sidebar.selectbox("Cluster #",
                                                   df.kmed.unique())
            if st.sidebar.button("Plot", False):
                st.subheader(plot_var)
                st.write(var_dict[plot_var][0])
                if notes:
                    st.subheader("Analysis Notes")
                    # Not every variable has notes; fall back to an empty
                    # string instead of raising KeyError.
                    st.write(analysis_dict.get(plot_var, ""))
                if violin:
                    plot_violin(plot_var)
                if box:
                    plot_boxplot(plot_var)
                if hist:
                    # cluster_num is only bound when the Histogram box is
                    # ticked, which is exactly when this branch runs.
                    st.write(f"Cluster {cluster_num}, {plot_var}")
                    plot_hist(plot_var, cluster_num)
        elif st.sidebar.button("Plot", False):
            st.write(var_dict[plot_var][0])
            if notes:
                st.subheader("Analysis Notes")
                st.write(analysis_dict.get(plot_var, ""))
            plot_barplot(plot_var)

        # sidebar selectbox with info about normal ranges/values for health data
        # info_var = st.sidebar.selectbox("Var Info", cols)

    #=========================================================================
    if option == "Objective":
        st.subheader("Clustering Model Objective")
        st.write('''
            The Hepatocellular Carcinoma dataset (HCC dataset) was collected at a University Hospital in Portugal. It contains real clinical data of 165 patients diagnosed with HCC. 
            The purpose of my clustering model will be to create clusters that have a distribution of the mortality class average that will allow for cluster analysis to identify 
            features of interest used in determining the effect on mortality rates in patients with HCC.
            ''')
        st.subheader("End User Value:")
        st.write('''
            The value in the clustering analysis of the HCC dataset is providing additional insight into the ideal characteristics that compose the clusters having a lower overall 
            mortality rate relative to the higher mortality groups.
            ''')
        st.subheader("Quantifiable Results:")
        st.write('''
            Results are assessed based on the distribution of mortality rates across cluster groups. The dataset provides a target variable; however, clustering is done without 
            introducing the target variable to retain efficacy for clustering of future data. The metric will be primarily the mortality class distribution as the silhouette scores, 
            and other clustering metrics are ineffective with this particular dataset.
            ''')
        st.subheader("Visuals:")
        st.write('''
            I use FAMD dimensionality reduction to visualize clusters. I chose FAMD because of the balance of categorical and quantitative variables in the dataset.
            ''')
        st.subheader("Results:")
        st.write('''
            The objective is to find what distinguishing markers make up the higher and lower mortality groups easily distinguished from the rest of the clusters. The ideal 
            number of groups (3-6) because of the need for a distinguishable range of mortality averages for each cluster showing a distinctive pattern across groups.
            ''')
    #=========================================================================
    if option == "Cluster Predict":
        # Cluster Predict page: collect one value per feature from the user,
        # add that sample to the data, re-run k-medoids on everything and
        # report the cluster the new sample landed in.
        st.subheader("Cluster Predictor")
        df = df.drop(columns=["Class", "kmed"])
        cols = df.columns.to_list()

        # One widget per feature column, in column order.
        user_values = []
        for name in cols:
            if name in yes_no:
                answer = st.radio(name, ["yes", "no"])
                user_values.append(1 if answer == "yes" else 0)
            elif name in cat_cols:
                st.write(var_dict[name])
                user_values.append(
                    st.slider(name,
                              min_value=df[name].min(),
                              max_value=df[name].max()))
            elif name in num_cols:
                user_values.append(st.number_input(name))
            else:
                # Any column not covered above is asked as the gender
                # choice, encoded male=1 / female=0.
                answer = st.radio(name, ["male", "female"])
                user_values.append(1 if answer == "male" else 0)

        # Single-row frame with the same columns as df.
        input_df = pd.DataFrame(user_values).T
        input_df.columns = cols

        k = 5
        if st.button("Predict Cluster", False):
            # DataFrame.append was removed in pandas 2.0; pd.concat with
            # ignore_index=True is the supported equivalent.
            df = pd.concat([df, input_df], ignore_index=True)
            dmat = create_dmat(scale_df(df))

            # Fixed seed so the initial medoid draw is reproducible.
            np.random.seed(42)
            init_medoids = np.random.randint(0, dmat.shape[0], k)

            kmed = kmedoids(dmat,
                            initial_index_medoids=init_medoids,
                            data_type="distance_matrix")
            kmed.process()
            labels = kmed.predict(dmat)

            # Reordering cluster numbers by mortality rate (original
            # author's ordering), done as one lookup instead of a chain of
            # in-place assignments. Labels outside the table keep their
            # string form.
            relabel = {"3": 4, "0": 2, "1": 3, "4": 1, "2": 0}
            df["kmed"] = np.array(
                [relabel.get(str(label), str(label)) for label in labels],
                dtype=object)

            # The user's sample is the row that was appended last; indexing
            # from the end avoids hard-coding the dataset size.
            output = df.iloc[-1]
            st.write(f"Risk Cluster: {output['kmed']}")
            st.dataframe(output)

Example #13

0

Show file

def explore_global_plot(data, label='label', n_feats=50, id=None, task='classification'):
    '''
    Show a series of exploratory plots for a labelled DataFrame.

    In order: a 2-D projection of the complete rows (classification task only;
    FAMD when non-numeric feature columns exist, PCA otherwise), a correlation
    heatmap, a printout of the outliers found per numeric column, and three
    missing-value diagnostics (matrix, bar chart, per-row missing count).

    The input DataFrame is not modified.

    :param data: DataFrame
    :param label: label column name in the data
    :param n_feats: the number of features be used to analysis
                    (only the missing-value plots are limited to the first n_feats columns).
    :param id: optional name of an identifier column; it is checked for
               duplicated values and then excluded from the analysis.
    :param task: regression or classification
    :return: None
    '''
    columns = data.columns.tolist()
    columns.remove(label)

    if id is not None:
        # Check the values of the id column itself; `columns` is a plain list
        # of names and cannot be indexed by a column name.
        if data[id].duplicated().sum():
            print('{} is duplicated !!!'.format(id))

        columns.remove(id)
        # Drop on a new frame so the caller's DataFrame is left untouched.
        data = data.drop(id, axis=1)

    # is_integer_dtype already covers int64, so two checks are enough.
    numeric_names = [
        name for name, dtype in zip(columns, data[columns].dtypes)
        if ptypes.is_integer_dtype(dtype) or ptypes.is_float_dtype(dtype)
    ]
    # Keep the original column order (a set difference would shuffle it).
    numeric_set = set(numeric_names)
    category_names = [name for name in columns if name not in numeric_set]

    if task == 'classification':
        # data distribution for each class, on complete rows only
        new_data = data.dropna(axis=0)
        if category_names:
            famd = prince.FAMD(
                n_components=2,
                n_iter=3,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=42
            )
            famd = famd.fit(new_data[columns])
            # Project exactly the columns the model was fitted on.
            famd.plot_row_coordinates(
                new_data[columns],
                ax=None,
                x_component=0,
                y_component=1,
                labels=new_data.index,
                color_labels=['{}'.format(t) for t in new_data[label]],
                ellipse_outline=False,
                ellipse_fill=True,
                show_points=True
            )
            plt.show()
        else:
            pca = PCA(n_components=2, random_state=seed)
            X_pca = pca.fit_transform(new_data[columns])
            sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=label, data=new_data)
            plt.show()

    # sort features for correlation plot
    sorted_feat_name = numeric_names
    if len(numeric_names) > 6:
        n_clusters = 3
        new_data = data[[label] + numeric_names].dropna(axis=0)
        new_data_stand = StandardScaler().fit_transform(new_data[numeric_names])
        kmean_init = KMeans(n_clusters=n_clusters, random_state=seed)
        # Cluster the *features*: transpose so that each feature becomes one
        # sample. A reshape would have the right shape but scrambled values.
        kmean_init.fit(new_data_stand.T)
        sorted_feat = sorted(zip(numeric_names, kmean_init.labels_), key=lambda x: x[1])
        sorted_feat_name = [name for name, _ in sorted_feat]

    # correlation plot for all features
    sns.heatmap(data[[label] + sorted_feat_name + category_names].corr())
    plt.show()

    # outlier detection just for numeric features
    outlier = data[numeric_names].apply(mad_based_outlier)
    for i, column in enumerate(outlier.columns):
        print('outlier:\n {}'.format(data[[column]][outlier.iloc[:, i]]))

    # missing value pattern plot for all features
    msno.matrix(data[columns[:n_feats]])
    plt.show()

    msno.bar(data[columns[:n_feats]])
    plt.show()

    # number of missing attributes per row, sorted ascending and plotted by rank
    miss_data = data[columns[:n_feats]].isnull().sum(axis=1)
    miss_data = miss_data.to_frame()
    miss_data.columns = ['number_of_missing_attributes']
    miss_data.sort_values('number_of_missing_attributes', inplace=True)
    miss_data['index'] = list(range(0, miss_data.shape[0]))
    sns.jointplot(x="index", y="number_of_missing_attributes", data=miss_data)
    plt.show()

Example #14

0

Show file

File: test_famd.py Project: yatishnaik27/prince

 def test_only_categorical(self):
     # FAMD needs both variable kinds: with only non-numeric columns the
     # fit/transform sequence is expected to raise ValueError.
     non_numeric = self.X.select_dtypes(exclude=np.number)
     model = prince.FAMD()
     with self.assertRaises(ValueError):
         model.fit(non_numeric)
         model.transform(non_numeric)

Example #15

0

Show file

File: code.py Project: tatsiana-palikarpava/data-mining-and-machine-learning

# Master switch for the optional dimensionality-reduction step.
dim_red = False
if dim_red:
    import prince
    # Reducer selection: True -> PCA on the one-hot encoded frame,
    # False -> FAMD on the raw combined frame.
    pca = True
    if pca:
        # PCA is fitted on the one-hot encoded data
        dummies = pd.get_dummies(comb)
        pca = prince.PCA(n_components=50).fit(dummies)
        expl = pca.explained_inertia_
        # last element of the running total of the per-component inertia
        cum = np.cumsum(expl)[-1]
        print("Explained variance " + str(cum))
        dummies = pca.transform(dummies)
    else:
        famd = prince.FAMD(n_components=50).fit(comb)
        expl = famd.explained_inertia_
        # last element of the running total of the per-component inertia
        cum = np.cumsum(expl)[-1]
        print("Explained variance " + str(cum))
        comb = famd.transform(comb)
        # One-hot encode the transformed frame
        dummies = pd.get_dummies(comb)

#%%
""" Split combined data """
# Without dimensionality reduction the one-hot frame has not been built yet.
if not dim_red:
    dummies = pd.get_dummies(comb)
# The first len(train0) rows of the combined frame are the training rows.
X_train = dummies.iloc[:len(train0)]
#print(X_train.shape)
y_train = y

Example #16

0

Show file

File: test_famd.py Project: yatishnaik27/prince

 def test_fit_pandas_dataframe(self):
     # fit() on a DataFrame must return a FAMD instance. assertIsInstance
     # reports the actual type on failure, unlike assertTrue(isinstance(...)).
     famd = prince.FAMD()
     self.assertIsInstance(famd.fit(self.X), prince.FAMD)

Example #17

0

Show file

File: test_famd.py Project: yatishnaik27/prince

 def test_only_numerical_numpy(self):
     # Same expectation as the DataFrame variant, but feeding a NumPy array:
     # purely numeric input is expected to raise ValueError.
     numeric_only = self.X.select_dtypes(np.number)
     model = prince.FAMD()
     with self.assertRaises(ValueError):
         as_array = numeric_only.to_numpy()
         model.fit(as_array)
         model.transform(as_array)

Example #18

0

Show file

mca.explained_inertia_
# Bar chart of the explained inertia for (at most) the first 15 MCA axes.
# The labels are derived from the data so that fewer than 15 axes still plot.
inertia_head = mca.explained_inertia_[:15]
xlabels = [str(axis_no) for axis_no in range(1, len(inertia_head) + 1)]
plt.bar(xlabels, inertia_head)
plt.xlabel("Axe")
plt.title("Explained Inertia Ratio", fontsize=20)
"""
FAMD
"""
# Take an explicit copy of the column slice: assigning columns on a slice of
# results2 is a chained assignment and triggers SettingWithCopyWarning.
categorical_cols = ['Type 1', 'Type 2', 'Generation', 'Legendary']
X = results2.iloc[:, 2:12].copy()
# Mark these columns as categorical before handing the frame to FAMD.
X[categorical_cols] = X[categorical_cols].astype('category')

famd = prince.FAMD(n_components=10,
                   n_iter=3,
                   copy=True,
                   check_input=True,
                   engine='auto',
                   random_state=42)
famd = famd.fit(X)

famd.row_coordinates(X)

# Plot the rows of X on FAMD components 0 and 1, with one color label per
# value of results2["win"].
# NOTE(review): this call is cut off in the excerpt (no closing parenthesis);
# the remaining arguments are not visible here.
ax = famd.plot_row_coordinates(
    X,
    ax=None,
    figsize=(6, 6),
    x_component=0,
    y_component=1,
    color_labels=['{}'.format(t) for t in results2["win"]],
    ellipse_outline=False,
    ellipse_fill=True,

Example #19

0

Show file

File: init_params.py Project: RobeeF/M1DGMM

def dim_reduce_init(y,
                    n_clusters,
                    k,
                    r,
                    nj,
                    var_distrib,
                    use_famd=False,
                    seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space and determine 
    the init coefficients in that space
    
    y (numobs x p DataFrame): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    k (1d array): The number of components of the latent Gaussian mixture layers
    r (1d array): The dimensions of the latent variables; r[0] is the dimension of
                    the space the data are first reduced to
    nj (p 1darray): For binary/count data: The maximum values that the variable can take. 
                    For ordinal data: the number of different existing categories for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y 
                    ('bernoulli', 'binomial', 'ordinal', 'categorical' or 'continuous')
    use_famd (Bool): Whether to use the FAMD method (True) or not (False), to initiate the 
                    first continuous latent variable. Otherwise MCA is used.
                    Ignored when all variables are ordinal: PCA is used in that case.
    seed (None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------------------
    returns (dict): All initialisation parameters
    raises TypeError: if y is not a pandas DataFrame
    raises RuntimeError: if the number of paths found differs from prod(k)
    '''

    L = len(k)
    numobs = len(y)
    S = np.prod(k)

    #==============================================================
    # Dimension reduction performed with PCA, FAMD or MCA
    #==============================================================

    if not isinstance(y, pd.DataFrame):
        raise TypeError('y should be a dataframe for prince')

    # The masks built below rely on element-wise comparison and boolean
    # indexing, which plain lists do not support: normalise once here.
    var_distrib = np.asarray(var_distrib)
    nj = np.asarray(nj)

    if (var_distrib == 'ordinal').all():
        print('PCA init')

        pca = prince.PCA(n_components = r[0], n_iter=3, rescale_with_mean=True,
                         rescale_with_std=True, copy=True, check_input=True,
                         engine='auto', random_state = seed)
        z1 = pca.fit_transform(y).values

    elif use_famd:
        famd = prince.FAMD(n_components = r[0], n_iter=3, copy=True, check_input=False,
                           engine='auto', random_state = seed)
        z1 = famd.fit_transform(y).values

    else:
        # Check input = False to remove
        mca = prince.MCA(n_components = r[0], n_iter=3, copy=True,
                         check_input=False, engine='auto', random_state = seed)
        z1 = mca.fit_transform(y).values

    z = [z1]
    y = y.values

    #==============================================================
    # Set the shape parameters of each data type
    #==============================================================

    is_bin = np.logical_or(var_distrib == 'bernoulli',
                           var_distrib == 'binomial')
    y_bin = y[:, is_bin].astype(int)
    nj_bin = nj[is_bin]
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal'].astype(float).astype(int)
    nj_ord = nj[var_distrib == 'ordinal']
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical']
    nb_categ = len(nj_categ)

    # Set y_cont standard error to 1
    y_cont = y[:, var_distrib == 'continuous']

    # Before was np.float
    y_cont = y_cont / np.std(y_cont.astype(float), axis=0, keepdims=True)
    nb_cont = y_cont.shape[1]

    #=======================================================
    # Determining the Gaussian Parameters
    #=======================================================
    init = {}

    eta = []
    H = []
    psi = []
    paths_pred = np.zeros((numobs, L))

    # Each layer is initialised from the latent variable of the previous one;
    # z grows by one element (the next layer's latent variable) per iteration.
    for l in range(L):
        params = get_MFA_params(z[l], k[l], r[l:])
        eta.append(params['eta'][..., n_axis])
        H.append(params['H'])
        psi.append(params['psi'])
        z.append(params['z_nextl'])
        paths_pred[:, l] = params['classes']

    paths, nb_paths = np.unique(paths_pred, return_counts=True, axis=0)
    paths, nb_paths = add_missing_paths(k, paths, nb_paths)

    w_s = nb_paths / numobs
    # Avoid exactly-zero path probabilities
    w_s = np.where(w_s == 0, 1E-16, w_s)

    # Check all paths have been explored
    if len(paths) != S:
        raise RuntimeError(
            f'Real path len is {S} while the initial number '
            f'of path was only {len(paths)}')

    w_s = w_s.reshape(*k).flatten('C')

    #=============================================================
    # Enforcing identifiability constraints over the first layer
    #=============================================================

    H = diagonal_cond(H, psi)
    Ez, AT = compute_z_moments(w_s, eta, H, psi)
    eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

    init['eta'] = eta
    init['H'] = H
    init['psi'] = psi

    init['w_s'] = w_s  # Probabilities of each path through the network
    init['z'] = z

    # The clustering layer is the one used to perform the clustering
    # i.e. the layer l such that k[l] == n_clusters
    clustering_layer = np.argmax(np.array(k) == n_clusters)

    init['classes'] = paths_pred[:, clustering_layer]

    #=======================================================
    # Determining the coefficients of the GLLVM layer
    #=======================================================

    # Determining lambda_bin coefficients.

    lambda_bin = np.zeros((nb_bin, r[0] + 1))

    for j in range(nb_bin):
        Nj = np.max(y_bin[:, j])  # The support of the jth binomial is [1, Nj]

        if Nj == 1:  # If the variable is Bernoulli not binomial
            yj = y_bin[:, j]
            z_new = z[0]
        else:  # If not, need to convert Binomial output to Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], z[0])

        lr = LogisticRegression()

        # The first r[0] - 1 variables are regressed on the first j + 1 latent
        # dimensions only; the remaining coefficients of the row stay at zero.
        if j < r[0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate(
                [lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])

    ## Identifiability of bin coefficients
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT[0][0]

    # Determining lambda_ord coefficients
    lambda_ord = []

    for j in range(nb_ord):
        Nj = len(np.unique(
            y_ord[:, j], axis=0))  # The support of the jth ordinal is [1, Nj]
        yj = y_ord[:, j]

        ol = OrderedLogit()
        ol.fit(z[0], yj)

        ## Identifiability of ordinal coefficients
        beta_j = (ol.beta_.reshape(1, r[0]) @ AT[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining the coefficients of the continuous variables
    lambda_cont = np.zeros((nb_cont, r[0] + 1))

    for j in range(nb_cont):
        yj = y_cont[:, j]
        linr = LinearRegression()

        # Same partial-regression scheme as for the binary variables above
        if j < r[0] - 1:
            linr.fit(z[0][:, :j + 1], yj)
            lambda_cont[j, :j + 2] = np.concatenate([[linr.intercept_],
                                                     linr.coef_])
        else:
            linr.fit(z[0], yj)
            lambda_cont[j] = np.concatenate([[linr.intercept_], linr.coef_])

    ## Identifiability of continuous coefficients
    lambda_cont[:, 1:] = lambda_cont[:, 1:] @ AT[0][0]

    # Determining lambda_categ coefficients
    lambda_categ = []

    for j in range(nb_categ):
        yj = y_categ[:, j]

        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(z[0], yj)

        ## Identifiability of categ coefficients
        beta_j = lr.coef_ @ AT[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_cont'] = lambda_cont
    init['lambda_categ'] = lambda_categ

    return init

Example #20

0

Show file

File: _analysisMain_loan_default_analysis_and_prediction.py Project: SDiamand/sandbox

--------------------------------------------------------------------------------
    Dimensionality reduction and visualization of segmentation for bad/good clients
    Since data is still high in dimensions. We first reduce dimensionality using
    FAMD because data has continuous as well as categorical data.
--------------------------------------------------------------------------------
'''

# FAMD demands categorical variables to be identified as such.
# Work on a copy: the target columns and the category casts below must not be
# written into the feature frame X itself.
df_famd = X.copy()
df_famd['bad_clients'] = y.iloc[:,0]
df_famd['descrStatus'] = y.iloc[:,1]

for col in df_famd.select_dtypes(include=['uint8']).columns:
    df_famd[col] = df_famd[col].astype('category')

# Feature columns only (targets removed); used for both fitting and plotting
# so the projection sees exactly the columns the model was fitted on.
famd_features = df_famd.drop(columns=['bad_clients', 'descrStatus'])

famd = prince.FAMD(n_components=5, n_iter=100, copy=True, check_input=False, engine='sklearn', random_state=1)
famd = famd.fit(famd_features)
famd.explained_inertia_

# The first graph depicts all descrStatus categories for clients, while the second graph compares bad clients
# (i.e. those who either never repaid the loan or paid it after more than 14 days) with the rest of the clients.
# Client type 1: bad clients
#
# From the FAMD analysis it can be seen that the clusters cannot be separated visually in a convenient way. This means
# that when choosing a model one must consider that clients won't segment as neatly as hoped.
fig, ax = plt.subplots(1,2, figsize=(10,8))
famd.plot_row_coordinates(famd_features, x_component=0, y_component=1,
                               color_labels=['Client type {}'.format(t) for t in df_famd['descrStatus']],
                               ellipse_outline=False, ellipse_fill=True,
                               show_points=True, ax=ax[0])

Example #21

0

Show file

 def test_only_numerical(self):
     # Fitting FAMD on numeric columns alone is expected to raise ValueError.
     numeric_only = self.X.select_dtypes(np.number)
     model = prince.FAMD()
     with self.assertRaises(ValueError):
         model.fit(numeric_only)

Example #22

0

Show file

File: test_famd.py Project: yuanmengzhixing/prince

 def test_only_categorical(self):
     # Fitting FAMD on non-numeric columns alone is expected to raise ValueError.
     non_numeric = self.X.select_dtypes(exclude=np.number)
     model = prince.FAMD()
     with self.assertRaises(ValueError):
         model.fit(non_numeric)