def dimension_reduction_famd(self, df):
    """Project the features of ``df`` onto FAMD components and plot the result.

    The FAMD model is cached on ``self.my_FAMD``: when that attribute is
    ``None`` a new model is created and fitted on ``df`` without its ``"Y"``
    column; otherwise the stored model is reused as-is.

    Args:
        df: DataFrame containing the feature columns and a label column
            named ``"Y"`` (the plots split on ``Y == 0`` / ``Y == 1``).

    Returns:
        DataFrame with columns ``FAMD_1`` .. ``FAMD_k`` (``k`` is the column
        count of the incoming ``df``) joined with the original ``"Y"`` column.
    """
    n_comp = df.shape[1]
    component_names = ["FAMD_" + str(position) for position in range(1, n_comp + 1)]

    if self.my_FAMD is not None:
        print("NAOOOOOOOO")
    else:
        self.my_FAMD = prince.FAMD(
            n_components=n_comp,
            n_iter=10,
            copy=True,
            check_input=True,
            engine='auto',
            random_state=42,
        )
        self.my_FAMD = self.my_FAMD.fit(df.drop("Y", axis=1))
        print(self.my_FAMD.explained_inertia_)

    # Debug output: shape of the feature matrix and the model in use.
    print(df.drop("Y", axis=1).shape)
    print(self.my_FAMD)

    projected = self.my_FAMD.transform(df.drop("Y", axis=1))
    projected.columns = component_names
    df = projected.join(df["Y"])
    print("Depois FAMD:")
    print(df.head())

    class_zero = df[df["Y"] == 0]
    class_one = df[df["Y"] == 1]

    # Scatter of the 3rd against the 4th component, coloured by class.
    plt.figure(figsize=(12, 12))
    plt.scatter(class_zero["FAMD_3"], class_zero["FAMD_4"],
                color='red', alpha=0.5, label='0')
    plt.scatter(class_one["FAMD_3"], class_one["FAMD_4"],
                color='blue', alpha=0.5, label='1')
    plt.title("FAMD")
    plt.ylabel('Les coordonnees de Y')
    plt.xlabel('Les coordonnees de X')
    plt.legend()
    plt.show()

    # Distribution of the 3rd component for each class.
    plt.figure(figsize=(12, 12))
    sns.distplot(class_one["FAMD_3"])
    sns.distplot(class_zero["FAMD_3"])
    plt.show()
    return df
def dimension_reduction(X, n_components):
    """Fit a FAMD on ``X`` and return ``X`` projected onto its components.

    Prints a progress message before fitting and the number of columns of
    the projected frame afterwards.

    Args:
        X: data to reduce; passed straight to ``prince.FAMD.fit_transform``.
        n_components: number of FAMD components requested.

    Returns:
        The object returned by ``fit_transform`` (its ``columns`` attribute
        is read for the log message).
    """
    print("Applying FAMD...")
    reducer = prince.FAMD(
        n_components=n_components,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    )
    X = reducer.fit_transform(X)
    # Disabled row-coordinate plot kept from the original for reference:
    # ax = famd.plot_row_coordinates(X,ax=None,figsize=(6, 6),x_component=0,
    #     y_component=1,ellipse_outline=True,ellipse_fill=True,show_points=False)
    # ax.get_figure().savefig('famd_row_coordinates.svg')
    feature_count = len(X.columns)
    print("Number of features after dimension reduction: " + str(feature_count))
    return X
def dim_reduct(df):
    '''
    Fit a 10-component FAMD on the frame and project its rows.

    INPUT: Dataframe
    OUTPUT: Row coordinates of the dataframe on the fitted FAMD components.
            The summed explained inertia is printed as a side effect.
    '''
    model = prince.FAMD(
        n_components=10,
        n_iter=10,
        copy=True,
        engine='auto',
        random_state=42,
    ).fit(df)
    total_inertia = sum(model.explained_inertia_)
    print(total_inertia)
    return model.row_coordinates(df)
def getFAMDData(X, dataType):
    """Return the FAMD projection of ``X`` as a plain array.

    ``X`` is first passed through ``createFAMDDataSets`` (defined elsewhere).
    The number of components is 6 when ``dataType`` is ``'Adult'`` and 8 for
    every other value.

    Args:
        X: raw input handed to ``createFAMDDataSets``.
        dataType: data set name; selects the component count.

    Returns:
        ``.values`` of the frame produced by ``prince.FAMD.transform``.
    """
    # A chartFAMD(X, vals[:, -1], dataType) call (with
    # vals = package.Unprocessed) used to run here and is disabled.
    X = createFAMDDataSets(X, dataType)
    components = 6 if dataType == 'Adult' else 8
    transformer = prince.FAMD(
        n_components=components,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    )
    transformer.fit(X)
    return transformer.transform(X).values
def dim_reduct(df, n_components=10, n_iter=10, random_state=42):
    '''
    Reduce a dataframe with FAMD and return the projected rows.

    The previous docstring described the output as "numerical columns
    scaled"; the function actually returns FAMD row coordinates.

    INPUT:
        df: Dataframe to fit the FAMD on and to project.
        n_components: number of FAMD components to keep (default 10,
            the value that used to be hard-coded).
        n_iter: iteration count forwarded to prince.FAMD (default 10).
        random_state: seed forwarded to prince.FAMD (default 42).
    OUTPUT:
        Row coordinates of df on the fitted components, as returned by
        prince.FAMD.row_coordinates. The sum of the explained inertia of
        the kept components is printed as a side effect.
    '''
    famd = prince.FAMD(
        n_components=n_components,
        n_iter=n_iter,
        copy=True,
        engine='auto',
        random_state=random_state,
    )
    famd = famd.fit(df)
    print(sum(famd.explained_inertia_))
    return famd.row_coordinates(df)
def FAMD(num_components):
    """Append FAMD components to the module-level ``DATA`` frame.

    Missing values in ``DATA`` are replaced by the string ``'None'``; the
    FAMD is fitted on the ``CATEGORICAL_COLS_GOWER + NUMERICAL_COLS_GOWER``
    columns of that filled frame.

    Args:
        num_components: number of FAMD components to compute.

    Returns:
        The filled frame with the component columns concatenated on the
        right; integer column labels ``0 .. num_components - 1`` of the
        coordinates are renamed ``"Component 1"``, ``"Component 2"``, ...
    """
    filled = DATA.fillna('None')
    selected = filled[CATEGORICAL_COLS_GOWER + NUMERICAL_COLS_GOWER]

    model = prince.FAMD(
        n_components=num_components,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=0,
    )
    coordinates = model.fit(selected).row_coordinates(selected)

    new_labels = {}
    for position in range(num_components):
        new_labels[position] = "Component " + str(position + 1)
    coordinates = coordinates.rename(columns=new_labels)

    return pd.concat([filled, coordinates], axis=1)
def preprocessing_famd(self):
    """Clean ``self.df``, fit a 2-component FAMD on it and show the row plot.

    Columns that are entirely NaN and the ``status_id`` column are dropped
    from ``self.df`` in place, so the frame held by the instance is modified
    by this call. The pandas option ``display.max_columns`` is set to
    ``None`` as a side effect.

    Returns:
        Tuple ``(rows, columns, percentage_empty)``. The two counts describe
        the cleaned frame after ``status_published`` became its index;
        ``percentage_empty`` is the fraction of missing cells, measured
        before that re-indexing.
    """
    frame = self.df

    # Both drops act in place on the frame stored on the instance.
    frame.dropna(axis=1, how='all', inplace=True)
    frame.drop(['status_id'], axis=1, inplace=True)

    # Fraction of cells that are missing across the whole frame.
    missing_mask = frame.isnull().values
    percentage_empty = float(np.count_nonzero(missing_mask)) / float(
        missing_mask.size)

    frame = frame.set_index('status_published')

    model = prince.FAMD(
        n_components=2,
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    )
    model = model.fit(frame)

    # Show every column when frames are printed.
    pd.set_option('display.max_columns', None)

    point_colors = list(map('status_type {}'.format, frame['status_type']))
    model.plot_row_coordinates(
        frame,
        ax=None,
        figsize=(3, 3),
        x_component=0,
        y_component=1,
        labels=frame.index,
        color_labels=point_colors,
        ellipse_outline=False,
        ellipse_fill=True,
        show_points=True,
    )
    plt.show()

    row_count, column_count = frame.shape[0], frame.shape[1]
    return row_count, column_count, percentage_empty
def chartFAMD(X, y, dataType):
    """Plot per-component and cumulative explained inertia of a FAMD fit.

    A FAMD with ``X.shape[1] - 2`` components is fitted on ``X``. The
    explained inertia of every component is drawn on the left axis and its
    running sum on a twin right axis. The figure is saved to
    ``'{path}{dataType} FAMD.png'``, where ``path`` is a module-level name
    defined outside this function.

    Changes from the previous version:
      * the figure is closed after saving (``plt.clf()`` only cleared it, so
        every call left one more open figure behind);
      * the ``plt.legend`` call issued before anything was plotted is gone;
        the figure-level legend below is the one that shows both curves;
      * the unused ``transform`` pass and unused locals were removed.

    Args:
        X: data the FAMD is fitted on; ``X.shape[1]`` fixes the component
            count.
        y: unused; kept so existing callers keep working.
        dataType: data set name used in the plot title and the file name.

    Returns:
        None.
    """
    title = '{0} FAMD'.format(dataType)
    componentRange = range(1, X.shape[1] - 1)

    transformer = prince.FAMD(
        n_components=len(componentRange),
        n_iter=3,
        copy=True,
        check_input=True,
        engine='auto',
        random_state=42,
    )
    transformer.fit(X)
    # Debug output carried over from the original implementation.
    print(transformer.row_coordinates(X))

    inertia = transformer.explained_inertia_
    cum = np.cumsum(inertia)

    fig, ax = plt.subplots()
    try:
        ax.set_title('FAMD - {0} data set'.format(dataType))
        ax.plot(componentRange, inertia, label='Variance')
        ax.set_ylabel('Eigenvalue')
        ax.set_xlabel('Components')

        ax2 = ax.twinx()
        ax2.plot(componentRange, cum, linestyle='--',
                 label='Cumulative Variance', color='orange')
        ax2.set_ylabel('Explained Variance')

        # One legend for both axes, anchored to the plot's upper-right corner.
        fig.legend(loc="upper right", bbox_to_anchor=(1, 1),
                   bbox_transform=ax.transAxes)
        fig.tight_layout()
        fig.savefig('{0}{1}.png'.format(path, title))
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
def _split_by_samples(frame, samples):
    """Return the (train, test) rows of ``frame`` named by ``samples``."""
    train_index = samples.iloc[:, :-1].values.flatten()
    test_index = samples.iloc[:, -1].values.flatten()
    return frame.loc[train_index, :], frame.loc[test_index, :]


def _fit_row_coordinates(mapper, data):
    """Fit ``mapper`` on ``data`` and return the row coordinates of ``data``."""
    return pd.DataFrame(mapper.fit(data).row_coordinates(data))


def _l2_normalize_rows(vecs):
    """Scale every row of ``vecs`` to unit L2 norm, keeping column labels."""
    return pd.DataFrame(preprocessing.normalize(vecs, norm='l2', axis=1),
                        columns=vecs.columns)


def Preprocess(data_frame, target = None, method = 'FAMD', samples = None, mapper = None, num_components = 3, scaler = None):
    """Turn a mixed-type frame into numeric vectors plus one-hot targets.

    Changes from the previous version:
      * 'Dummy' without ``samples`` and without ``scaler`` referenced the
        undefined name ``train_data`` (NameError); it now uses the columns
        of the frame being scaled.
      * 'PCA' and 'Dummy' deleted the target column from the caller's
        ``data_frame``; the input frame is no longer modified.
      * ``target``, ``mapper`` and ``scaler`` are compared against ``None``
        instead of being tested for truthiness, so a column labelled ``0``
        can be the target.
      * bare ``except:`` clauses became ``except Exception:``.
      * an unknown ``method`` raises ``ValueError`` instead of silently
        returning ``None``.

    Args:
        data_frame: input DataFrame containing the features and the target.
        target: label of the target column; defaults to the last column.
        method: ``'FAMD'``, ``'PCA'`` or ``'Dummy'``.
        samples: optional DataFrame of row labels of ``data_frame``. All
            columns but the last hold the training rows, the last column
            holds the test rows. When ``None`` the whole frame is processed
            as one set.
        mapper: optional prince FAMD/PCA object to use instead of creating
            one ('FAMD' and 'PCA' only). It is (re)fitted by this function.
        num_components: component count for a newly created mapper.
        scaler: optional scaler exposing ``transform``/``fit_transform``
            ('Dummy' only). It is refitted if ``transform`` fails.

    Returns:
        With ``samples``: ``(vecs_train, train_target, vecs_test,
        test_target, fitted, target)``. Without ``samples``: ``(vecs,
        targets, fitted, target)``. ``fitted`` is the mapper for
        'FAMD'/'PCA' and the scaler for 'Dummy'. FAMD vectors are
        L2-normalised per row; PCA vectors are returned as computed.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    # If no target is supplied, use the last column of the frame.
    if target is None:
        target = data_frame.columns.values.tolist()[-1]

    if method == 'Dummy':
        print('Dummy is not functionning proberly.')

    if method == 'FAMD':
        if mapper is None:
            mapper = pr.FAMD(
                n_components=num_components,
                n_iter=100,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=None,
            )

        if samples is None:
            # No samples supplied: process the entire data set as one.
            test_target = pd.get_dummies(data_frame[target]).astype('float64')
            test_data = data_frame.drop(columns=[target])
            vecs_test = _l2_normalize_rows(
                _fit_row_coordinates(mapper, test_data))
            return vecs_test, test_target, mapper, target

        train_data, test_data = _split_by_samples(data_frame, samples)
        # One-hot labels are built per split, so a class missing from one
        # split yields fewer label columns for that split.
        train_target = pd.get_dummies(train_data[target]).astype('float64')
        test_target = pd.get_dummies(test_data[target]).astype('float64')
        train_data = train_data.drop(columns=[target])
        test_data = test_data.drop(columns=[target])

        vecs_train = _fit_row_coordinates(mapper, train_data)
        # NOTE(review): the mapper is refitted on the test split, so the
        # train and test vectors come from two different fits and the
        # returned mapper reflects the test fit. Kept as-is because changing
        # it alters results; confirm whether projecting the test split with
        # the train fit was intended.
        vecs_test = _fit_row_coordinates(mapper, test_data)

        vecs_train = _l2_normalize_rows(vecs_train)
        vecs_test = _l2_normalize_rows(vecs_test)
        return vecs_train, train_target, vecs_test, test_target, mapper, target

    if method == 'PCA':
        # PCA only works with numerical data, hence the dummy encoding below.
        if mapper is None:
            mapper = pr.PCA(
                n_components=num_components,
                n_iter=100,
                rescale_with_mean=True,
                rescale_with_std=True,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=None,
            )

        labels = pd.get_dummies(data_frame[target]).astype('float64')
        # drop() returns a new frame, leaving the caller's frame intact.
        encoded = pd.get_dummies(
            data_frame.drop(columns=[target])).astype('float64')

        if samples is None:
            vecs_test = _fit_row_coordinates(mapper, encoded)
            return vecs_test, labels, mapper, target

        train_data, test_data = _split_by_samples(encoded, samples)
        train_target, test_target = _split_by_samples(labels, samples)
        vecs_train = _fit_row_coordinates(mapper, train_data)
        # NOTE(review): same refit-on-test behaviour as the FAMD branch.
        vecs_test = _fit_row_coordinates(mapper, test_data)
        return vecs_train, train_target, vecs_test, test_target, mapper, target

    if method == 'Dummy':
        labels = pd.get_dummies(data_frame[target]).astype('float64')
        encoded = pd.get_dummies(
            data_frame.drop(columns=[target])).astype('float64')

        if samples is None:
            test_data = encoded
            if scaler is None:
                scaler = preprocessing.StandardScaler()
                vecs_test = pd.DataFrame(scaler.fit_transform(test_data),
                                         columns=test_data.columns)
            else:
                try:
                    vecs_test = pd.DataFrame(scaler.transform(test_data),
                                             columns=test_data.columns)
                except Exception:
                    # The supplied scaler does not fit this data (e.g. it
                    # was fitted on a different set of dummy columns).
                    vecs_test = pd.DataFrame(scaler.fit_transform(test_data),
                                             columns=test_data.columns)
            return vecs_test, labels, scaler, target

        train_data, test_data = _split_by_samples(encoded, samples)
        train_target, test_target = _split_by_samples(labels, samples)

        if scaler is None:
            scaler = preprocessing.StandardScaler()
            vecs_train = pd.DataFrame(scaler.fit_transform(train_data),
                                      columns=train_data.columns)
            vecs_test = pd.DataFrame(scaler.transform(test_data),
                                     columns=test_data.columns)
        else:
            # NOTE (from the original author): a supplied scaler fitted on
            # data with another dimensionality cannot be reused; dummy
            # encoding yields different columns when classes are missing or
            # new. The fallback below refits on the training split.
            try:
                vecs_train = pd.DataFrame(scaler.transform(train_data),
                                          columns=train_data.columns)
                vecs_test = pd.DataFrame(scaler.transform(test_data),
                                         columns=test_data.columns)
            except Exception:
                vecs_train = pd.DataFrame(scaler.fit_transform(train_data),
                                          columns=train_data.columns)
                vecs_test = pd.DataFrame(scaler.transform(test_data),
                                         columns=test_data.columns)
        return vecs_train, train_target, vecs_test, test_target, scaler, target

    raise ValueError(
        "Unknown method {!r}; expected 'FAMD', 'PCA' or 'Dummy'.".format(method))
id_cluster_df.head() # + hidden=true id_dummies_df = pd.get_dummies(id_cluster_df) id_dummies_df.head() # + hidden=true id_dummies_df.shape # + hidden=true import prince famd = prince.FAMD( n_components=39, n_iter=10, copy=True, check_input=True, engine='auto', ## Can be "auto", 'sklearn', 'fbpca' random_state=42) ## Fit FAMD object to data famd = famd.fit(id_cluster_df) ## Exclude target variable "Churn" famd_data = famd.row_coordinates(id_cluster_df) # + hidden=true np.sum(famd.explained_inertia_) # + hidden=true # scaler = StandardScaler() # scaled_data = scaler.fit_transform(id_dummies_df)
# Remove the 'income' label column from the test features.
test_data = test_data.drop(columns=['income'])

# Encode the 'income' column of the full data set with encode_categorical_S
# (defined elsewhere; presumably returns the encoded labels plus the
# feature/category names - verify against its definition), then select the
# rows of each split by index label.
to_encode = data_frame['income'].copy()
labels, fetures = encode_categorical_S(to_encode)
train_target = pd.DataFrame(labels.loc[train_data.index.values], columns=['income'])
test_target = pd.DataFrame(labels.loc[test_data.index.values], columns=['income'])
#%%
# Use FAMD (Factor Analysis for Mixed Data) to reduce the dimensions of the
# data set and convert categorical data to numeric form.
''' Rename test to valid'''
famd = pr.FAMD(n_components=5, n_iter=10, copy=True, check_input=True, engine='auto', random_state=None)
famd_train = famd.fit(train_data)
vecs_train = famd_train.row_coordinates(train_data)
# NOTE(review): the same `famd` object is fitted a second time, now on the
# test split, so vecs_test comes from a different fit than vecs_train.
# Presumably `fit` returns the estimator itself, in which case famd_train and
# famd_test are one object holding the test fit from here on - confirm
# whether projecting test_data with the train fit was intended.
famd_test = famd.fit(test_data)
vecs_test = famd_test.row_coordinates(test_data)
# Standardise the component vectors: the scaler is fitted on the training
# vectors and then applied unchanged to the test vectors.
scaler = preprocessing.StandardScaler()
vecs_train = pd.DataFrame(scaler.fit_transform(vecs_train), columns=vecs_train.columns)
vecs_test = pd.DataFrame(scaler.transform(vecs_test), columns=vecs_test.columns)
#%% Model
def main(): st.title("Hepatocellular Carcinoma Clustering Model") url = "https://raw.githubusercontent.com/taylor-m/hcc_clustering/main/hcc_data/hcc-data.csv" raw = pd.read_csv(url) cols = [ "Gender", "Symptoms", "Alcohol", "HBsAg", "HBeAg", "HBcAb", "HCVAb", "Cirrhosis", "Endemic", "Smoking", "Diabetes", "Obesity", "Hemochro", "AHT", "CRI", "HIV", "NASH", "Varices", "Spleno", "PHT", "PVT", "Metastasis", "Hallmark", "Age", "Grams_day", "Packs_year", "PS", "Encephalopathy", "Ascites", "INR", "AFP", "Hemoglobin", "MCV", "Leucocytes", "Platelets", "Albumin", "Total_Bil", "ALT", "AST", "GGT", "ALP", "TP", "Creatinine", "Nodule", "Major_Dim", "Dir_Bil", "Iron", "Sat", "Ferritin", "Class", ] st.set_option('deprecation.showPyplotGlobalUse', False) #========================================================================= # DATA FUNCTIONS def load_data(): df = pd.read_csv(url, names=cols) # changing the ? input values to np.nan for the imputer df[df == "?"] = np.nan # using the KNN imputer to impute the missing values imputer = KNNImputer(missing_values=np.nan) imputed = imputer.fit_transform(df) # creating a new df from the imputed array df = pd.DataFrame(imputed, columns=cols) cats = [ "Gender", "Symptoms", "Alcohol", "HBsAg", "HBeAg", "HBcAb", "HCVAb", "Cirrhosis", "Endemic", "Smoking", "Diabetes", "Obesity", "Hemochro", "AHT", "CRI", "HIV", "NASH", "Varices", "Spleno", "PHT", "PVT", "Metastasis", "Hallmark", "Class", "PS", "Encephalopathy", "Ascites", "Nodule", ] # rounding all the values in cat columns because imputed values weren't binary for cat in cats: df[cat] = round(df[cat]).astype(int) return df def scale_df(df): # scale data for cluster scaler = StandardScaler() # looking at df with target var first X_scaled = scaler.fit_transform(df) X_scaled = pd.DataFrame(X_scaled, columns=df.columns, index=df.index) return X_scaled def create_dmat(scaled): # distance matrix dist = pdist(scaled, metric="cosine") dmat = squareform(dist) return dmat def kmed_cluster(df, k): # 
generate k random indices from distance matrix df2 = df.drop(columns="Class") X_scaled = scale_df(df2) dmat = create_dmat(X_scaled) np.random.seed(42) n_rows = dmat.shape[0] init_medoids = np.random.randint(0, n_rows, k) # init_medoids kmed = kmedoids(dmat, initial_index_medoids=init_medoids, data_type="distance_matrix") kmed.process() clusters = kmed.get_clusters() medoid_idxs = kmed.get_medoids() # medoid_idxs labels = kmed.predict(dmat) df["kmed"] = labels # print(df.kmed.value_counts()) # group_df = df.groupby("kmed").mean().sort_values("Class").style.background_gradient() # casting kmed clusters to strings df.kmed = df.kmed.astype(str) # reordering cluster numbers by mortality rate df.loc[(df.kmed == "3"), "kmed"] = 4 df.loc[(df.kmed == "0"), "kmed"] = 2 df.loc[(df.kmed == "1"), "kmed"] = 3 df.loc[(df.kmed == "4"), "kmed"] = 1 df.loc[(df.kmed == "2"), "kmed"] = 0 group_df = df.groupby("kmed").mean().style.background_gradient() # counts = df.kmed.value_counts().index.sort_values(ascending=False) # group_df["count"] = counts return df, group_df, clusters, dmat #========================================================================= # load data and impute missing values df = load_data() # run K-medoid cluster model w/ K = 5 df, group_df, clusters, dmat = kmed_cluster(df, 5) st.sidebar.title("Model") #========================================================================= cols = [ "Gender", "Symptoms", "Alcohol", "HBsAg", "HBeAg", "HBcAb", "HCVAb", "Cirrhosis", "Endemic", "Smoking", "Diabetes", "Obesity", "Hemochro", "AHT", "CRI", "HIV", "NASH", "Varices", "Spleno", "PHT", "PVT", "Metastasis", "Hallmark", "Age", "Grams_day", "Packs_year", "PS", "Encephalopathy", "Ascites", "INR", "AFP", "Hemoglobin", "MCV", "Leucocytes", "Platelets", "Albumin", "Total_Bil", "ALT", "AST", "GGT", "ALP", "TP", "Creatinine", "Nodule", "Major_Dim", "Dir_Bil", "Iron", "Sat", "Ferritin", "Class", ] bin_cols = [ "Gender", "Symptoms", "Alcohol", "HBsAg", "HBeAg", "HBcAb", "HCVAb", 
"Cirrhosis", "Endemic", "Smoking", "Diabetes", "Obesity", "Hemochro", "AHT", "CRI", "HIV", "NASH", "Varices", "Spleno", "PHT", "PVT", "Metastasis", "Hallmark", "Class", ] yes_no = [ "Symptoms", "Alcohol", "HBsAg", "HBeAg", "HBcAb", "HCVAb", "Cirrhosis", "Endemic", "Smoking", "Diabetes", "Obesity", "Hemochro", "AHT", "CRI", "HIV", "NASH", "Varices", "Spleno", "PHT", "PVT", "Metastasis", "Hallmark", ] num_cols = [ "Age", "Grams_day", "Packs_year", "INR", "AFP", "Hemoglobin", "MCV", "Leucocytes", "Platelets", "Albumin", "Total_Bil", "ALT", "AST", "GGT", "ALP", "TP", "Creatinine", "Major_Dim", "Dir_Bil", "Iron", "Sat", "Ferritin", ] cat_cols = ["PS", "Encephalopathy", "Ascites", "Nodule"] #========================================================================= analysis_vars = [ "Iron", "Ferritin", "Dir_Bil", "Nodule", "GGT", "ALP", "Total_Bil", "Albumin", "Platelets", "Hemoglobin", "Gender", "Symptoms", "Alcohol", "HBsAg", "HBeAg", "HBcAb", "HCVAb", "Cirrhosis", "Endemic", "Smoking", "Diabetes", "Obesity", "Hemochro", "AHT", "CRI", "HIV", "NASH", "Varices", "Spleno", "PHT", "PVT", "Metastasis", "Hallmark", "Age", "Grams_day", "Packs_year", "PS", "Encephalopathy", "Ascites", "INR", "AFP", "MCV", "Leucocytes", "ALT", "AST", "TP", "Creatinine", "Major_Dim", "Sat", "Class", ] #========================================================================= num_vars = [ "Age", "Grams_day", "Packs_year", "INR", "AFP", "Hemoglobin", "MCV", "Leucocytes", "Platelets", "Albumin", "Total_Bil", "ALT", "AST", "GGT", "ALP", "TP", "Creatinine", "Major_Dim", "Dir_Bil", "Iron", "Sat", "Ferritin", "Nodule", "PS", "Encephalopathy", "Ascites", ] #========================================================================= cat_vars = [ "Gender", "Symptoms", "Alcohol", "HBsAg", "HBeAg", "HBcAb", "HCVAb", "Cirrhosis", "Endemic", "Smoking", "Diabetes", "Obesity", "Hemochro", "AHT", "CRI", "HIV", "NASH", "Varices", "Spleno", "PHT", "PVT", "Metastasis", "Hallmark", "Class", ] 
#========================================================================= var_dict = { "Gender": ["0=female|1=male"], "HBsAg": [ "Hepatitis B surface Antigen\nHBsAg is the surface antigen of the hepatitis B virus. It indicates current hepatitis B infection.[10] An antigen is a protein that stimulates an immune system response, causing your body to produce antibodies to fight invaders. In hepatitis B, it's common to test for the hepatitis B surface antigen (HBsAg) and hepatitis B core antigen (HBcAg). These antigens are attached to the inside and the outside of the virus.[11]" ], "HBeAg": [ "Hepatitis B e Antigen\nHBeAg stands for hepatitis B e-antigen. This antigen is a protein from the hepatitis B virus that circulates in infected blood when the virus is actively replicating. The presence of HBeAg suggests that the person is infectious and is able to spread the virus to other people.[9]" ], "HBcAb": [ "Hep B core Antibody\nTotal hepatitis B core antibody (anti-HBc): Appears at the onset of symptoms in acute hepatitis B and persists for life. The presence of anti-HBc indicates previous or ongoing infection with hepatitis B virus in an undefined time frame. IgM antibody to hepatitis B core antigen (IgM anti-HBc)[8]" ], "HCVAb": [ "Hep C Virus Antibody\nWhat is hepatitis C virus antibody? A reactive or positive antibody test means you have been infected with the hepatitis C virus at some point in time. Once people have been infected, they will always have antibodies in their blood. This is true if they have cleared the virus, have been cured, or still have the virus in their blood.[7]" ], "Endemic": ["Endemic Countries"], "Hemochro": [ "Hemochromatosis\nHemochromatosis is the abnormal accumulation of iron in parenchymal organs, leading to organ toxicity. 
It is the most common autosomal recessive genetic disorder and the most common cause of severe iron overload.\nClinical manifestations of hemochromatosis include the following:\n\tLiver disease (hepatomegaly, 13%; cirrhosis, 13%, usually late in the disease)[6]" ], "AHT": ["Arterial Hypertension"], "CRI": [ "Chronic Renal Insufficiency\nRenal insufficiency is poor function of the kidneys that may be due to a reduction in blood-flow to the kidneys caused by renal artery disease." ], "HIV": ["Human Immunodeficiency Virus"], "NASH": [ "Nonalcoholic Steatohepatitis\nThe term nonalcoholic steatohepatitis (NASH) was first coined by Dr. Ludwig 3 decades ago to describe a unique entity characterized by fatty changes with lobular hepatitis in the absence of a history of alcoholism\nAt present, nonalcoholic fatty liver disease (NAFLD) has become the most common liver disease in the United States and, indeed, worldwide." ], "Varices": [ "Esophageal Varices\nEsophageal varices are enlarged veins in the lower esophagus. They're often due to obstructed blood flow through the portal vein, which carries blood from the intestine and spleen to the liver. Esophageal varices are abnormal, enlarged veins in the tube that connects the throat and stomach (esophagus)[5]" ], "Spleno": [ "Splenomegaly\nSplenomegaly is a condition that occurs when your spleen becomes enlarged. It's also commonly referred to as enlarged spleen or spleen enlargement. The spleen is a part of your lymphatic system. It helps the immune system by storing white blood cells and helping in the creation of antibodies.[3]\nMany conditions — including infections, liver disease and some cancers — can cause an enlarged spleen.[4]" ], "PHT": [ "Portal Hypertension\nPortal hypertension is an increase in the pressure within the portal vein, which carries blood from the digestive organs to the liver. 
The most common cause is cirrhosis of the liver, but thrombosis (clotting) might also be the cause.[2]" ], "PVT": [ "Portal Vein Thrombosis\nPortal vein thrombosis is blockage or narrowing of the portal vein (the blood vessel that brings blood to the liver from the intestines) by a blood clot.\nFluid accumulation in the abdomen (called ascites) is not common. But it may develop when people also have liver congestion (backup of blood in the liver) or liver damage, such as severe scarring of the liver (cirrhosis), or when large amounts of fluids are given intravenously to treat massive bleeding from ruptured varicose veins in the esophagus or stomach. If portal vein thrombosis develops in people with cirrhosis, their condition deteriorates.[1]" ], "Metastasis": ["Liver Metastasis"], "Hallmark": ["Radiological Hallmark"], "Age": ["Age at diagnosis"], "Grams_day": ["Grams of Alcohol per day"], "Packs_year": ["Packs of cigarets per year"], "PS": [ "Performance Status:\n0=Active\n1=Restricted\n2=Ambulatory\n3=Selfcare\n4=Disabled\n5=Dead" ], "Encephalopathy": ["Encephalopathy degree:\n1=None\n2=Grade I/II\n3=Grade III/IV"], "Ascites": ["Ascites degree:\n1=None\n2=Mild\n3=Moderate to Severe"], "INR": [ "International Normalised Ratio. This blood test looks to see how well your blood clots.\n\nThe international normalized ratio (INR) is a standardized number that's figured out in the lab. If you take blood thinners, also called anti-clotting medicines or anticoagulants, it may be important to check your INR. The INR is found using the results of the prothrombin time (PT) test. This measures the time it takes for your blood to clot. The INR is an international standard for the PT.", ], "AFP": [ "Alpha-Fetoprotein; An AFP tumor marker test is a blood test that measures the levels of AFP in adults. Tumor markers are substances made by cancer cells or by normal cells in response to cancer in the body. 
High levels of AFP can be a sign of liver cancer or cancer of the ovaries or testicles, as well as noncancerous liver diseases such as cirrhosis and hepatitis.\n\nHigh AFP levels don't always mean cancer, and normal levels don't always rule out cancer. So an AFP tumor marker test is not usually used by itself to screen for or diagnose cancer. But it can help diagnose cancer when used with other tests. The test may also be used to help monitor the effectiveness of cancer treatment and to see if cancer has returned after you've finished treatment.", ], "Hemoglobin": [ "Hemoglobin is the protein molecule in red blood cells that carries oxygen from the lungs to the body's tissues and returns carbon dioxide from the tissues back to the lungs. Higher than normal hemoglobin levels can be seen in people living at high altitudes and in people who smoke and infrequently with certain tumors.", ], "MCV": [ "Mean Corpuscular Volume; An MCV blood test measures the average size of your red blood cells. Larger than normal RBCs may indicate liver disease.", ], "Leucocytes": [ "white blood cells", ], "Platelets": [ "Platelets, or thrombocytes, are small, colorless cell fragments in our blood that form clots and stop or prevent bleeding.", ], "Albumin": [ "Albumin is a protein made by your liver. Albumin helps keep fluid in your bloodstream so it doesn't leak into other tissues. It is also carries various substances throughout your body, including hormones, vitamins, and enzymes. Low albumin levels can indicate a problem with your liver or kidneys.", ], "Total_Bil": [ "Total Bilirubin; This is a blood test that measures the amount of a substance called bilirubin. This test is used to find out how well your liver is working. It is often part of a panel of tests that measure liver function. 
A small amount of bilirubin in your blood is normal, but a high level may be a sign of liver disease.", ], "ALT": [ "Alanine aminotransferase (ALT) is an enzyme found mostly in the cells of the liver and kidney. Much smaller amounts of it are also found in the heart and muscles. Normally, ALT levels in blood are low, but when the liver is damaged, ALT is released into the blood and the level increases.", ], "AST": [ "Aspartate aminotransferase (AST) is an enzyme found in cells throughout the body but mostly in the heart and liver and, to a lesser extent, in the kidneys and muscles. In healthy individuals, levels of AST in the blood are low. When liver or muscle cells are injured, they release AST into the blood.", ], "GGT": [ "Gamma glutamyl transferase (GGT) is an enzyme found in cell membranes of many tissues mainly in the liver, kidney, and pancreas. [1] It is also found in other tissues including intestine, spleen, heart, brain, and seminal vesicles. The highest concentration is in the kidney, but the liver is considered the source of normal enzyme activity.", ], "ALP": [ "Alkaline phosphatase; The alkaline phosphatase test (ALP) is used to help detect liver disease or bone disorders. It is often ordered along with other tests, such as a gamma-glutamyl transferase (GGT) test and/or as part of a liver panel. In conditions affecting the liver, damaged liver cells release increased amounts of ALP into the blood.", ], "TP": [ "The total protein test measures the total amount of two classes of proteins found in the fluid portion of your blood. These are albumin and globulin. Proteins are important parts of all cells and tissues. Albumin helps prevent fluid from leaking out of blood vessels. Low levels can be indicative of liver disease.", ], "Creatinine": [ "Creatinine is critically important in assessing renal function because it has several interesting properties. 
In blood, it is a marker of glomerular filtration rate;", ], "Nodule": [ "Number of Nodules", ], "Major_Dim": [ "Major dimension of nodule", ], "Dir_Bil": [ "Direct Bilirubin; Bilirubin is a tetrapyrrole and a breakdown product of heme catabolism. Most bilirubin (70%-90%) is derived from hemoglobin degradation and, to a lesser extent, from other hemo proteins. In the serum, bilirubin is usually measured as both direct bilirubin (DBil) and total-value bilirubin", ], "Iron": [ "The amount of circulating iron bound to transferrin is reflected by the serum iron level.", ], "Sat": [ "Oxygen Saturation", ], "Ferritin": [ "Ferritin is the cellular storage protein for iron. It is present in small concentrations in blood, and the serum ferritin concentration normally correlates well with total-body iron stores, making its measurement important in the diagnosis of disorders of iron metabolism.", ], "Class": ["1=lives\n0=dies\n@ 1 year"], } #========================================================================= plot_dict = { "Gender": "0=female|1=male", "HBsAg": "Hepatitis B surface Antigen", "HBeAg": "Hepatitis B e Antigen", "HBcAb": "Hep B core Antibody", "HCVAb": "Hep C Virus Antibody", "Endemic": "Endemic Countries", "Hemochro": "Hemochromatosis", "AHT": "Arterial Hypertension", "CRI": "Chronic Renal Insufficiency", "HIV": "Human Immunodeficiency Virus", "NASH": "Nonalcoholic Steatohepatitis", "Varices": "Esophageal Varices", "Spleno": "Splenomegaly", "PHT": "Portal Hypertension", "PVT": "Portal Vein Thrombosis", "Metastasis": "Liver Metastasis", "Hallmark": "Radiological Hallmark", "Age": "Age at diagnosis", "Grams_day": "Grams of Alcohol per day", "Packs_year": "Packs of cigarets per year", "PS": "Performance Status:\n0=Active\n1=Restricted\n2=Ambulatory\n3=Selfcare\n4=Disabled\n5=Dead", "Encephalopathy": "Encephalopathy degree:\n1=None\n2=Grade I/II\n3=Grade III/IV", "Ascites": "Ascites degree:\n1=None\n2=Mild\n3=Moderate to Severe", "INR": "International Normalised 
Ratio", "AFP": "Alpha-Fetoprotein (ng/mL)", "Hemoglobin": "(g/dL)", "MCV": "Mean Corpuscular Volume (fl)", "Leukocytes": "(G/L)", "Platelets": "(G/L)", "Albumin": "(mg/dL)", "Total_Bil": "Total Bilirubin (mg/dL)", "ALT": "Alanine transaminase (U/L)", "AST": "Aspartate transaminase (U/L)", "GGT": "Gamma glutamyl transferase (U/L)", "ALP": "Alkaline phosphatase (U/L)", "TP": "Total Proteins (g/dL)", "Creatinine": "(mg/dL)", "Nodules": "Number of Nodules", "Major Dim": "Major dimension of nodule (cm)", "Dir Bil": "Direct Bilirubin (mg/dL)", "Iron": "(mcg/dL)", "Sat": "Oxygen Saturation (%)", "Ferritin": "(ng/mL)", "Class": "1=lives\n0=dies\n@ 1 year", } #========================================================================= cats = [ "Gender", "Symptoms", "Alcohol", "HBsAg", "HBeAg", "HBcAb", "HCVAb", "Cirrhosis", "Endemic", "Smoking", "Diabetes", "Obesity", "Hemochro", "AHT", "CRI", "HIV", "NASH", "Varices", "Spleno", "PHT", "PVT", "Metastasis", "Hallmark", "Class", "PS", "Encephalopathy", "Ascites", "Nodule", ] #========================================================================= lab_values = { "INR": [ 0, 1.1, "", ], "AFP": [ 0, 10, "ng/mL", ], "Hemoglobin": [ 12, 18, "g/dL", ], "MCV": [ 80, 100, "fl", ], "Leucocytes": [ 4, 11, "G/L", ], "Platelets": [ 150000, 450000, "G/L", ], "Albumin": [ 3.4, 5.4, "mg/dL", ], "Total_Bil": [ 0, 1, "mg/dL", ], "ALT": [ 29, 33, "U/L", ], "AST": [ 0, 35, "U/L", ], "GGT": [ 5, 40, "U/L", ], "ALP": [ 44, 147, "U/L", ], "TP": [ 6, 8.3, "g/dL", ], "Creatinine": [ 0.5, 1.2, "mg/dL", ], "Dir_Bil": [ 0.1, 0.3, "mg/dL", ], "Iron": [ 60, 180, "mcg/dL", ], "Sat": [ 95, 100, "%", ], "Ferritin": [ 10, 300, "ng/mL", ], } #========================================================================= # CLUSTERING FUNCTIONS def kmed_predict(df, X, k=5): # generate k random indices from distance matrix # df = df.drop(columns="Class") # X = X.drop(columns="kmed") # X = pd.DataFrame(X, columns=df.columns) df = df.append(X) X_scaled = scale_df(df) 
dmat = create_dmat(X_scaled) np.random.seed(42) n_rows = dmat.shape[0] init_medoids = np.random.randint(0, n_rows, k) # init_medoids kmed = kmedoids(dmat, initial_index_medoids=init_medoids, data_type="distance_matrix") kmed.process() medoid_idxs = kmed.get_medoids() # medoid_idxs labels = kmed.predict(dmat) df["kmed"] = labels # print(df.kmed.value_counts()) # group_df = df.groupby("kmed").mean().sort_values("Class").style.background_gradient() # casting kmed clusters to strings df.kmed = df.kmed.astype(str) # reordering cluster numbers by mortality rate df.loc[(df.kmed == "3"), "kmed"] = 4 df.loc[(df.kmed == "0"), "kmed"] = 2 df.loc[(df.kmed == "1"), "kmed"] = 3 df.loc[(df.kmed == "4"), "kmed"] = 1 df.loc[(df.kmed == "2"), "kmed"] = 0 group_df = df.groupby("kmed").mean().style.background_gradient() # counts = df.kmed.value_counts().index.sort_values(ascending=False) # group_df["count"] = counts output = df.head(-1) return df # PLOTTING FUNCTIONS def plot_boxplot(var): fig = go.Figure() # for i in range(df.kmed.unique()): fig.add_trace( go.Box( y=df[var], x=df.kmed, boxpoints=False, # no data points # marker_color='rgb(9,56,125)', # line_color='rgb(9,56,125)' )) # add min and max range lines for lab values if var in lab_values.keys(): fig.update_layout( title=f"{var} Values of Risk Clusters", xaxis_title="Risk Clusters", yaxis_title=f"{var} values ({lab_values[var][2]})") fig.add_hrect(y0=lab_values[var][0], y1=lab_values[var][1], line_width=0, fillcolor="green", opacity=0.2) else: fig.update_layout(title=f"{var} Values of Risk Clusters", xaxis_title="Risk Clusters", yaxis_title=f"{var}") st.plotly_chart(fig) def plot_violin(var): fig = go.Figure() fig.add_trace(go.Violin( y=df[var], x=df.kmed, )) fig.update_traces(meanline_visible=True) # add min and max range lines for lab values if var in lab_values.keys(): fig.update_layout( title=f"{var} Values of Risk Clusters", xaxis_title="Risk Clusters", yaxis_title=f"{var} values ({lab_values[var][2]})") 
fig.add_hrect(y0=lab_values[var][0], y1=lab_values[var][1], line_width=0, fillcolor="green", opacity=0.2) else: fig.update_layout(title=f"{var} Values of Risk Clusters", xaxis_title="Risk Clusters", yaxis_title=f"{var}") st.plotly_chart(fig) def plot_barplot(var): fig = go.Figure(data=[]) if var in lab_values.keys() or var in cat_cols: for val in df[var].unique(): fig.add_trace( go.Bar(name=f"{var} = {val}", x=df.kmed, y=df[df[var] == val][var])) fig.update_layout(barmode='stack') elif var == "Gender": fig.add_trace( go.Bar(name="female", x=df.kmed, y=(df["Gender"] == 1))) fig.add_trace(go.Bar(name="male", x=df.kmed, y=(df["Gender"] == 0))) else: fig.add_trace(go.Bar(name="No", x=df.kmed, y=(df[var] == 1))) fig.add_trace(go.Bar(name="Yes", x=df.kmed, y=(df[var] == 0))) fig.update_layout(title=f"{var} by Risk Cluster", xaxis_title="Risk Cluster", yaxis_title=f"{var}") st.plotly_chart(fig) def plot_hist(var, cluster_num): cluster_df = df[df.kmed == cluster_num] fig = px.histogram(cluster_df, x=var) st.plotly_chart(fig) #========================================================================= option = st.sidebar.selectbox( "Model Options", ("Objective", "Data", "Cluster Analysis", "Cluster Predict", "Source")) #========================================================================= if option == "Data": st.subheader("Dataset") if st.sidebar.checkbox("full data", False): st.write(df) else: st.write(df.head(10)) st.write(f"Number of samples: {df.shape[0]}") # st.write() st.subheader("Variables") st.write(""" \n1. Gender \n\t\t0=female|1=male \n2. Symptoms \n3. Alcohol \n4. HBsAg - Hep B surface Antigen \n5. HBeAg - Hep B e Antigen \n6. HBcAb - Hep B core Antibody \n7. HCVAb - Hep C Virus Antibody \n8. Cirrhosis \n9. Endemic Countries \n10. Smoking \n11. Diabetes \n12. Obesity \n13. Hemochromatosis \n14. AHT - Arterial Hypertension \n15. CRI - Chronic Renal Insufficiency \n16. HIV - Human Immunodeficiency Virus \n17. NASH - Nonalcoholic Steatohepatitis \n18. 
Varices - Esophageal Varices \n19. Spleno - Splenomegaly \n20. PHT - Portal Hypertension \n21. PVT - Portal Vein Thrombosis \n22. Metastasis - Liver Metastasis \n23. Hallmark - Radiological Hallmark \n24. Age - Age at diagnosis \n25. Grams/day - Grams of Alcohol per day \n26. Packs/year - Packs of cigarets per year \n27. PS - Performance Status \n\t\t[0=Active;1=Restricted;2=Ambulatory;3=Selfcare;4=Disabled;5=Dead] \n28. Encephalopathy - Encephalopathy degree \n\t\t[1=None;2=Grade I/II; 3=Grade III/IV] \n29. Ascites - Ascites degree \n\t\t[1=None;2=Mild;3=Moderate to Severe] \n30. INR - International Normalised Ratio \n31. AFP - Alpha-Fetoprotein (ng/mL) \n32. Hemoglobin (g/dL) \n33. MCV - Mean Corpuscular Volume (fl) \n34. Leukocytes(G/L) \n35. Platelets (G/L) \n36. Albumin (mg/dL) \n37. Total Bilirubin(mg/dL) \n38. ALT - Alanine transaminase (U/L) \n39. AST - Aspartate transaminase (U/L) \n40. GGT - Gamma glutamyl transferase (U/L) \n41. ALP - Alkaline phosphatase (U/L) \n42. TP - Total Proteins (g/dL) \n43. Creatinine (mg/dL) \n44. Nodules - Number of Nodules \n45. Major Dim - Major dimension of nodule (cm) \n46. Dir Bil - Direct Bilirubin (mg/dL) \n47. Iron (mcg/dL) \n48. Sat - Oxygen Saturation (%) \n49. Ferritin (ng/mL) \n50. Class (1=lives;0=dies) @ 1 year """) #========================================================================= if option == "Source": st.write(""" Data Set Name: \nHepatocellular Carcinoma Dataset (HCC dataset) \n\nAbstract: \nHepatocellular Carcinoma dataset (HCC dataset) was collected at a University Hospital in Portugal. It contains real clinical data of 165 patients diagnosed with HCC. \n\nDonors: \nMiriam Seoane Santos ([email protected]) and Pedro Henriques Abreu ([email protected]), Department of Informatics Engineering, Faculty of Sciences and Technology, University of Coimbra . 
Armando Carvalho ([email protected]) and Adélia Simão ([email protected]), Internal Medicine Service, Hospital and University Centre of Coimbra \n\nData Type: Multivariate \nTask: Classification, Regression, Clustering, Casual Discovery \nAttribute Type: Categorical, Integer and Real \n\nArea: Life Sciences \n\nFormat Type: Matrix \n\nMissing values: Yes \n\nInstances and Attributes: \nNumber of Instances (records in your data set): 165 \nNumber of attributes (fields within each record): 49 \n\nRelevant Information: \nHCC dataset was obtained at a University Hospital in Portugal and contais several demographic, risk factors, laboratory and overall survival features of 165 real patients diagnosed with HCC. The dataset contains 49 features selected according to the EASL-EORTC (European Association for the Study of the Liver - European Organisation for Research and Treatment of Cancer) Clinical Practice Guidelines, which are the current state-of-the-art on the management of HCC. \n\nThis is an heterogeneous dataset, with 23 quantitative variables, and 26 qualitative variables. Overall, missing data represents 10.22% of the whole dataset and only eight patients have complete information in all fields (4.85%). The target variables is the survival at 1 year, and was encoded as a binary variable: 0 (dies) and 1 (lives). A certain degree of class-imbalance is also present (63 cases labeled as “dies” and 102 as “lives”). \n\nA detailed description of the HCC dataset (feature’s type/scale, range, mean/mode and missing data percentages) is provided in Santos et al. “A new cluster-based oversampling method for improving survival prediction of hepatocellular carcinoma patients”, Journal of biomedical informatics, 58, 49-59, 2015. 
""") #========================================================================= analysis_dict = { "Ferritin": "- ~12-112% higher mean levels", "Iron": "- significantly higher iron level distribution in high risk cluster\n- lower iron level distributions in medium risk clusters", "Dir_Bil": "- highest levels in samples in risk cluster 4\n- cluster 3 shows slightly higher levels\n- clusters 0-2 have low distributions", "Major_Dim": "indistinct", "Gender": "", "HBsAg": "", "HBeAg": "", "HBcAb": "- the highest risk group has a higher ratio of samples without the HBcAb than the others", "HCVAb": "- the highest risk group has a higher ratio of samples without the HBcAb than the others", "Endemic": "", "Hemochro": "", "AHT": "", "CRI": "", "HIV": "", "NASH": "", "Varices": "", "Spleno": "", "PHT": "", "PVT": "", "Metastasis": "", "Hallmark": "", "Age": "", "Grams_day": "", "Packs_year": "", "PS": "", "Encephalopathy": "risk clusters 0,1 have even distributions throughout; 2-4 heavy grouping around 1", "Ascites": "risk clusters 0,1 have even distributions throughout; 2-4 heavy grouping around 1", "INR": "risk cluster 4 has a distribution between the others; 2,3 being lower, 0,1 being higher", "AFP": "cluster 4, 2 have different distributions from other clusters", "Hemoglobin": "- elevated in high risk groups", "MCV": "inconclusive", "Leucocytes": "- same relative pattern as platelet var", "Platelets": "- clusters 0,3 are low/high risk clusters respectively yet have similar platelet levels\n- cluster 2 has highest platelet levels", "Albumin": "- elevated levels in high risk groups", "Total_Bil": "- risk clusters 2-4 all show bottom heavy total bilirubin levels\n- clusters 2 has similar levels yet is a lower risk group", "ALT": "inconclusive", "AST": "inconclusive", "GGT": "- lower risk groups have bottom heavy distributions", "ALP": "- higher risk groups have higher median levels & generally higher distributions", "TP": "inconclusive", "Creatinine": "indistinct", "Nodule": 
"- higher risk cluster, bottom heavy distribution", } #========================================================================= interest_vars = [ "Ferritin", "Dir_Bil", "GGT", "ALP", "HBcAb", "HCVAb", "Smoking", "AHT", "Metastasis", ] #========================================================================= if option == "Cluster Analysis": st.subheader("Cluster Analysis") # st.write(f"Data Class Mean: {df.Class.mean()}") # plot_type = st.sidebar.radio("Plot Type", ["Boxplot", "Violin"]) # options for overall chart options st.sidebar.subheader("Chart Options:") if st.sidebar.checkbox("View Dataframe", False): st.write(df) if st.sidebar.checkbox("Gradient", False): group_df if st.sidebar.checkbox("Counts", False): st.write(df.kmed.value_counts()) if st.sidebar.checkbox("Plot Clusters", False): df_copy = df.copy() for cat in cats: df_copy[cat] = df_copy[cat].astype(str) # fig, ax = plt.subplots() model = prince.FAMD() famd = model.fit(df_copy) coordinates = famd.transform(df_copy) famd.plot_row_coordinates(df_copy, color_labels=df_copy.kmed) st.pyplot() if st.sidebar.checkbox("Interest Vars", False): interest_vars # option for var cluster visualization plot_var = st.sidebar.selectbox("Variable", analysis_vars) # option for viewing information from analysis var dict about var analysis notes notes = st.sidebar.checkbox("Cluster Notes") if plot_var not in cat_vars: st.sidebar.header("Plot Type:") box = st.sidebar.checkbox("Boxplot") violin = st.sidebar.checkbox("Violin Plot") hist = st.sidebar.checkbox("Histogram") if hist: cluster_num = st.sidebar.selectbox("Cluster #", df.kmed.unique()) if st.sidebar.button("Plot", False): st.subheader(plot_var) st.write(var_dict[plot_var][0]) if notes: st.subheader("Analysis Notes") st.write(analysis_dict[plot_var]) if violin: plot_violin(plot_var) if box: plot_boxplot(plot_var) if hist: st.write(f"Cluster {cluster_num}, {plot_var}") plot_hist(plot_var, cluster_num) # if plot_var == "AFP": # 
st.image(\U'c:\Users\tayma\github\hcc_clustering\afp_table.png') else: if st.sidebar.button("Plot", False): st.write(var_dict[plot_var][0]) if notes: st.subheader("Analysis Notes") st.write(analysis_dict[plot_var]) plot_barplot(plot_var) # sidebar selectbox with info about normal ranges/values for health data # info_var = st.sidebar.selectbox("Var Info", cols) #========================================================================= if option == "Objective": st.subheader("Clustering Model Objective") st.write(''' The Hepatocellular Carcinoma dataset (HCC dataset) was collected at a University Hospital in Portugal. It contains real clinical data of 165 patients diagnosed with HCC. The purpose of my clustering model will be to create clusters that have a distribution of the mortality class average that will allow for cluster analysis to identify features of interest used in determining the effect on mortality rates in patients with HCC. ''') st.subheader("End User Value:") st.write(''' The value in the clustering analysis of the HCC dataset is providing additional insight into the ideal characteristics that compose the clusters having a lower overall mortality rate relative to the higher mortality groups. ''') st.subheader("Quantifiable Results:") st.write(''' Results are assessed based on the distribution of mortality rates across cluster groups. The dataset provides a target variable; however, clustering is done without introducing the target variable to retain efficacy for clustering of future data. The metric will be primarily the mortality class distribution as the silhouette scores, and other clustering metrics are ineffective with this particular dataset. ''') st.subheader("Visuals:") st.write(''' I use FAMD dimensionality reduction to visualize clusters. I chose FAMD because of the balance of categorical and quantitative variables in the dataset. 
''') st.subheader("Results:") st.write(''' The objective is to find what distinguishing markers make up the higher and lower mortality groups easily distinguished from the rest of the clusters. The ideal number of groups (3-6) because of the need for a distinguishable range of mortality averages for each cluster showing a distinctive pattern across groups. ''') #========================================================================= if option == "Cluster Predict": st.subheader("Cluster Predictor") df = df.drop(columns=["Class", "kmed"]) cols = df.columns.to_list() cols_list = [] input = [] for i in range(len(cols)): # cols[i] cols_list.append(cols[i]) if cols[i] in yes_no: col = st.radio(cols[i], ["yes", "no"]) if col == "yes": col = 1 else: col = 0 input.append(col) elif cols[i] in cat_cols: st.write(var_dict[cols[i]]) col = st.slider(cols[i], min_value=df[cols[i]].min(), max_value=df[cols[i]].max()) input.append(col) # for col in num_cols: elif cols[i] in num_cols: col = st.number_input(cols[i]) input.append(col) else: col = st.radio(cols[i], ["male", "female"]) if col == "male": col = 1 else: col = 0 input.append(col) # input # input = [ # 1, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 0, # 26, # 13, # 0, # 1, # 2, # 1, # 1.15, # 165, # 12.5, # 94, # 950, # 73000, # 3.64, # 1.86, # 49, # 59, # 142, # 154, # 10.45, # 0.97, # 2, # 5.2, # 0.58, # 99, # 47, # 340 # ] input_df = pd.DataFrame(input) input_df = input_df.T input_df.columns = cols_list # input_df # input X = input_df k = 5 if st.button("Predict Cluster", False): # output = kmed_predict(df, input_df) # output.head(-1) # def kmed_predict(df, X, k=5): # generate k random indices from distance matrix # df = df.drop(columns="Class") # X = X.drop(columns="kmed") # X = pd.DataFrame(X, columns=df.columns) # X # df df = df.append(X, ignore_index=True) # df X_scaled = scale_df(df) dmat = create_dmat(X_scaled) np.random.seed(42) n_rows = dmat.shape[0] 
init_medoids = np.random.randint(0, n_rows, k) # init_medoids kmed = kmedoids(dmat, initial_index_medoids=init_medoids, data_type="distance_matrix") kmed.process() medoid_idxs = kmed.get_medoids() # medoid_idxs labels = kmed.predict(dmat) df["kmed"] = labels # print(df.kmed.value_counts()) # group_df = df.groupby("kmed").mean().sort_values("Class").style.background_gradient() # casting kmed clusters to strings df.kmed = df.kmed.astype(str) # reordering cluster numbers by mortality rate df.loc[(df.kmed == "3"), "kmed"] = 4 df.loc[(df.kmed == "0"), "kmed"] = 2 df.loc[(df.kmed == "1"), "kmed"] = 3 df.loc[(df.kmed == "4"), "kmed"] = 1 df.loc[(df.kmed == "2"), "kmed"] = 0 # clust = [2, 4, 0, 1, 3] # label = kmed.predict(X) # X['kmed'] = clust.index(label) group_df = df.groupby("kmed").mean().style.background_gradient() # counts = df.kmed.value_counts().index.sort_values(ascending=False) # group_df["count"] = counts output = df.iloc[165] st.write(f"Risk Cluster: {output['kmed']}") st.dataframe(output)
def explore_global_plot(data, label='label', n_feats=50, id=None, task='classification'):
    '''
    Show a series of exploratory plots for a labelled DataFrame: a 2-D
    projection of the rows (classification only), a correlation heatmap,
    a printout of MAD-based outliers and missing-value pattern plots.

    The caller's DataFrame is not modified.

    :param data: DataFrame
    :param label: label column name in the data
    :param n_feats: the number of features be used to analysis
        (only the first ``n_feats`` feature columns are used for the
        missing-value plots).
    :param id: optional name of an identifier column. It is checked for
        duplicated values and then excluded from every plot.
    :param task: regression or classification. The 2-D projection is only
        drawn for 'classification'.
    :return: None
    '''
    columns = data.columns.tolist()
    columns.remove(label)
    if id is not None:
        # Duplicates must be looked up in the data; `columns` is a plain
        # list of names and has no `.duplicated()`.
        if data[id].duplicated().sum():
            print('{} is duplicated !!!'.format(id))
        columns.remove(id)
        # Rebind instead of dropping in place so the caller's frame survives.
        data = data.drop(id, axis=1)

    # Integer and float columns are numeric; everything else is treated as
    # categorical. Column order is preserved in both lists.
    numeric_names = [
        name for name, dtype in zip(columns, data[columns].dtypes)
        if ptypes.is_integer_dtype(dtype) or ptypes.is_float_dtype(dtype)
    ]
    numeric_set = set(numeric_names)
    category_names = [name for name in columns if name not in numeric_set]

    if task == 'classification':
        # data distribution for each class
        new_data = data.dropna(axis=0)
        if category_names:
            # Mixed data: project with FAMD.
            famd = prince.FAMD(
                n_components=2,
                n_iter=3,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=42,
            )
            famd = famd.fit(new_data[columns])
            # NOTE(review): the model is fitted on new_data[columns] but the
            # plot receives new_data (label column included) - confirm this
            # is what prince expects.
            famd.plot_row_coordinates(
                new_data,
                ax=None,
                x_component=0,
                y_component=1,
                labels=new_data.index,
                color_labels=['{}'.format(t) for t in new_data[label]],
                ellipse_outline=False,
                ellipse_fill=True,
                show_points=True,
            )
            plt.show()
        else:
            # Purely numeric data: project with PCA.
            # `seed` is a module-level name defined outside this function.
            pca = PCA(n_components=2, random_state=seed)
            X_pca = pca.fit_transform(new_data[columns])
            sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=label, data=new_data)
            plt.show()

    # sort features for correlation plot
    sorted_feat_name = numeric_names
    if len(numeric_names) > 6:
        n_clusters = 3
        new_data = data[[label] + numeric_names].dropna(axis=0)
        new_data_stand = StandardScaler().fit_transform(new_data[numeric_names])
        kmean_init = KMeans(n_clusters=n_clusters, random_state=seed)
        # Cluster the FEATURES: one row per feature, one column per sample.
        # This needs a transpose; a reshape would scramble the values.
        kmean_init.fit(new_data_stand.T)
        sorted_feat = sorted(zip(numeric_names, kmean_init.labels_), key=lambda x: x[1])
        sorted_feat_name = [name for name, _ in sorted_feat]

    # correlation plot for all features
    sns.heatmap(data[[label] + sorted_feat_name + category_names].corr())
    plt.show()

    # outlier detection just for numeric features
    # (mad_based_outlier is defined elsewhere; it is used here as a
    # per-column boolean mask)
    outlier = data[numeric_names].apply(mad_based_outlier)
    for i, column in enumerate(outlier.columns):
        print('outlier:\n {}'.format(data[[column]][outlier.iloc[:, i]]))

    # missing value pattern plot for all features
    shown = data[columns[:n_feats]]
    msno.matrix(shown)
    plt.show()
    msno.bar(shown)
    plt.show()

    # Number of missing attributes per row, sorted ascending, plotted
    # against the rank of the row.
    miss_data = shown.isnull().sum(axis=1).to_frame()
    miss_data.columns = ['number_of_missing_attributes']
    miss_data.sort_values('number_of_missing_attributes', inplace=True)
    miss_data['index'] = list(range(miss_data.shape[0]))
    sns.jointplot(x="index", y="number_of_missing_attributes", data=miss_data)
    plt.show()
def test_only_categorical(self):
    """FAMD must raise ValueError when given only categorical columns."""
    model = prince.FAMD()
    categorical_only = self.X.select_dtypes(exclude=np.number)

    def fit_then_transform():
        model.fit(categorical_only)
        model.transform(categorical_only)

    self.assertRaises(ValueError, fit_then_transform)
# Optional dimensionality reduction of the combined data `comb` before it is
# split. Switched off by default, in which case `comb` is only one-hot
# encoded further down.
dim_red = False
if dim_red:
    import prince
    # Here you can choose between PCA and FAMD
    # NOTE: `pca` starts out as the boolean switch and is then rebound to the
    # fitted prince.PCA model inside the PCA branch.
    pca = True
    if pca:
        # One-hot encoding
        dummies = pd.get_dummies(comb)
        pca = prince.PCA(n_components=50)
        pca = pca.fit(dummies)
        expl = (pca.explained_inertia_)
        # Last element of the cumulative sum = total inertia explained by
        # the retained components.
        cum = (np.cumsum(expl))[-1]
        print("Explained variance " + str(cum))
        dummies = pca.transform(dummies)
    else:
        # FAMD is fitted on `comb` directly, without one-hot encoding first.
        famd = prince.FAMD(n_components=50)
        famd = famd.fit(comb)
        expl = (famd.explained_inertia_)
        cum = (np.cumsum(expl))[-1]
        print("Explained variance " + str(cum))
        comb = famd.transform(comb)
        # One-hot encoding
        dummies = pd.get_dummies(comb)
#%%
""" Split combined data """
# Without dimensionality reduction `dummies` has not been built yet.
if not dim_red:
    dummies = pd.get_dummies(comb)
# The first len(train0) rows of `dummies` are taken as the training features;
# presumably `comb` stacks the training rows first - verify where it is built.
X_train = dummies.iloc[:len(train0)]
#print(X_train.shape)
y_train = y
def test_fit_pandas_dataframe(self):
    """fit() on a pandas DataFrame must return a prince.FAMD instance."""
    model = prince.FAMD()
    fitted = model.fit(self.X)
    is_famd = isinstance(fitted, prince.FAMD)
    self.assertTrue(is_famd)
def test_only_numerical_numpy(self):
    """FAMD must raise ValueError for a purely numerical NumPy array."""
    model = prince.FAMD()
    numeric_only = self.X.select_dtypes(np.number)

    def fit_then_transform():
        # The array conversion stays inside the guarded callable so it is
        # covered by the assertion exactly as before.
        model.fit(numeric_only.to_numpy())
        model.transform(numeric_only.to_numpy())

    self.assertRaises(ValueError, fit_then_transform)
mca.explained_inertia_ xlabels = ['{}'.format(t) for t in range(1, 16)] plt.bar(xlabels, mca.explained_inertia_[:15]) plt.xlabel("Axe") plt.title("Explained Inertia Ratio", fontsize=20) """ FAMD """ X = results2.iloc[:, 2:12] X[['Type 1', 'Type 2', 'Generation', 'Legendary']] = X[['Type 1', 'Type 2', 'Generation', 'Legendary']].astype('category') famd = prince.FAMD(n_components=10, n_iter=3, copy=True, check_input=True, engine='auto', random_state=42) famd = famd.fit(X) famd.row_coordinates(X) ax = famd.plot_row_coordinates( X, ax=None, figsize=(6, 6), x_component=0, y_component=1, color_labels=['{}'.format(t) for t in results2["win"]], ellipse_outline=False, ellipse_fill=True,
def dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, use_famd=False, seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space and
    determine the init coefficients in that space

    y (numobs x p DataFrame): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    k (1d array): The number of components of the latent Gaussian mixture layers
    r (int): The dimension of latent variables
    nj (p 1darray): For binary/count data: The maximum values that the variable can take.
                    For ordinal data: the number of different existing categories for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y
    use_famd (Bool): Whether to the famd method (True) or not (False), to initiate the
                     first continuous latent variable. Otherwise MCA is used.
                     When every variable is ordinal, PCA is used regardless of this flag.
    seed (None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------------------
    returns (dict): All initialisation parameters, with keys 'eta', 'H', 'psi', 'w_s',
                    'z', 'classes', 'lambda_bin', 'lambda_ord', 'lambda_cont', 'lambda_categ'
    raises TypeError: if y is not a pandas DataFrame
    raises RuntimeError: if the number of paths differs from prod(k)
    '''

    L = len(k)
    numobs = len(y)
    S = np.prod(k)

    # Convert once so that every boolean mask below is built element-wise,
    # whatever sequence type the caller passed.
    var_distrib = np.asarray(var_distrib)
    nj = np.asarray(nj)

    #==============================================================
    # Dimension reduction performed with PCA, FAMD or MCA
    #==============================================================

    if not isinstance(y, pd.DataFrame):
        raise TypeError('y should be a dataframe for prince')

    if (var_distrib == 'ordinal').all():
        print('PCA init')

        pca = prince.PCA(n_components=r[0], n_iter=3, rescale_with_mean=True,
                         rescale_with_std=True, copy=True, check_input=True,
                         engine='auto', random_state=seed)
        z1 = pca.fit_transform(y).values

    elif use_famd:
        famd = prince.FAMD(n_components=r[0], n_iter=3, copy=True,
                           check_input=False, engine='auto', random_state=seed)
        z1 = famd.fit_transform(y).values

    else:
        # Check input = False to remove
        mca = prince.MCA(n_components=r[0], n_iter=3, copy=True,
                         check_input=False, engine='auto', random_state=seed)
        z1 = mca.fit_transform(y).values

    # z[l] holds the latent representation feeding layer l
    z = [z1]
    y = y.values

    #==============================================================
    # Set the shape parameters of each data type
    #==============================================================

    is_bin = np.logical_or(var_distrib == 'bernoulli', var_distrib == 'binomial')
    is_ord = var_distrib == 'ordinal'
    is_categ = var_distrib == 'categorical'
    is_cont = var_distrib == 'continuous'

    y_bin = y[:, is_bin].astype(int)
    nb_bin = len(nj[is_bin])

    y_ord = y[:, is_ord].astype(float).astype(int)
    nb_ord = len(nj[is_ord])

    y_categ = y[:, is_categ]
    nb_categ = len(nj[is_categ])

    # Set y_count standard error to 1
    y_cont = y[:, is_cont]
    # Before was np.float
    y_cont = y_cont / np.std(y_cont.astype(float), axis=0, keepdims=True)
    nb_cont = y_cont.shape[1]

    #=======================================================
    # Determining the Gaussian Parameters
    #=======================================================
    init = {}

    eta = []
    H = []
    psi = []
    paths_pred = np.zeros((numobs, L))

    # Each layer is initialised from the previous layer's latent variable and
    # produces the latent variable of the next one.
    # (get_MFA_params and n_axis are defined elsewhere in the project.)
    for layer in range(L):
        params = get_MFA_params(z[layer], k[layer], r[layer:])
        eta.append(params['eta'][..., n_axis])
        H.append(params['H'])
        psi.append(params['psi'])
        z.append(params['z_nextl'])
        paths_pred[:, layer] = params['classes']

    paths, nb_paths = np.unique(paths_pred, return_counts=True, axis=0)
    paths, nb_paths = add_missing_paths(k, paths, nb_paths)

    w_s = nb_paths / numobs
    # Replace exact zeros by a tiny positive weight
    w_s = np.where(w_s == 0, 1E-16, w_s)

    # Check all paths have been explored
    if len(paths) != S:
        raise RuntimeError(
            'Real path len is {} while the initial number of path was only {}'
            .format(S, len(paths)))

    w_s = w_s.reshape(*k).flatten('C')

    #=============================================================
    # Enforcing identifiability constraints over the first layer
    #=============================================================

    H = diagonal_cond(H, psi)
    Ez, AT = compute_z_moments(w_s, eta, H, psi)
    eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

    init['eta'] = eta
    init['H'] = H
    init['psi'] = psi

    init['w_s'] = w_s  # Probabilities of each path through the network
    init['z'] = z

    # The clustering layer is the one used to perform the clustering
    # i.e. the layer l such that k[l] == n_clusters
    # (argmax returns the first such layer, or 0 when none matches)
    clustering_layer = np.argmax(np.array(k) == n_clusters)

    init['classes'] = paths_pred[:, clustering_layer]  # 0 To change with clustering_layer_idx

    #=======================================================
    # Determining the coefficients of the GLLVM layer
    #=======================================================

    # Determining lambda_bin coefficients.

    lambda_bin = np.zeros((nb_bin, r[0] + 1))

    for j in range(nb_bin):
        Nj = np.max(y_bin[:, j])  # The support of the jth binomial is [1, Nj]

        if Nj == 1:  # If the variable is Bernoulli not binomial
            yj = y_bin[:, j]
            z_new = z[0]
        else:  # If not, need to convert Binomial output to Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], z[0])

        lr = LogisticRegression()

        # The first r[0] - 1 variables are regressed on the first j + 1 latent
        # dimensions only; the remaining coefficients stay at zero.
        if j < r[0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate([lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])

    ## Identifiability of bin coefficients
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT[0][0]

    # Determining lambda_ord coefficients
    lambda_ord = []

    for j in range(nb_ord):
        yj = y_ord[:, j]

        ol = OrderedLogit()
        ol.fit(z[0], yj)

        ## Identifiability of ordinal coefficients
        beta_j = (ol.beta_.reshape(1, r[0]) @ AT[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining the coefficients of the continuous variables
    lambda_cont = np.zeros((nb_cont, r[0] + 1))

    for j in range(nb_cont):
        yj = y_cont[:, j]
        linr = LinearRegression()

        # Same restriction on the number of latent dimensions as for lambda_bin
        if j < r[0] - 1:
            linr.fit(z[0][:, :j + 1], yj)
            lambda_cont[j, :j + 2] = np.concatenate([[linr.intercept_], linr.coef_])
        else:
            linr.fit(z[0], yj)
            lambda_cont[j] = np.concatenate([[linr.intercept_], linr.coef_])

    ## Identifiability of continuous coefficients
    lambda_cont[:, 1:] = lambda_cont[:, 1:] @ AT[0][0]

    # Determining lambda_categ coefficients
    lambda_categ = []

    for j in range(nb_categ):
        yj = y_categ[:, j]

        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(z[0], yj)

        ## Identifiability of categ coefficients
        beta_j = lr.coef_ @ AT[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_cont'] = lambda_cont
    init['lambda_categ'] = lambda_categ

    return init
-------------------------------------------------------------------------------- Dimenasionality reduction and visualize segmentation for bad/good clients Since data is still high in dimensions. We first reduce dimensionality using FAMD because data has continuous as well as categorical data. -------------------------------------------------------------------------------- ''' # FAMD demands categorical variables to be identified as such df_famd = X df_famd['bad_clients'] = y.iloc[:,0] df_famd['descrStatus'] = y.iloc[:,1] for col in df_famd.select_dtypes(include=['uint8']).columns: df_famd[col] = df_famd[col].astype('category') famd = prince.FAMD(n_components=5, n_iter=100, copy=True, check_input=False, engine='sklearn', random_state=1) famd = famd.fit(df_famd.drop(columns=['bad_clients', 'descrStatus'], axis=1)) famd.explained_inertia_ # first graph decpits all descStatus categories for clients while the second graph compares bad clients #(i.e. those who either never repaid the loan or paid it after more than 14 days) with the rest of the bad_clients # Client type 1: bad clients # #from the FAMD analysis it's can be seen cluters cannot be reduced visually in a convenient way. This means that when # choosing a model one must consider that clients won't segment as neatly as hoped fig, ax = plt.subplots(1,2, figsize=(10,8)) famd.plot_row_coordinates(df_famd, x_component=0, y_component=1, color_labels=['Client type {}'.format(t) for t in df_famd['descrStatus']], ellipse_outline=False, ellipse_fill=True, show_points=True, ax=ax[0])
def test_only_numerical(self):
    """FAMD must raise ValueError when given only numerical columns."""
    model = prince.FAMD()
    numeric_only = self.X.select_dtypes(np.number)
    self.assertRaises(ValueError, model.fit, numeric_only)
def test_only_categorical(self):
    """FAMD must raise ValueError when given only categorical columns."""
    model = prince.FAMD()
    categorical_only = self.X.select_dtypes(exclude=np.number)
    with self.assertRaises(ValueError):
        model.fit(categorical_only)