import numpy as np
import gower


def kernel(table, gamma=1, dbtype='mixed'):
    if dbtype == 'mixed':
        dist = gower.gower_matrix(table)
        # kernel = np.power(table.shape[0], -gamma*dist)
        kernel = np.exp(-gamma * dist)
        # kernel = rbf_kernel(table, gamma)
        # kernel = np.power(dist, np.shape(table)[0])
        return kernel
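# Hedged usage sketch, not from the original source: the toy frame below is
# hypothetical and only shows how kernel() turns Gower distances on mixed data
# into an RBF-style similarity matrix.
import pandas as pd

demo = pd.DataFrame({
    "age": [25, 40, 31],
    "gender": ["M", "F", "F"],
    "salary": [1800.0, 2500.0, 2100.0],
})
K = kernel(demo, gamma=1, dbtype="mixed")
print(K.shape)  # (3, 3); the diagonal is exp(0) == 1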
def _define_noise_examples(self):
    distances = gower.gower_matrix(self._attrs)
    min_distances = [
        self._cal_dNN(distances, indx)
        for indx in range(self._labels.shape[0])
    ]
    min_distances = pd.DataFrame(min_distances, columns=["distance"])
    min_distances.sort_values("distance", ascending=False, inplace=True)
    return list(min_distances[:self._num_noise].index)
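# Hedged sketch (assumption, not the original helper): _cal_dNN() is referenced
# above but not shown. A plausible version returns the distance from row `indx`
# to its nearest other row in the precomputed Gower matrix (assumes numpy as np).
def _cal_dNN(self, distances, indx):
    # Drop the zero self-distance before taking the minimum
    row = np.delete(distances[indx], indx)
    return row.min()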
def kernel_function(proto_critic, input):
    gamma = 1
    kernel = np.array([])
    for idx in range(input.shape[0]):
        dist = gower.gower_matrix(
            data_x=proto_critic,
            data_y=np.reshape(input[idx, :], (1, -1))).reshape((-1))
        if idx == 0:
            kernel = np.exp(-gamma * dist)
        else:
            kernel = np.vstack((kernel, np.exp(-gamma * dist)))
    return kernel
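# Hedged alternative sketch (assumes numpy and gower are imported, and that a
# single call is acceptable here): gower.gower_matrix already computes the whole
# (n_inputs, n_protos) block at once, so the row-by-row loop can usually be
# collapsed. Results may differ slightly because gower derives feature ranges
# from whatever data it is handed.
def kernel_function_vectorized(proto_critic, inputs, gamma=1.0):
    dist = gower.gower_matrix(data_x=inputs, data_y=proto_critic)
    return np.exp(-gamma * dist)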
def calc_gowers(df, continuous_columns):
    """Function to simplify calculating Gower's distance

    Args:
        df (pandas.DataFrame): dataframe of observations for which to calculate Gower's distance
        continuous_columns (list): list of integers identifying the indexes of columns that are continuous

    Returns:
        gow_dists (numpy.array): a numpy array of Gower's distances between observations
    """
    cat_list = make_categorical_list(continuous_columns, len(df.columns) - 5)
    data_np = df.iloc[:, 5:].to_numpy()
    gow_dists = gower.gower_matrix(data_np, cat_features=cat_list)
    return gow_dists
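# Hedged sketch (assumption): make_categorical_list() is referenced above but not
# shown. A minimal version flags every column as categorical except the indexes
# listed in continuous_columns.
def make_categorical_list(continuous_columns, n_columns):
    # True marks a categorical feature for gower_matrix(cat_features=...)
    return [idx not in continuous_columns for idx in range(n_columns)]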
def calculate_distance(df):
    # list of non-boolean columns that require preprocessing
    non_boolean_cols = [
        'idade_empresa_anos', 'idade_maxima_socios', 'idade_media_socios',
        'idade_minima_socios', 'qt_filiais', 'qt_socios',
        'qt_socios_st_regular'
    ]
    # normalizing the non-boolean columns
    df = min_max_col(df, non_boolean_cols)
    # calculating the Gower distance matrix
    dissimilarity_matrix = gower.gower_matrix(df)
    return dissimilarity_matrix
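# Hedged sketch (assumption): min_max_col() is used above but not shown. A
# minimal version rescales the listed columns to [0, 1] so they sit on the same
# footing as the boolean columns before the Gower computation.
def min_max_col(df, cols):
    df = df.copy()
    for col in cols:
        col_range = df[col].max() - df[col].min()
        df[col] = (df[col] - df[col].min()) / col_range if col_range else 0.0
    return df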
import numpy as np
import gower


def N1(X, y, cat_features=None):
    """
    Calculate Fraction of Borderline Points (N1)
    - X: ndarray features
    - y: ndarray target
    - cat_features: a boolean array that specifies categorical features
    """
    if cat_features is None or len(cat_features) == 0:
        cat_features = np.zeros(X.shape[-1], dtype=bool)

    # Calculate Gower distance matrix
    distance_matrix = gower.gower_matrix(X, cat_features=cat_features)

    # Generate a Minimum Spanning Tree
    tree = MST(distance_matrix)

    # Edges whose endpoints carry different labels mark borderline points
    sub = tree[y[tree[:, 0]] != y[tree[:, 1]]]
    vertices = np.unique(sub.flatten())
    return len(vertices) / X.shape[0]
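# Hedged sketch (assumption, not part of the original snippet): N1 relies on an
# MST() helper that is not shown above. One plausible implementation returns the
# edge list of a minimum spanning tree built on the precomputed Gower matrix.
# Caveat: scipy's dense-graph routines treat exact zeros as missing edges, so
# duplicate rows (distance 0) are not connected by this sketch.
from scipy.sparse.csgraph import minimum_spanning_tree


def MST(distance_matrix):
    # Return the MST edges as an (n_edges, 2) array of vertex indices,
    # the shape that N1 indexes into above.
    mst = minimum_spanning_tree(distance_matrix).tocoo()
    return np.column_stack([mst.row, mst.col])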
import gower
from sklearn.metrics import silhouette_score
from sklearn_extra.cluster import KMedoids


def cluster(df, random_state=None):
    """Use Gower distances and K-Medoids to cluster the data into between 2 and 8
    clusters. Uses silhouette analysis to determine the optimal number of clusters.

    Args:
        df (pandas.DataFrame): The data in a pandas dataframe
        random_state (int): Can be used to fix the random state - ideal for testing

    Returns:
        (array): an array of the cluster assignments
        (int): the number of clusters used
    """
    # Compute the Gower distance matrix
    # NOTE: large datasets will cause slow processing since the array size is n^2
    matrix = gower.gower_matrix(df)

    # Use silhouette analysis to determine the optimal number of clusters
    # between 2 and 8 clusters
    res = []
    for k in range(2, 9):
        # must have enough samples, i.e. k < n - 1
        if k < len(matrix) - 1:
            k_medoids = KMedoids(n_clusters=k, random_state=random_state).fit(matrix)
            # Catch exceptions here and set the score to -1 (worst)
            try:
                silhouette_avg = silhouette_score(df, k_medoids.labels_)
                res.append([k, silhouette_avg])
            # Only one cluster causes an error, so give the worst score to this k
            except ValueError:
                res.append([k, -1])

    # The best cluster count has the value closest to 1 in the range -1 to 1
    best_cluster = max(res, key=lambda x: x[1])

    # Refit with the best number of clusters
    k_medoids = KMedoids(n_clusters=best_cluster[0], random_state=random_state).fit(matrix)
    return k_medoids.labels_, best_cluster[0]
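# Hedged usage sketch (the toy frame is hypothetical; assumes pandas as pd):
# cluster a tiny numeric frame and inspect the assignments. Scoring the
# precomputed Gower matrix with silhouette_score(matrix, labels,
# metric='precomputed') would be a natural alternative to scoring the raw frame
# as cluster() does.
toy = pd.DataFrame({
    "x": [1.0, 1.1, 5.0, 5.2, 9.0, 9.1],
    "flag": [0, 0, 1, 1, 1, 0],
})
labels, n_clusters = cluster(toy, random_state=0)
print(n_clusters, labels)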
import numpy as np
import pandas as pd
import pytest
import gower


def test_answer():
    Xd = pd.DataFrame({
        'age': [21, 21, 19, 30, 21, 21, 19, 30, None],
        'gender': ['M', 'M', 'N', 'M', 'F', 'F', 'F', 'F', None],
        'civil_status': [
            'MARRIED', 'SINGLE', 'SINGLE', 'SINGLE', 'MARRIED', 'SINGLE',
            'WIDOW', 'DIVORCED', None
        ],
        'salary': [
            3000.0, 1200.0, 32000.0, 1800.0, 2900.0, 1100.0, 10000.0, 1500.0,
            None
        ],
        'has_children': [1, 0, 1, 1, 1, 0, 0, 1, None],
        'available_credit': [2200, 100, 22000, 1100, 2000, 100, 6000, 2200, None]
    })
    Yd = Xd.iloc[1:3, :]
    X = np.asarray(Xd)
    Y = np.asarray(Yd)

    aaa = gower.gower_matrix(X)
    assert aaa[0][1] == pytest.approx(0.3590238, 0.001)
cat_features = []
for col in df.columns:
    if col not in num_features:
        cat_features.append(col)

# scale standardization of numerical values
df_num = pd.DataFrame(StandardScaler().fit_transform(df[num_features]),
                      columns=num_features)
df_cat = df.drop(columns=num_features)
df_std = df_cat.merge(df_num, left_index=True, right_index=True, how='left')
df_w_name = df_std
df_w_name.head()
df_std = df.set_index('DrinkName')

# generate similarity matrix
distance_matrix = gower.gower_matrix(df_std)

# create complete linkage
Zd = linkage(distance_matrix, method='complete')

# hierarchical clustering visualization
fig, axs = plt.subplots(1, 1, figsize=(25, 5))
dn = dendrogram(Zd, truncate_mode='level', p=6, show_leaf_counts=True, ax=axs)

# find optimal k clusters
results = {}
for k in range(2, 12):
    cluster_array = fcluster(Zd, k, criterion='maxclust')
    score = silhouette_score(distance_matrix, cluster_array, metric='precomputed')
    results[k] = score
plt.plot([i for i in results.keys()], [j for j in results.values()], label='gower')
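# Hedged follow-up sketch (not in the original): pick the flat clustering with
# the highest silhouette score from the dictionary built above.
best_k = max(results, key=results.get)
best_labels = fcluster(Zd, best_k, criterion='maxclust')
print('best k by silhouette:', best_k)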
from scipy.cluster.hierarchy import fcluster

link = shc.linkage(rez[['Category', 'Region', 'Company Age', 'Number of Employees',
                        '6 Month Growth', 'Number of Investors', 'Supported Languages',
                        'Price Availability', 'Price Range', 'Discount for Smallest Package',
                        'Discount for Biggest Package', 'Monthly Subscription',
                        'Yearly Subscription', 'Localization', 'Customization', 'Freemium',
                        'Free Trial', 'Number of Versions', 'Segmentation', 'Per Feature',
                        'Per User', 'One Time', 'Pay As You Go', 'Volume-based Price',
                        'Fixed Price']],
                   method='ward')
rez['cluster'] = fcluster(link, 10.2, criterion='distance')
rez.groupby('cluster').mean()

# In[590]:

import gower
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

dm = gower.gower_matrix(rez[['Price Availability', 'Price Range',
                             'Discount for Smallest Package', 'Discount for Biggest Package',
                             'Monthly Subscription', 'Yearly Subscription', 'Localization',
                             'Customization', 'Freemium', 'Free Trial', 'Number of Versions',
                             'Segmentation', 'Per Feature', 'Per User', 'One Time',
                             'Pay As You Go', 'Volume-based Price', 'Fixed Price']])
Zd = linkage(dm)

# In[591]:

plt.figure(figsize=(40, 50), dpi=80)
plt.title("Dendrogram", fontsize=30)
dendrogram(Zd)
plt.xticks(fontsize=10)
plt.show()

# In[483]:
# combinedDataset_allUsers.csv is the file created for experiment one from 'output.py'
df = pandas.read_csv(f"{current_directory}\\combinedDataset_allUsers.csv")
df = df.drop(
    ["level", "temperature", "voltage", "status", "health", "output"], axis=1)

# I want Phone Usage
df["Connectivity"] = df["Cellular"] | df["WiFi"]
# Dropped because of correlation between columns
df = df.drop(["WiFi", "Cellular", "isInteractive"], axis=1)

# Uncomment for dataframe description
# print(df.describe())
# plotCorr(df)

# After the first run, you don't have to compute the distance matrix again.
# You can read it from the pickle file.
distance = gower.gower_matrix(df)
with open("distance.pickle", "wb") as f:
    pickle.dump(distance, f)
# distance = pickle.load(open("distance.pickle", "rb"))
print("Done with distance!")
del df

# Compute the condensed distance matrix for the linkage and cophenetic
condensedDst = squareform(distance)
# del distance
'''
From the linkage function definition:
Methods 'centroid', 'median' and 'ward' are correctly defined only if
Euclidean pairwise metric is used. If `y` is passed as precomputed pairwise
distances, then it is a user responsibility to assure that these distances
are in fact Euclidean, otherwise the produced result will be incorrect.
'''
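# Hedged follow-up sketch (assumption about the intended next step): given the
# caveat quoted above, a linkage method that is valid for arbitrary precomputed
# dissimilarities, such as 'average' or 'complete', is a safer choice than
# 'ward' for the condensed Gower matrix.
from scipy.cluster.hierarchy import linkage, fcluster

Z = linkage(condensedDst, method="average")
labels = fcluster(Z, t=3, criterion="maxclust")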
le_dict[col] = deepcopy(le)

nj, nj_bin, nj_ord, nj_categ = compute_nj(train, var_distrib)
nb_cont = np.sum(var_distrib == 'continuous')

# Feature category (cf)
dtype = {
    train.columns[j]: dtypes_dict[var_distrib[j]]
    for j in range(p)
}
train = train.astype(dtype, copy=True)
numobs = len(train)

# Defining distances over the features
dm = gower_matrix(train, cat_features=cat_features)

#*****************************************************************
# Sampling rules
#*****************************************************************

authorized_ranges = np.expand_dims(
    np.stack([[-np.inf, np.inf] for var in var_distrib]).T, 1)

if sub_design == 'bivarié':
    # Want to sample only women of more than 60 years old
    authorized_ranges[:, 0, 0] = [60, 100]  # Of more than 60 years old

    # Keep only women
    sex_idx = np.argmax(varnames == 'sex')
    women_idx = np.argmax(le_dict['sex'].classes_ == 'Female')
def categorical_gower():
    print('using categorical data - Gower', end=' ')
    categorical_df = pd.DataFrame(
        gower_matrix(videos_df.loc[:, category_columns],
                     cat_features=[True for v in category_columns])
    ).set_index(videos_df.index)
    print('> added {} columns'.format(len(categorical_df.columns)))
    return categorical_df
# Feature category (cf)
cf_non_enc = np.logical_or(vd_categ_non_enc == 'categorical',
                           vd_categ_non_enc == 'bernoulli')

# Non encoded version of the dataset:
# y_nenc_typed = y_categ_non_enc.astype(np.object)
# y_np_nenc = y_nenc_typed.values

dtype = {y.columns[j]: np.float64 if (var_distrib[j] != 'bernoulli') and \
         (var_distrib[j] != 'categorical') else str for j in range(p_new)}

y = y.astype(dtype, copy=True)

# !!! Defining distances over the non encoded features
dm = gower_matrix(y, cat_features=cf_non_enc)

#===========================================#
# Running the algorithm
#===========================================#

nb_pobs = 100  # Target for pseudo observations
r = np.array([2, 1])
numobs = len(y)
k = [n_clusters]

seed = 1
init_seed = 2

eps = 1E-05
it = 50
import numpy as np
import pandas as pd
import gower
from sklearn.neighbors import DistanceMetric
from scipy.sparse import issparse

df = pd.read_csv(
    "C:/Users/ahhua/Documents/Github/capstone/Data/processed_data.csv")
df = df.drop([df.columns[0]], axis=1)
df = df.drop(['Title', 'Size', 'Link', 'Filename', 'Src'], axis=1)

# cat_features: 1 marks a categorical column, 0 a numeric one
cate = [0]
cate.extend([1] * 43)
cate.extend([0, 1, 0, 0, 0, 0])
weights = [2] + (43 * [1]) + (6 * [5])

print('q')
g = gower.gower_matrix(df, weight=np.array(weights), cat_features=cate)
print('r')

pred = gower.gower_topn(df.iloc[0:2, :],
                        df.iloc[:, ],
                        n=5,
                        weight=np.array(weights),
                        cat_features=cate)
print(pred)
'''
area = DistanceMetric.get_metric('manhattan').pairwise(df[['Area']])
area = area/max(np.ptp(df['Area']),1)
num_colors = DistanceMetric.get_metric('manhattan').pairwise(df[['num_colors']])
num_colors = num_colors/max(np.ptp(df['num_colors']),1)
complexity = DistanceMetric.get_metric('manhattan').pairwise(df[['complexity']])
nan_mask = y.isnull()
cat_features = var_distrib == 'categorical'
y, le_dict = data_processing(y, var_distrib)
y = y.where(~nan_mask, np.nan)

nj, nj_bin, nj_ord, nj_categ = compute_nj(full_contra, var_distrib)
nb_cont = np.sum(var_distrib == 'continuous')

# Feature category (cf)
dtype = {y.columns[j]: dtypes_dict[var_distrib[j]] for j in range(p)}
full_contra = full_contra.astype(dtype, copy=True)
complete_y = y[~y.isna().any(1)].astype(dtype, copy=True)

# Defining distance matrix
dm = gower_matrix(complete_y, cat_features=cat_features)

#===========================================#
# Hyperparameters
#===========================================#

n_clusters = 4
nb_pobs = 100  # Target for pseudo observations
r = np.array([2, 1])
numobs = len(y)
k = [n_clusters]

seed = 1
init_seed = 2

eps = 1E-05
def peptide_identification(args): print(datetime.now(), ': Peptid identification starts...') print('Settings: ') print(args) # PLATO setting subclusterCount = args.subclusterCount spy = args.spy spy_portion = args.spy_portion RN = args.RN rnd_all = args.rnd_all # If random method, include all decoys rnd_portion = args.rnd_portion # If random method, include rnd.portion of positive set, default 1: pos set = neg set replicates_cnt = args.replicates_cnt include_label = args.include_label AML_preprocess = args.AML_preprocess output_folder = args.output_folder # AutoML parameter setting autoML_best_model_selection = args.autoML_best_model_selection autoML_iterations = args.autoML_iterations metric = args.metric # Other metrics: azureml.train.automl.utilities.get_primary_metrics('classification') cv_fold = args.cv_fold # Input, output file_name = args.sample_name input_path = args.input_folder output_path = output_folder + '/' + file_name log_file = output_path + '_autoML_errors_log.html' # Instantiate AutoML config and create an experiment in autoML workspace ws = Workspace.from_config() experiment_name = file_name experiment = Experiment(ws, experiment_name) print(datetime.now(), ': Assigned experiment ' + experiment_name + ' on Azure portal ') output = {} output['SDK version'] = azureml.core.VERSION output['Workspace Name'] = ws.name output['Resource Group'] = ws.resource_group output['Location'] = ws.location outputDf = pd.DataFrame(data=output, index=['']) print(outputDf) print(datetime.now(), ': Reading inputs') # Read POSITIVES and ALL inputs positives_path = glob.glob(input_path + file_name + '*POSITIVES*') raw_positives = pd.read_csv(positives_path[0], sep='\t') if AML_preprocess == True: all_path = glob.glob(input_path + file_name + '-ALL.txt') raw_all = pd.read_csv(all_path[0], sep='\t') # Extract new features # First and last three amino acides of peptide sequences as features - If NA then B category raw_all['Peptide'] = raw_all.Peptide.str.replace( r'([\(\[]).*?([\)\]])', r'B', regex=True) raw_all['P1'] = raw_all['Peptide'].str[0] raw_all['P2'] = raw_all['Peptide'].str[2] raw_all['P3'] = raw_all['Peptide'].str[3] raw_all['P4'] = raw_all['Peptide'].str[-4] raw_all['P5'] = raw_all['Peptide'].str[-3] raw_all['P6'] = raw_all['Peptide'].str[-1] else: all_path = glob.glob(input_path + file_name + '_percolator_feature.txt') raw_all = pd.read_csv(all_path[0], sep='\t') raw_all['Class'] = 0 # Make positive and test set test_data = raw_all.drop(['ScanNr', 'Proteins'], axis=1) positive_set = pd.merge(left=pd.DataFrame(raw_positives['SpecId']), right=pd.DataFrame(test_data), how='left', left_on='SpecId', right_on='SpecId') positive_set['Class'] = 1 # Remove decoys in positive set, if there is any decoys_in_positive_idx = positive_set.index[positive_set['Label'] == -1].tolist() positive_set = positive_set[positive_set['Label'] != -1] # Dataframe to store predictions all_predictions = pd.DataFrame({ 'SpecId': list(test_data['SpecId']), 'Peptide': list(test_data['Peptide']), 'Label': list(test_data['Label']) }) prediction_summary = all_predictions # Prepare test set for modeling y_test = test_data['Class'] if include_label == True: X_test = test_data.drop(['SpecId', 'Peptide', 'Class'], axis=1) else: X_test = test_data.drop(['SpecId', 'Peptide', 'Label', 'Class'], axis=1) # Prepare positive set for modeling positive_set_idx = [ test_data['SpecId'].tolist().index(x) for x in positive_set['SpecId'].tolist() if x in test_data['SpecId'].tolist() ] # Used to create the negative set decoys_idx = 
np.setdiff1d( test_data.index[test_data['Label'] == -1].tolist(), decoys_in_positive_idx).tolist() global gower_dist_avg if RN == True: if os.path.exists(input_path + file_name + 'gower_dist_avg.npy') == False: print(datetime.now(), ': Calculating Gower distance') gower_dist = gower.gower_matrix(test_data) selected_rows = gower_dist[positive_set_idx] gower_dist_avg = np.mean(selected_rows, axis=0) print(datetime.now(), ': Saving Gower distance matrix') np.save(input_path + '/' + file_name + 'gower_dist_avg.npy', gower_dist_avg) # save else: print(datetime.now(), ': Loading Gower distance matrix from ', input_path + file_name + 'gower_dist_avg.npy') gower_dist_avg = np.load(input_path + file_name + 'gower_dist_avg.npy') # load if spy == True: all_spies = pd.DataFrame() ''' Create train set by concatinating positive and negative set, build model(s) using autoML and store predictions based on the best model ''' for rep in range(0, replicates_cnt): print(datetime.now(), ': Replicate #', rep + 1) if spy == True: # Exclude spy_portion of training data to be the spies positive_set = positive_set.sample(n=len(positive_set), random_state=rep * 100).reset_index(drop=True) spySet_size = round(len(positive_set) * spy_portion) spies_ID = positive_set.loc[1:spySet_size, ['SpecId']] positive_set_wSpy = positive_set.iloc[spySet_size + 1:len(positive_set)] if RN == False: if rnd_all == True: # Negative set includes all decoys negative_set_idx = decoys_idx else: # Negative set idx includes rnd_portion times of |positive_set| indecies random.seed(rep) random.shuffle(decoys_idx) negative_set_idx = decoys_idx[0:rnd_portion * len(positive_set)] else: print(datetime.now(), ': Starts estimating RNs') negative_set_idx = reliable_negative(test_data, positive_set, subclusterCount, rep) print(datetime.now(), ': Ends estimating RNs') negative_set = test_data.iloc[negative_set_idx] if spy == True: train_data = pd.concat([positive_set_wSpy, negative_set], axis=0) else: train_data = pd.concat([positive_set, negative_set], axis=0) y_train = train_data['Class'] if include_label == True: X_train = train_data.drop(['SpecId', 'Peptide', 'Class'], axis=1) else: X_train = train_data.drop(['SpecId', 'Peptide', 'Class', 'Label'], axis=1) print('Training set size:', len(y_train), '\nTest set size:', len(y_test)) automl_config = AutoMLConfig(task='classification', debug_log=log_file, primary_metric=metric, iteration_timeout_minutes=200, iterations=autoML_iterations, verbosity=logging.INFO, preprocess=AML_preprocess, X=X_train, y=y_train, n_cross_validations=cv_fold, model_explainability=True) print(datetime.now(), ': modeling replicate #' + str(rep + 1) + '...') local_run = experiment.submit(automl_config, show_output=True) if autoML_best_model_selection == False: # Retrieve the Best Model based on bunch of metrics children = list(local_run.get_children()) metricslist = {} for run in children: properties = run.get_properties() metrics = { k: v for k, v in run.get_metrics().items() if isinstance(v, float) } metricslist[int(properties['iteration'])] = metrics rundata = pd.DataFrame(metricslist).sort_index(1) tmp = rundata.T.sort_values([ 'AUC_weighted', 'f1_score_weighted', 'precision_score_weighted', 'recall_score_weighted', 'weighted_accuracy' ], ascending=False) rundata = tmp.sort_values('log_loss', ascending=True).T best_run_iteration = rundata.columns.values[0] rundata.to_csv(output_path + '_metrics_list_' + str(rep) + '.txt') best_run, fitted_model = local_run.get_output( iteration=best_run_iteration) else: best_run, 
fitted_model = local_run.get_output() print('Best run: ', best_run) print(datetime.now(), ': Saving best model and predictions') # Save the best model, prediction value and probability modelname = output_path + '_model_' + str(rep) + '.sav' joblib.dump(fitted_model, modelname) y_pred_val = fitted_model.predict(X_test) y_pred_prob = fitted_model.predict_proba(X_test) # Add the results of the replicate to all predictions table all_predictions['pred_rep' + str(rep)] = list(y_pred_val) all_predictions['prob_rep' + str(rep)] = list( [item[1] for item in y_pred_prob]) # Overwrite prediction values based on the spies cutoff if spy == True: threshold = min( pd.merge(spies_ID, all_predictions, on='SpecId')['prob_rep' + str(rep)]) all_predictions['pred_rep' + str(rep)] = np.where( all_predictions['prob_rep' + str(rep)] >= threshold, 1, 0) all_spies['SpecId' + str(rep)] = spies_ID['SpecId'] all_spies['Prob_rep' + str(rep)] = list( pd.merge(spies_ID, all_predictions, on=['SpecId'])['prob_rep' + str(rep)]) print(datetime.now(), ': Replicate #' + str(rep + 1) + ' processed!') all_predictions.to_csv(output_path + '_all_predictions.csv', index=False) if spy == True: all_spies.to_csv(output_path + '_all_spies.csv', index=False) print(datetime.now(), ': Generate prediction summary of all replicates') pred_col_indecies = [ col for col in all_predictions.columns if 'pred' in col ] prob_col_indecies = [ col for col in all_predictions.columns if 'prob' in col ] prediction_summary['Std'] = all_predictions[prob_col_indecies].std( skipna=True, axis=1) prediction_summary['Min'] = all_predictions[prob_col_indecies].min( skipna=True, axis=1) prediction_summary['Max'] = all_predictions[prob_col_indecies].max( skipna=True, axis=1) prediction_summary['Avg'] = all_predictions[prob_col_indecies].mean( skipna=True, axis=1) prediction_summary['Median'] = all_predictions[prob_col_indecies].median( skipna=True, axis=1) prediction_summary['Vote'] = all_predictions[pred_col_indecies].sum( skipna=True, axis=1) prediction_summary.to_csv(output_path + '_prediction_summary.txt', sep='\t', index=False) # Feature importance print(datetime.now(), ': Output feature importance of the best run') client = ExplanationClient.from_run(best_run) raw_explanations = client.download_model_explanation( top_k=len(X_test.columns)) print('Raw feature importance') print(raw_explanations.get_feature_importance_dict()) d = raw_explanations.get_feature_importance_dict() raw_feature_importance = pd.DataFrame(list(d.items())) raw_feature_importance.to_csv(output_path + '_raw_feature_importance.csv', index=False) # Engineered engineered_explanations = client.download_model_explanation( top_k=len(X_test.columns)) print('Engineered feature importance') print(engineered_explanations.get_feature_importance_dict()) d = engineered_explanations.get_feature_importance_dict() engineered_feature_importance = pd.DataFrame(list(d.items())) engineered_feature_importance.to_csv(output_path + '_engineered_feature_importance.csv', index=False) now = datetime.now() print(datetime.now(), ': Program end')
def get_gower_matrix(self):
    distances = gower.gower_matrix(self.data)
    distances = pd.DataFrame(distances, index=self.data.index)
    distances.columns = distances.index
    # Replace the zero self-distances so a row is never its own nearest match
    distances = distances.replace(0, 1000)
    return distances
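# Hedged standalone sketch showing the same idea outside the class (the demo
# frame is hypothetical): masking the zero diagonal lets a nearest-neighbour
# lookup return the closest *other* row instead of the row itself.
import pandas as pd
import gower

demo = pd.DataFrame({"a": [1.0, 2.0, 10.0], "b": ["x", "x", "y"]})
d = pd.DataFrame(gower.gower_matrix(demo), index=demo.index, columns=demo.index)
nearest = d.replace(0, 1000).idxmin(axis=1)
print(nearest)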
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
import gower
import prince
from sklearn import manifold

from components import (ALL_STATS, BASIC_STATS, TYPES, GENS, DATA, BASE_COLS,
                        OTHER, SIZE_COLS, CATEGORICAL_COLS_GOWER,
                        NUMERICAL_COLS_GOWER)

GOWER_MATRIX = gower.gower_matrix(
    DATA.fillna('None')[CATEGORICAL_COLS_GOWER + NUMERICAL_COLS_GOWER])


def update_stats_scatter(stats, types, generations, yaxis):
    labels = BASE_COLS + stats + OTHER
    df = DATA[labels]
    df = df[((df['Type 1'].isin(types)) | (df['Type 2'].isin(types)))
            & (df['Generation'].isin(generations))]
    df_long = pd.melt(df,
                      id_vars=BASE_COLS + OTHER,
                      value_vars=stats,
                      var_name="Stat")
    if len(stats) == 1:
        fig = px.scatter(
            df,
def DDGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \ eps = 1E-05, maxstep = 100, seed = None, perform_selec = True): ''' Fit a Generalized Linear Mixture of Latent Variables Model (GLMLVM) y (numobs x p ndarray): The observations containing categorical variables n_clusters (int): The number of clusters to look for in the data r (list): The dimension of latent variables through the first 2 layers k (list): The number of components of the latent Gaussian mixture layers init (dict): The initialisation parameters for the algorithm var_distrib (p 1darray): An array containing the types of the variables in y nj (p 1darray): For binary/count data: The maximum values that the variable can take. For ordinal data: the number of different existing categories for each variable it (int): The maximum number of MCEM iterations of the algorithm eps (float): If the likelihood increase by less than eps then the algorithm stops maxstep (int): The maximum number of optimisation step for each variable seed (int): The random state seed to set (Only for numpy generated data for the moment) perform_selec (Bool): Whether to perform architecture selection or not ------------------------------------------------------------------------------------------------ returns (dict): The predicted classes, the likelihood through the EM steps and a continuous representation of the data ''' prev_lik = -1E16 best_lik = -1E16 tol = 0.01 max_patience = 1 patience = 0 best_k = deepcopy(k) best_r = deepcopy(r) best_sil = -1 new_sil = -1 # Initialize the parameters eta = deepcopy(init['eta']) psi = deepcopy(init['psi']) lambda_bin = deepcopy(init['lambda_bin']) lambda_ord = deepcopy(init['lambda_ord']) lambda_categ = deepcopy(init['lambda_categ']) H = deepcopy(init['H']) w_s = deepcopy( init['w_s'] ) # Probability of path s' through the network for all s' in Omega numobs = len(y) likelihood = [] it_num = 0 ratio = 1000 np.random.seed = seed # Dispatch variables between categories y_bin = y[:, np.logical_or(var_distrib == 'bernoulli', var_distrib == 'binomial')] nj_bin = nj[np.logical_or(var_distrib == 'bernoulli', var_distrib == 'binomial')].astype(int) nb_bin = len(nj_bin) y_categ = y[:, var_distrib == 'categorical'] nj_categ = nj[var_distrib == 'categorical'].astype(int) nb_categ = len(nj_categ) y_ord = y[:, var_distrib == 'ordinal'] nj_ord = nj[var_distrib == 'ordinal'].astype(int) nb_ord = len(nj_ord) L = len(k) k_aug = k + [1] S = np.array([np.prod(k_aug[l:]) for l in range(L + 1)]) M = M_growth(1, r, numobs) assert nb_ord + nb_bin + nb_categ > 0 # Compute the Gower matrix cat_features = np.logical_or(var_distrib == 'categorical', var_distrib == 'bernoulli') dm = gower_matrix(y, cat_features=cat_features) while (it_num < it) & ((ratio > eps) | (patience <= max_patience)): print(it_num) # The clustering layer is the one used to perform the clustering # i.e. 
the layer l such that k[l] == n_clusters clustering_layer = np.argmax(np.array(k) == n_clusters) ##################################################################################### ################################# S step ############################################ ##################################################################################### #===================================================================== # Draw from f(z^{l} | s, Theta) for all s in Omega #===================================================================== mu_s, sigma_s = compute_path_params(eta, H, psi) sigma_s = ensure_psd(sigma_s) z_s, zc_s = draw_z_s(mu_s, sigma_s, eta, M) ''' print('mu_s', np.abs(mu_s[0]).mean()) print('sigma_s', np.abs(sigma_s[0]).mean()) print('z_s0', np.abs(z_s[0]).mean()) print('z_s1', np.abs(z_s[1]).mean(0)[:,0]) ''' #======================================================================== # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1 #======================================================================== chsi = compute_chsi(H, psi, mu_s, sigma_s) chsi = ensure_psd(chsi) rho = compute_rho(eta, H, psi, mu_s, sigma_s, zc_s, chsi) # In the following z2 and z1 will denote z^{l+1} and z^{l} respectively z2_z1s = draw_z2_z1s(chsi, rho, M, r) #======================================================================= # Compute the p(y| z1) for all variable categories #======================================================================= py_zl1 = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord, lambda_categ, y_categ, nj_categ, z_s[0]) #======================================================================== # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s #======================================================================== zl1_ys = draw_zl1_ys(z_s, py_zl1, M) ##################################################################################### ################################# E step ############################################ ##################################################################################### #===================================================================== # Compute conditional probabilities used in the appendix of asta paper #===================================================================== pzl1_ys, ps_y, p_y = E_step_GLLVM(z_s[0], mu_s[0], sigma_s[0], w_s, py_zl1) #del(py_zl1) #===================================================================== # Compute p(z^{(l)}| s, y). 
Equation (5) of the paper #===================================================================== pz2_z1s = fz2_z1s(t(pzl1_ys, (1, 0, 2)), z2_z1s, chsi, rho, S) pz_ys = fz_ys(t(pzl1_ys, (1, 0, 2)), pz2_z1s) #===================================================================== # Compute MFA expectations #===================================================================== Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys = \ E_step_DGMM(zl1_ys, H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, S) ########################################################################### ############################ M step ####################################### ########################################################################### #======================================================= # Compute MFA Parameters #======================================================= w_s = np.mean(ps_y, axis=0) eta, H, psi = M_step_DGMM(Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys, ps_y, H, k) #======================================================= # Identifiability conditions #======================================================= # Update eta, H and Psi values H = diagonal_cond(H, psi) Ez, AT = compute_z_moments(w_s, eta, H, psi) eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT) del (Ez) #======================================================= # Compute GLLVM Parameters #======================================================= # We optimize each column separately as it is faster than all column jointly # (and more relevant with the independence hypothesis) lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y, pzl1_ys, z_s[0], AT[0],\ tol = tol, maxstep = maxstep) lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y, pzl1_ys, z_s[0], AT[0],\ tol = tol, maxstep = maxstep) lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y, pzl1_ys, z_s[0], AT[0],\ tol = tol, maxstep = maxstep) ########################################################################### ################## Clustering parameters updating ######################### ########################################################################### new_lik = np.sum(np.log(p_y)) likelihood.append(new_lik) ratio = (new_lik - prev_lik) / abs(prev_lik) print(likelihood) idx_to_sum = tuple(set(range(1, L + 1)) - set([clustering_layer + 1])) psl_y = ps_y.reshape(numobs, *k, order='C').sum(idx_to_sum) temp_class = np.argmax(psl_y, axis=1) try: new_sil = silhouette_score(dm, temp_class, metric='precomputed') except ValueError: new_sil = -1 print('Silhouette score:', new_sil) if best_sil < new_sil: z = (ps_y[..., n_axis] * Ez_ys[clustering_layer]).sum(1) best_sil = deepcopy(new_sil) classes = deepcopy(temp_class) fig = plt.figure(figsize=(8, 8)) plt.scatter(z[:, 0], z[:, 1]) plt.show() # Refresh the classes only if they provide a better explanation of the data if best_lik < new_lik: best_lik = deepcopy(prev_lik) if prev_lik < new_lik: patience = 0 M = M_growth(it_num + 2, r, numobs) else: patience += 1 ########################################################################### ######################## Parameter selection ############################# ########################################################################### is_not_min_specif = not (np.all(np.array(k) == n_clusters) & np.array_equal(r, [2, 1])) if look_for_simpler_network( it_num) & perform_selec & is_not_min_specif: r_to_keep = r_select(y_bin, y_ord, y_categ, zl1_ys, z2_z1s, w_s) # If r_l == 0, delete the last l + 1: layers new_L = np.sum([len(rl) != 0 for rl in r_to_keep]) - 1 
k_to_keep = k_select(w_s, k, new_L, clustering_layer) is_L_unchanged = L == new_L is_r_unchanged = np.all( [len(r_to_keep[l]) == r[l] for l in range(new_L + 1)]) is_k_unchanged = np.all( [len(k_to_keep[l]) == k[l] for l in range(new_L)]) is_selection = not (is_r_unchanged & is_k_unchanged & is_L_unchanged) assert new_L > 0 if is_selection: eta = [eta[l][k_to_keep[l]] for l in range(new_L)] eta = [eta[l][:, r_to_keep[l]] for l in range(new_L)] H = [H[l][k_to_keep[l]] for l in range(new_L)] H = [H[l][:, r_to_keep[l]] for l in range(new_L)] H = [H[l][:, :, r_to_keep[l + 1]] for l in range(new_L)] psi = [psi[l][k_to_keep[l]] for l in range(new_L)] psi = [psi[l][:, r_to_keep[l]] for l in range(new_L)] psi = [psi[l][:, :, r_to_keep[l]] for l in range(new_L)] if nb_bin > 0: # Add the intercept: bin_r_to_keep = np.concatenate([[0], np.array(r_to_keep[0]) + 1 ]) lambda_bin = lambda_bin[:, bin_r_to_keep] if nb_ord > 0: # Intercept coefficients handling is a little more complicated here lambda_ord_intercept = [ lambda_ord_j[:-r[0]] for lambda_ord_j in lambda_ord ] Lambda_ord_var = np.stack( [lambda_ord_j[-r[0]:] for lambda_ord_j in lambda_ord]) Lambda_ord_var = Lambda_ord_var[:, r_to_keep[0]] lambda_ord = [np.concatenate([lambda_ord_intercept[j], Lambda_ord_var[j]])\ for j in range(nb_ord)] if nb_categ > 0: lambda_categ_intercept = [ lambda_categ[j][:, 0] for j in range(nb_categ) ] Lambda_categ_var = [ lambda_categ_j[:, -r[0]:] for lambda_categ_j in lambda_categ ] Lambda_categ_var = [ lambda_categ_j[:, r_to_keep[0]] for lambda_categ_j in lambda_categ ] lambda_categ = [np.hstack([lambda_categ_intercept[j][..., n_axis], Lambda_categ_var[j]])\ for j in range(nb_categ)] w = w_s.reshape(*k, order='C') new_k_idx_grid = np.ix_(*k_to_keep[:new_L]) # If layer deletion, sum the last components of the paths if L > new_L: deleted_dims = tuple(range(L)[new_L:]) w_s = w[new_k_idx_grid].sum(deleted_dims).flatten( order='C') else: w_s = w[new_k_idx_grid].flatten(order='C') w_s /= w_s.sum() k = [len(k_to_keep[l]) for l in range(new_L)] r = [len(r_to_keep[l]) for l in range(new_L + 1)] k_aug = k + [1] S = np.array([np.prod(k_aug[l:]) for l in range(new_L + 1)]) L = new_L patience = 0 best_r = deepcopy(r) best_k = deepcopy(k) # Identifiability conditions H = diagonal_cond(H, psi) Ez, AT = compute_z_moments(w_s, eta, H, psi) eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT) print('New architecture:') print('k', k) print('r', r) print('L', L) print('S', S) print("w_s", len(w_s)) prev_lik = deepcopy(new_lik) it_num = it_num + 1 out = dict(likelihood = likelihood, classes = classes, z = z, \ best_r = best_r, best_k = best_k) return (out)
def post(self, request):
    # request = self.request.data
    null = np.nan
    # stores = request['data']['teststores']

    ########
    filename = request.FILES['store_mstr']
    if filename:
        filename_check = filename.name
        ext = [".xlsx", ".csv", ".xls"]
        if filename_check.endswith(tuple(ext)):
            extension = os.path.splitext(filename_check)[1]
            if extension == '.csv':
                data = pd.read_csv(filename)
            else:
                teststores = pd.read_excel(filename)

    store_features = request.POST['store_features']
    # stores = request.POST['teststores']

    ##############
    # stores = pd.DataFrame(eval(stores))
    # teststores = pd.DataFrame.from_dict(stores, orient='columns')
    # store_features = request['data']['store_features']

    if teststores is not None:
        mandatory_features = ['Banner', 'Outlet_surface', 'Shelf_meters_Choc',
                              'Shelf_meters_Dog', 'Shelf_meters_Cat',
                              'Influence_Overall', 'CSV_of_outlet']
        # store_features = mandatory_features + store_features

        # stores_master_df = pd.read_excel("datas/TL_StoreMstr.xlsx")
        # test_master_df = pd.read_excel("TL_TestMstr.xlsx")
        # teststore_map_df = pd.read_excel("TL_Teststore_map.xlsx")
        # controlstore_map_df = pd.read_excel("TL_Controlstore_Mstr.xlsx")

        Allstores = StoreMstr.objects.filter(is_active=True, is_deleted=False)
        Teststores = TestStoreSerializer(Allstores, many=True)
        stores_master_df = pd.DataFrame(Teststores.data)

        Alltest = TestMstr.objects.filter(is_active=True, is_deleted=False)
        Testmst = TestSerializer(Alltest, many=True)
        test_master_df = pd.DataFrame(Testmst.data)
        # test_master_df = pd.read_excel("datas/TL_TestMstr.xlsx")
        if test_master_df.empty:
            columns = ['test_id', 'test_name', 'test_desc', 'testtype',
                       'target_var', 'territory_name', 'store_segment',
                       'category_name', 'confidence_lev', 'margin_oferror',
                       'std_deviation', 'pre_start', 'pre_end', 'test_window',
                       'testwin_start', 'testwin_end', 'stage_id', 'created_on',
                       'modified_on', 'is_active', 'deleted_at', 'is_deleted']
            test_master_df = pd.DataFrame(columns=columns)

        Alltestmap = TestStoreMap.objects.filter(is_active=True, is_deleted=False)
        Teststop = TestStoreMapSerializer(Alltestmap, many=True)
        teststore_map_df = pd.DataFrame(Teststop.data)
        # teststore_map_df = pd.read_excel("datas/TL_Teststore_map.xlsx")

        controlstore = ControlStoreMstr.objects.filter(is_active=True, is_deleted=False)
        controlstores = ControlstoreSerializer(controlstore, many=True)
        controlstore_map_df = pd.DataFrame(Teststop.data)

        # Eliminating the invalid stores from the population
        stores_master_df = check_if_store_valid(storesfile=stores_master_df)
        stores_master_df = stores_master_df[stores_master_df["Is Valid Store"] == 1]

        # ELIMINATING ALL THE TEST AND CONTROL STORES WHICH ARE CURRENTLY ACTIVE
        # Get all the active tests (test ids)
        active_tests_df = test_master_df[test_master_df["is_active"] == True]
        if active_tests_df.shape[0] != 0:
            # THIS MEANS THERE ARE ACTIVE TESTS AND WE NEED TO ELIMINATE ACTIVE TEST AND CONTROL STORES
            # Get the active test stores using test ids
            active_test_stores = teststore_map_df[
                teststore_map_df["test_id"].isin(active_tests_df["test_id"])]
            # Get the active control stores using test ids
            active_control_stores = controlstore_map_df[
                controlstore_map_df["test_id"].isin(active_tests_df["test_id"])]
            # Remove active test and control stores from the population
            stores_master_df = stores_master_df[
                ~stores_master_df["store_id"].isin(active_test_stores["store_id"])]
            stores_master_df = stores_master_df[
                ~stores_master_df["store_id"].isin(active_control_stores["store_id"])]
        else:
            # THIS MEANS THERE ARE NO ACTIVE TESTS AND WE NEED NOT ELIMINATE ANY TEST AND CONTROL STORES
            pass

        # ELIMINATING THE TEST STORES FROM THE POPULATION
        stores_master_df = stores_master_df[
            ~(stores_master_df["Partner_ID"].isin(teststores["Partner_ID"]))]

        refA = teststores.copy(deep=True)
        refB = stores_master_df.copy(deep=True)

        useA = refA[store_features].copy(deep=True)
        useB = refB[store_features].copy(deep=True)

        scaler = StandardScaler()
        scale_cols = [item for item in store_features if item != "Banner"]
        useA[scale_cols] = scaler.fit_transform(useA[scale_cols])
        useB[scale_cols] = scaler.fit_transform(useB[scale_cols])

        gowermatrix = gower.gower_matrix(useA, useB)

        # Identifying similar stores
        df_list = list()
        for test_pid, row in zip(refA["Partner ID"], gowermatrix):
            df = refB.copy(deep=True)
            df["Gower_Distance"] = list(row)
            df = df.sort_values(by="Gower_Distance", ascending=True)
            df["Test store Partner ID"] = test_pid
            df["Gower_Distance"] = df["Gower_Distance"].apply(lambda x: round(x, 2))
            df["Similarity Measure"] = df["Gower_Distance"].apply(lambda x: 1 - x)
            df_list.append(df.head(1))

        control_stores = pd.concat(df_list)
        control_stores["Checked_Flag"] = 0
        finalcontrol_stores = control_stores.reset_index().to_json(orient='records')
        return json.Response(finalcontrol_stores, True)
    else:
        return json.Response("Please pass Test stores", False)
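# Hedged standalone sketch (hypothetical helper, not part of the view above): the
# core matching step can be phrased as "for each test store, keep the population
# store with the smallest Gower distance". Assumes pandas, numpy and gower are
# already imported in this module, as the view relies on.
def nearest_control(test_df, population_df, feature_cols):
    # (n_test, n_population) Gower distances, one row per test store
    d = gower.gower_matrix(test_df[feature_cols], population_df[feature_cols])
    picks = population_df.iloc[np.argmin(d, axis=1)].copy()
    picks["Gower_Distance"] = d.min(axis=1).round(2)
    picks["Similarity Measure"] = 1 - picks["Gower_Distance"]
    picks["Test store index"] = test_df.index.to_numpy()
    return picks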
cluster_telco_centroid = pd.Series(telco_centroid.labels_)
telco_data["cluster"] = cluster_telco_centroid
telco_data.iloc[:, 0:29].groupby(telco_data.cluster).mean()

import os
telco_data.to_csv("final_telco_data.csv", encoding="utf-8")
os.getcwd()

# using Gower's clustering for mixed data
import gower
from scipy.cluster.hierarchy import fcluster, dendrogram

gowers_matrix = gower.gower_matrix(telco_data)
gowers_linkage = linkage(gowers_matrix)
gcluster = fcluster(gowers_linkage, 3, criterion='maxclust')
dendrogram(gowers_linkage)

telco_data["cluster"] = gcluster
telco_data.iloc[:, 0:29].groupby(telco_data.cluster).mean()

import os
telco_data.to_csv("final2_telco_data.csv", encoding="utf-8")
os.getcwd()

############################# Problem 4 ####################################################

import pandas as pd
def MDGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \ eps = 1E-05, maxstep = 100, seed = None, perform_selec = True): ''' Fit a Generalized Linear Mixture of Latent Variables Model (GLMLVM) y (numobs x p ndarray): The observations containing mixed variables n_clusters (int or str): The number of clusters to look for in the data or the use mode of the MDGMM r (dict): The dimension of latent variables through the first 2 layers k (dict): The number of components of the latent Gaussian mixture layers init (dict): The initialisation parameters for the algorithm var_distrib (p 1darray): An array containing the types of the variables in y nj (p 1darray): For binary/count data: The maximum values that the variable can take. For ordinal data: the number of different existing categories for each variable For categorical data: the number of different existing categories for each variable it (int): The maximum number of MCEM iterations of the algorithm eps (float): If the likelihood increase by less than eps then the algorithm stops maxstep (int): The maximum number of optimisation step for each variable seed (int): The random state seed to set (Only for numpy generated data for the moment) perform_selec (Bool): Whether to perform architecture selection or not ------------------------------------------------------------------------------------------------ returns (dict): The predicted classes, the likelihood through the EM steps and a continuous representation of the data ''' # Break the reference link k = deepcopy(k) r = deepcopy(r) best_k = deepcopy(k) best_r = deepcopy(r) # Add other checks for the other variables check_inputs(k, r) prev_lik = - 1E15 best_lik = -1E15 tol = 0.01 max_patience = 1 patience = 0 #==================================================== # Initialize the parameters #==================================================== eta_c, eta_d, H_c, H_d, psi_c, psi_d = dispatch_dgmm_init(init) lambda_bin, lambda_ord, lambda_categ = dispatch_gllvm_init(init) w_s_c, w_s_d = dispatch_paths_init(init) numobs = len(y) likelihood = [] it_num = 0 ratio = 1000 np.random.seed = seed #==================================================== # Dispatch variables between categories #==================================================== y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',\ var_distrib == 'binomial')] nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',\ var_distrib == 'binomial')] nj_bin = nj_bin.astype(int) nb_bin = len(nj_bin) y_ord = y[:, var_distrib == 'ordinal'] nj_ord = nj[var_distrib == 'ordinal'] nj_ord = nj_ord.astype(int) nb_ord = len(nj_ord) y_categ = y[:, var_distrib == 'categorical'] nj_categ = nj[var_distrib == 'categorical'].astype(int) nb_categ = len(nj_categ) yc = y[:, var_distrib == 'continuous'] ss = StandardScaler() yc = ss.fit_transform(yc) nb_cont = yc.shape[1] # *_1L standsds for quantities going through all the network (head + tail) k_1L, L_1L, L, bar_L, S_1L = nb_comps_and_layers(k) r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']} best_sil = [-1.1 for l in range(L['t'] - 1)] if n_clusters == 'multi' else -1.1 new_sil = [-1.1 for l in range(L['t'] - 1)] if n_clusters == 'multi' else -1.1 M = M_growth(1, r_1L, numobs) if nb_bin + nb_ord + nb_categ == 0: # Create the InputError class and change this raise ValueError('Input does not contain discrete variables,\ consider using a regular DGMM') if nb_cont == 0: # Create the InputError class and change this raise ValueError('Input does not contain continuous values,\ consider using a DDGMM') 
# Compute the Gower matrix cat_features = np.logical_or(var_distrib == 'categorical', var_distrib == 'bernoulli') dm = gower_matrix(y, cat_features = cat_features) while (it_num < it) & ((ratio > eps) | (patience <= max_patience)): print(it_num) # The clustering layer is the one used to perform the clustering # i.e. the layer l such that k[l] == n_clusters if not(isnumeric(n_clusters)): if n_clusters == 'auto': clustering_layer = 0 elif n_clusters == 'multi': clustering_layer = list(range(L['t'] - 1)) else: raise ValueError('Please enter an int, auto or multi for n_clusters') else: assert (np.array(k['t']) == n_clusters).any() clustering_layer = np.argmax(np.array(k['t']) == n_clusters) ##################################################################################### ################################# MC step ############################################ ##################################################################################### #===================================================================== # Draw from f(z^{l} | s, Theta) for both heads and tail #===================================================================== mu_s_c, sigma_s_c = compute_path_params(eta_c, H_c, psi_c) sigma_s_c = ensure_psd(sigma_s_c) mu_s_d, sigma_s_d = compute_path_params(eta_d, H_d, psi_d) sigma_s_d = ensure_psd(sigma_s_d) z_s_c, zc_s_c, z_s_d, zc_s_d = draw_z_s_all_network(mu_s_c, sigma_s_c,\ mu_s_d, sigma_s_d, yc, eta_c, eta_d, S_1L, L, M) #======================================================================== # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1 #======================================================================== # Create wrapper as before and after chsi_c = compute_chsi(H_c, psi_c, mu_s_c, sigma_s_c) chsi_c = ensure_psd(chsi_c) rho_c = compute_rho(eta_c, H_c, psi_c, mu_s_c, sigma_s_c, zc_s_c, chsi_c) chsi_d = compute_chsi(H_d, psi_d, mu_s_d, sigma_s_d) chsi_d = ensure_psd(chsi_d) rho_d = compute_rho(eta_d, H_d, psi_d, mu_s_d, sigma_s_d, zc_s_d, chsi_d) # In the following z2 and z1 will denote z^{l+1} and z^{l} respectively z2_z1s_c, z2_z1s_d = draw_z2_z1s_network(chsi_c, chsi_d, rho_c, \ rho_d, M, r_1L, L) #======================================================================= # Compute the p(y^D| z1) for all discrete variables #======================================================================= py_zl1_d = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord,\ lambda_categ, y_categ, nj_categ, z_s_d[0]) #======================================================================== # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s #======================================================================== zl1_ys_d = draw_zl1_ys(z_s_d, py_zl1_d, M['d']) ##################################################################################### ################################# E step ############################################ ##################################################################################### #===================================================================== # Compute quantities necessary for E steps of both heads and tail #===================================================================== # Discrete head quantities pzl1_ys_d, ps_y_d, py_d = E_step_GLLVM(z_s_d[0], mu_s_d[0], sigma_s_d[0], w_s_d, py_zl1_d) py_s_d = ps_y_d * py_d / w_s_d[n_axis] # Continuous head quantities ps_y_c, py_s_c, py_c = continuous_lik(yc, mu_s_c[0], sigma_s_c[0], w_s_c) pz_s_d = fz_s(z_s_d, mu_s_d, sigma_s_d) pz_s_c = fz_s(z_s_c, mu_s_c, sigma_s_c) 
#===================================================================== # Compute p(z^{(l)}| s, y). Equation (5) of the paper #===================================================================== # Compute pz2_z1s_d and pz2_z1s_d for the tail indices whereas it is useless pz2_z1s_d = fz2_z1s(t(pzl1_ys_d, (1, 0, 2)), z2_z1s_d, chsi_d, rho_d, S_1L['d']) pz_ys_d = fz_ys(t(pzl1_ys_d, (1, 0, 2)), pz2_z1s_d) pz2_z1s_c = fz2_z1s([], z2_z1s_c, chsi_c, rho_c, S_1L['c']) pz_ys_c = fz_ys([], pz2_z1s_c) pz2_z1s_t = fz2_z1s([], z2_z1s_c[bar_L['c']:], chsi_c[bar_L['c']:], \ rho_c[bar_L['c']:], S_1L['t']) # Junction layer computations # Compute p(zC |s) py_zs_d = fy_zs(pz_ys_d, py_s_d) py_zs_c = fy_zs(pz_ys_c, py_s_c) # Compute p(zt | yC, yD, sC, SD) pzt_yCyDs = fz_yCyDs(py_zs_c, pz_ys_d, py_s_c, M, S_1L, L) #===================================================================== # Compute MFA expectations #===================================================================== # Discrete head. Ez_ys_d, E_z1z2T_ys_d, E_z2z2T_ys_d, EeeT_ys_d = \ E_step_DGMM_d(zl1_ys_d, H_d, z_s_d, zc_s_d, z2_z1s_d, pz_ys_d,\ pz2_z1s_d, S_1L['d'], L['d']) # Continuous head Ez_ys_c, E_z1z2T_ys_c, E_z2z2T_ys_c, EeeT_ys_c = \ E_step_DGMM_c(H_c, z_s_c, zc_s_c, z2_z1s_c, pz_ys_c,\ pz2_z1s_c, S_1L['c'], L['c']) # Junction layers Ez_ys_t, E_z1z2T_ys_t, E_z2z2T_ys_t, EeeT_ys_t = \ E_step_DGMM_t(H_c[bar_L['c']:], \ z_s_c[bar_L['c']:], zc_s_c[bar_L['c']:], z2_z1s_c[bar_L['c']:],\ pzt_yCyDs, pz2_z1s_t, S_1L, L, k_1L) # Error here for the first two terms: p(y^h | z^t, s^C) != p(y^h | z^t, s^{1C:L}) pst_yCyD = fst_yCyD(py_zs_c, py_zs_d, pz_s_d, w_s_c, w_s_d, k_1L, L) ########################################################################### ############################ M step ####################################### ########################################################################### #======================================================= # Compute DGMM Parameters #======================================================= # Discrete head w_s_d = np.mean(ps_y_d, axis = 0) eta_d_barL, H_d_barL, psi_d_barL = M_step_DGMM(Ez_ys_d, E_z1z2T_ys_d, E_z2z2T_ys_d, \ EeeT_ys_d, ps_y_d, H_d, k_1L['d'][:-1],\ L_1L['d'], r_1L['d']) # Add dispatching function here eta_d[:bar_L['d']] = eta_d_barL H_d[:bar_L['d']] = H_d_barL psi_d[:bar_L['d']] = psi_d_barL # Continuous head w_s_c = np.mean(ps_y_c, axis = 0) eta_c_barL, H_c_barL, psi_c_barL = M_step_DGMM(Ez_ys_c, E_z1z2T_ys_c, E_z2z2T_ys_c, \ EeeT_ys_c, ps_y_c, H_c, k_1L['c'][:-1],\ L_1L['c'] + 1, r_1L['c']) eta_c[:bar_L['c']] = eta_c_barL H_c[:bar_L['c']] = H_c_barL psi_c[:bar_L['c']] = psi_c_barL # Common tail eta_t, H_t, psi_t, Ezst_y = M_step_DGMM_t(Ez_ys_t, E_z1z2T_ys_t, E_z2z2T_ys_t, \ EeeT_ys_t, ps_y_c, ps_y_d, pst_yCyD, \ H_c[bar_L['c']:], S_1L, k_1L, \ L_1L, L, r_1L['t']) eta_d[bar_L['d']:] = eta_t H_d[bar_L['d']:] = H_t psi_d[bar_L['d']:] = psi_t eta_c[bar_L['c']:] = eta_t H_c[bar_L['c']:] = H_t psi_c[bar_L['c']:] = psi_t #======================================================= # Identifiability conditions #======================================================= w_s_t = np.mean(pst_yCyD, axis = 0) eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = network_identifiability(eta_d, \ H_d, psi_d, eta_c, H_c, psi_c, w_s_c, w_s_d, w_s_t, bar_L) #======================================================= # Compute GLLVM Parameters #======================================================= # We optimize each column separately as it is faster than all column jointly # (and more relevant with the 
independence hypothesis) lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y_d, \ pzl1_ys_d, z_s_d[0], AT_d[0], tol = tol, maxstep = maxstep) lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y_d, \ pzl1_ys_d, z_s_d[0], AT_d[0], tol = tol, maxstep = maxstep) lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y_d,\ pzl1_ys_d, z_s_d[0], AT_d[0], tol = tol, maxstep = maxstep) ########################################################################### ################## Clustering parameters updating ######################### ########################################################################### new_lik = np.sum(np.log(py_d) + np.log(py_c)) likelihood.append(new_lik) ratio = (new_lik - prev_lik)/abs(prev_lik) if n_clusters == 'multi': temp_classes = [] z_tail = [] classes = [[] for l in range(L['t'] - 1)] for l in clustering_layer: idx_to_sum = tuple(set(range(1, L['t'] + 1)) -\ set([clustering_layer[l] + 1])) psl_y = pst_yCyD.reshape(numobs, *k['t'],\ order = 'C').sum(idx_to_sum) temp_class_l = np.argmax(psl_y, axis = 1) sil_l = silhouette_score(dm, temp_class_l, metric = 'precomputed') temp_classes.append(temp_class_l) #z_tail.append(Ezst_y[l].sum(1)) new_sil[l] = sil_l #z_tail = [] for l in range(L['t'] - 1): zl = Ezst_y[l].sum(1) z_tail.append(zl) if best_sil[l] < new_sil[l]: # Update the quantity if the silhouette score is better best_sil[l] = deepcopy(new_sil[l]) classes[l] = deepcopy(temp_classes[l]) if zl.shape[-1] == 3: plot_3d(zl, classes[l]) elif zl.shape[-1] == 2: plot_2d(zl, classes[l]) else: idx_to_sum = tuple(set(range(1, L['t'] + 1)) - set([clustering_layer + 1])) psl_y = pst_yCyD.reshape(numobs, *k['t'], order = 'C').sum(idx_to_sum) temp_classes = np.argmax(psl_y, axis = 1) try: new_sil = silhouette_score(dm, temp_classes, metric = 'precomputed') except: new_sil = -1 z_tail = [Ezst_y[l].sum(1) for l in range(L['t'] - 1)] if best_sil < new_sil: # Update the quantity if the silhouette score is better zl = z_tail[clustering_layer] best_sil = deepcopy(new_sil) classes = deepcopy(temp_classes) if zl.shape[-1] == 3: plot_3d(zl, classes) elif zl.shape[-1] == 2: plot_2d(zl, classes) # Refresh the likelihood if best if best_lik < new_lik: best_lik = deepcopy(prev_lik) if prev_lik < new_lik: patience = 0 M = M_growth(it_num + 1, r_1L, numobs) else: patience += 1 ########################################################################### ######################## Parameter selection ############################# ########################################################################### min_nb_clusters = 2 is_not_min_specif = not(is_min_architecture_reached(k, r, min_nb_clusters)) if look_for_simpler_network(it_num) & perform_selec & is_not_min_specif: # To add: selection according to categ r_to_keep = r_select(y_bin, y_ord, y_categ, yc, zl1_ys_d,\ z2_z1s_d[:bar_L['d']], w_s_d, z2_z1s_c[:bar_L['c']], z2_z1s_c[bar_L['c']:], n_clusters) # Check layer deletion is_c_layer_deletion = np.any([len(rl) == 0 for rl in r_to_keep['c']]) is_d_layer_deletion = np.any([len(rl) == 0 for rl in r_to_keep['d']]) is_head_layer_deletion = np.any([is_c_layer_deletion, is_d_layer_deletion]) if is_head_layer_deletion: # Restart the algorithm if is_c_layer_deletion: r['c'] = [len(rl) for rl in r_to_keep['c'][:-1]] k['c'] = k['c'][:-1] if is_d_layer_deletion: r['d'] = [len(rl) for rl in r_to_keep['d'][:-1]] k['d'] = k['d'][:-1] init = dim_reduce_init(pd.DataFrame(y), n_clusters, k, r, nj, var_distrib,\ seed = None) eta_c, eta_d, H_c, H_d, psi_c, psi_d = 
dispatch_dgmm_init(init) lambda_bin, lambda_ord, lambda_categ = dispatch_gllvm_init(init) w_s_c, w_s_d = dispatch_paths_init(init) # *_1L standsds for quantities going through all the network (head + tail) k_1L, L_1L, L, bar_L, S_1L = nb_comps_and_layers(k) r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']} M = M_growth(it_num + 1, r_1L, numobs) prev_lik = deepcopy(new_lik) it_num = it_num + 1 print(likelihood) print('Restarting the algorithm') continue new_Lt = np.sum([len(rl) != 0 for rl in r_to_keep['t']]) #- 1 # If r_l == 0, delete the last l + 1: layers new_Lt = np.sum([len(rl) != 0 for rl in r_to_keep['t']]) #- 1 #w_s_t = pst_yCyD.mean(0) k_to_keep = k_select(w_s_c, w_s_d, w_s_t, k, new_Lt, clustering_layer, n_clusters) is_selection = check_if_selection(r_to_keep, r, k_to_keep, k, L, new_Lt) assert new_Lt > 0 # > 1 ? if n_clusters == 'multi': assert new_Lt == L['t'] if is_selection: # Part to change when update also number of layers on each head nb_deleted_layers_tail = L['t'] - new_Lt L['t'] = new_Lt L_1L = {keys: values - nb_deleted_layers_tail for keys, values in L_1L.items()} eta_c, eta_d, H_c, H_d, psi_c, psi_d = dgmm_coeff_selection(eta_c,\ H_c, psi_c, eta_d, H_d, psi_d, L, r_to_keep, k_to_keep) lambda_bin, lambda_ord, lambda_categ = gllvm_coeff_selection(lambda_bin, lambda_ord,\ lambda_categ, r, r_to_keep) w_s_c, w_s_d = path_proba_selection(w_s_c, w_s_d, k, k_to_keep, new_Lt) k = {h: [len(k_to_keep[h][l]) for l in range(L[h])] for h in ['d', 't']} k['c'] = [len(k_to_keep['c'][l]) for l in range(L['c'] + 1)] r = {h: [len(r_to_keep[h][l]) for l in range(L[h])] for h in ['d', 't']} r['c'] = [len(r_to_keep['c'][l]) for l in range(L['c'] + 1)] k_1L, _, L, bar_L, S_1L = nb_comps_and_layers(k) r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']} patience = 0 best_r = deepcopy(r) best_k = deepcopy(k) #======================================================= # Identifiability conditions #======================================================= eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = network_identifiability(eta_d, \ H_d, psi_d, eta_c, H_c, psi_c, w_s_c, w_s_d, w_s_t, bar_L) print('New architecture:') print('k', k) print('r', r) print('L', L) print('S_1L', S_1L) print("w_s_c", len(w_s_c)) print("w_s_d", len(w_s_d)) M = M_growth(it_num + 1, r_1L, numobs) prev_lik = deepcopy(new_lik) print(likelihood) print('Silhouette score:', new_sil) it_num = it_num + 1 out = dict(likelihood = likelihood, classes = classes, \ best_r = best_r, best_k = best_k) if n_clusters == 'multi': out['z'] = z_tail else: out['z'] = z_tail[clustering_layer] return(out)
def M1DGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \
           eps = 1E-05, maxstep = 100, seed = None, perform_selec = True, \
           dm = [], max_patience = 1, use_silhouette = True):  # dm: small hack, to remove
    ''' Fit a Generalized Linear Mixture of Latent Variables Model (GLMLVM)

    y (numobs x p ndarray): The observations containing mixed variables
    n_clusters (int): The number of clusters to look for in the data
    r (list): The dimension of latent variables through the first 2 layers
    k (list): The number of components of the latent Gaussian mixture layers
    init (dict): The initialisation parameters for the algorithm
    var_distrib (p 1darray): An array containing the types of the variables in y
    nj (p 1darray): For binary/count data: the maximum value the variable can take.
                    For ordinal data: the number of different existing categories for each variable
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increases by less than eps, the algorithm stops
    maxstep (int): The maximum number of optimisation steps for each variable
    seed (int): The random state seed to set (only for numpy generated data for the moment)
    perform_selec (bool): Whether to perform architecture selection or not
    use_silhouette (bool): If True, use the silhouette as the quality criterion (best for
                    clustering); else use the likelihood (best for data augmentation)
    ------------------------------------------------------------------------------------------------
    returns (dict): The predicted classes, the likelihood through the EM steps
                    and a continuous representation of the data
    '''

    prev_lik = - 1E16
    best_lik = -1E16
    best_sil = -1
    new_sil = -1

    tol = 0.01
    patience = 0
    is_looking_for_better_arch = False

    # Initialize the parameters
    eta = deepcopy(init['eta'])
    psi = deepcopy(init['psi'])
    lambda_bin = deepcopy(init['lambda_bin'])
    lambda_ord = deepcopy(init['lambda_ord'])
    lambda_cont = deepcopy(init['lambda_cont'])
    lambda_categ = deepcopy(init['lambda_categ'])

    H = deepcopy(init['H'])
    w_s = deepcopy(init['w_s'])  # Probability of path s' through the network for all s' in Omega

    numobs = len(y)
    likelihood = []
    silhouette = []
    it_num = 0
    ratio = 1000
    np.random.seed(seed)
    out = {}  # Store the full output

    # Dispatch variables between categories
    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli', var_distrib == 'binomial')]
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli', var_distrib == 'binomial')].astype(int)
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal']
    nj_ord = nj[var_distrib == 'ordinal'].astype(int)
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical'].astype(int)
    nb_categ = len(nj_categ)

    y_cont = y[:, var_distrib == 'continuous'].astype(float)
    nb_cont = y_cont.shape[1]

    # Set y_cont standard deviation to 1
    y_cont = y_cont / y_cont.std(axis = 0, keepdims = True)

    L = len(k)
    k_aug = k + [1]
    S = np.array([np.prod(k_aug[l:]) for l in range(L + 1)])
    M = M_growth(1, r, numobs)

    assert nb_bin + nb_ord + nb_cont + nb_categ > 0
    if nb_bin + nb_ord + nb_cont + nb_categ != len(var_distrib):
        raise ValueError('Some variable types were not understood, '
                         'existing types are: continuous, categorical, '
                         'ordinal, binomial and bernoulli')

    # Compute the Gower matrix if it was not provided
    if len(dm) == 0:
        cat_features = np.logical_or(var_distrib == 'categorical', var_distrib == 'bernoulli')
        dm = gower_matrix(y, cat_features = cat_features)

    # Keep iterating while iterations remain, the likelihood is still increasing and the maximum
    # patience has not been reached, or if a new architecture was looked for in the previous iteration
    while ((it_num < it) & (ratio > eps) & (patience <= max_patience)) | is_looking_for_better_arch:
        print(it_num)

        # The clustering layer is the one used to perform the clustering,
        # i.e. the layer l such that k[l] == n_clusters
        if not(isnumeric(n_clusters)):
            if n_clusters == 'auto':
                clustering_layer = 0
            else:
                raise ValueError('Please enter an int or "auto" for n_clusters')
        else:
            assert (np.array(k) == n_clusters).any()
            clustering_layer = np.argmax(np.array(k) == n_clusters)

        #####################################################################################
        ################################# S step ############################################
        #####################################################################################

        #=====================================================================
        # Draw from f(z^{l} | s, Theta) for all s in Omega
        #=====================================================================
        mu_s, sigma_s = compute_path_params(eta, H, psi)
        sigma_s = ensure_psd(sigma_s)
        z_s, zc_s = draw_z_s(mu_s, sigma_s, eta, M)

        #========================================================================
        # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1
        #========================================================================
        chsi = compute_chsi(H, psi, mu_s, sigma_s)
        chsi = ensure_psd(chsi)
        rho = compute_rho(eta, H, psi, mu_s, sigma_s, zc_s, chsi)

        # In the following, z2 and z1 denote z^{l+1} and z^{l} respectively
        z2_z1s = draw_z2_z1s(chsi, rho, M, r)

        #=======================================================================
        # Compute the p(y | z1) for all variable categories
        #=======================================================================
        py_zl1 = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord, \
                        lambda_categ, y_categ, nj_categ, y_cont, lambda_cont, z_s[0])

        #========================================================================
        # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s
        #========================================================================
        zl1_ys = draw_zl1_ys(z_s, py_zl1, M)

        #####################################################################################
        ################################# E step ############################################
        #####################################################################################

        #=====================================================================
        # Compute conditional probabilities used in the appendix of the asta paper
        #=====================================================================
        pzl1_ys, ps_y, p_y = E_step_GLLVM(z_s[0], mu_s[0], sigma_s[0], w_s, py_zl1)

        #=====================================================================
        # Compute p(z^{(l)} | s, y). Equation (5) of the paper
        #=====================================================================
        pz2_z1s = fz2_z1s(t(pzl1_ys, (1, 0, 2)), z2_z1s, chsi, rho, S)
        pz_ys = fz_ys(t(pzl1_ys, (1, 0, 2)), pz2_z1s)

        #=====================================================================
        # Compute MFA expectations
        #=====================================================================
        Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys = \
            E_step_DGMM(zl1_ys, H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, S)

        ###########################################################################
        ############################ M step #######################################
        ###########################################################################

        #=======================================================
        # Compute MFA Parameters
        #=======================================================
        w_s = np.mean(ps_y, axis = 0)
        eta, H, psi = M_step_DGMM(Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys, ps_y, H, k)

        #=======================================================
        # Identifiability conditions
        #=======================================================
        # Update the eta, H and psi values
        H = diagonal_cond(H, psi)
        Ez, AT = compute_z_moments(w_s, eta, H, psi)
        eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)
        del(Ez)

        #=======================================================
        # Compute GLLVM Parameters
        #=======================================================
        lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y, pzl1_ys, z_s[0], AT[0], \
                                      tol = tol, maxstep = maxstep)
        lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y, pzl1_ys, z_s[0], AT[0], \
                                      tol = tol, maxstep = maxstep)
        lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y, pzl1_ys, z_s[0], AT[0], \
                                          tol = tol, maxstep = maxstep)
        lambda_cont = cont_params_GLLVM(y_cont, lambda_cont, ps_y, pzl1_ys, z_s[0], AT[0], \
                                        tol = tol, maxstep = maxstep)

        ###########################################################################
        ################## Clustering parameters updating #########################
        ###########################################################################

        new_lik = np.sum(np.log(p_y))
        likelihood.append(new_lik)
        silhouette.append(new_sil)
        ratio = abs((new_lik - prev_lik) / prev_lik)

        idx_to_sum = tuple(set(range(1, L + 1)) - set([clustering_layer + 1]))
        psl_y = ps_y.reshape(numobs, *k, order = 'C').sum(idx_to_sum)

        temp_class = np.argmax(psl_y, axis = 1)
        try:
            new_sil = silhouette_score(dm, temp_class, metric = 'precomputed')
        except ValueError:
            new_sil = -1

        # Store the params according to the silhouette or the likelihood
        is_better = (best_sil < new_sil) if use_silhouette else (best_lik < new_lik)

        if is_better:
            z = (ps_y[..., n_axis] * Ez_ys[clustering_layer]).sum(1)
            best_sil = deepcopy(new_sil)
            classes = deepcopy(temp_class)

            '''
            plt.figure(figsize=(8,8))
            plt.scatter(z[:, 0], z[:, 1], c = classes)
            plt.show()
            '''

            # Store the output
            out['classes'] = deepcopy(classes)
            out['best_z'] = deepcopy(z_s[0])
            out['Ez.y'] = z
            out['best_k'] = deepcopy(k)
            out['best_r'] = deepcopy(r)
            out['best_w_s'] = deepcopy(w_s)
            out['lambda_bin'] = deepcopy(lambda_bin)
            out['lambda_ord'] = deepcopy(lambda_ord)
            out['lambda_categ'] = deepcopy(lambda_categ)
            out['lambda_cont'] = deepcopy(lambda_cont)
            out['eta'] = deepcopy(eta)
            out['mu'] = deepcopy(mu_s)
            out['sigma'] = deepcopy(sigma_s)
            out['psl_y'] = deepcopy(psl_y)
            out['ps_y'] = deepcopy(ps_y)

        # Keep track of the best likelihood reached so far
        if best_lik < new_lik:
            best_lik = deepcopy(new_lik)

        if prev_lik < new_lik:
            patience = 0
            M = M_growth(it_num + 2, r, numobs)
        else:
            patience += 1

        ###########################################################################
        ######################## Parameter selection ##############################
        ###########################################################################

        min_nb_clusters = 2

        if isnumeric(n_clusters):  # To change when the multi mode is added
            is_not_min_specif = not(np.all(np.array(k) == n_clusters) & np.array_equal(r, [2, 1]))
        else:
            is_not_min_specif = not(np.all(np.array(k) == min_nb_clusters) & np.array_equal(r, [2, 1]))

        is_looking_for_better_arch = look_for_simpler_network(it_num) & perform_selec & is_not_min_specif
        if is_looking_for_better_arch:
            r_to_keep = r_select(y_bin, y_ord, y_categ, y_cont, zl1_ys, z2_z1s, w_s)

            # If r_l == 0, delete the last l + 1 layers
            new_L = np.sum([len(rl) != 0 for rl in r_to_keep]) - 1

            k_to_keep = k_select(w_s, k, new_L, clustering_layer, not(isnumeric(n_clusters)))

            is_L_unchanged = (L == new_L)
            is_r_unchanged = np.all([len(r_to_keep[l]) == r[l] for l in range(new_L + 1)])
            is_k_unchanged = np.all([len(k_to_keep[l]) == k[l] for l in range(new_L)])

            is_selection = not(is_r_unchanged & is_k_unchanged & is_L_unchanged)

            assert new_L > 0

            if is_selection:
                eta = [eta[l][k_to_keep[l]] for l in range(new_L)]
                eta = [eta[l][:, r_to_keep[l]] for l in range(new_L)]

                H = [H[l][k_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, r_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, :, r_to_keep[l + 1]] for l in range(new_L)]

                psi = [psi[l][k_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, r_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, :, r_to_keep[l]] for l in range(new_L)]

                if nb_bin > 0:
                    # Add the intercept:
                    bin_r_to_keep = np.concatenate([[0], np.array(r_to_keep[0]) + 1])
                    lambda_bin = lambda_bin[:, bin_r_to_keep]

                if nb_ord > 0:
                    # Intercept coefficients handling is a little more complicated here
                    lambda_ord_intercept = [lambda_ord_j[:-r[0]] for lambda_ord_j in lambda_ord]
                    Lambda_ord_var = np.stack([lambda_ord_j[-r[0]:] for lambda_ord_j in lambda_ord])
                    Lambda_ord_var = Lambda_ord_var[:, r_to_keep[0]]
                    lambda_ord = [np.concatenate([lambda_ord_intercept[j], Lambda_ord_var[j]]) \
                                  for j in range(nb_ord)]

                # To recheck
                if nb_cont > 0:
                    # Add the intercept:
                    cont_r_to_keep = np.concatenate([[0], np.array(r_to_keep[0]) + 1])
                    lambda_cont = lambda_cont[:, cont_r_to_keep]

                if nb_categ > 0:
                    lambda_categ_intercept = [lambda_categ[j][:, 0] for j in range(nb_categ)]
                    Lambda_categ_var = [lambda_categ_j[:, -r[0]:] for lambda_categ_j in lambda_categ]
                    Lambda_categ_var = [Lambda_categ_var_j[:, r_to_keep[0]] \
                                        for Lambda_categ_var_j in Lambda_categ_var]
                    lambda_categ = [np.hstack([lambda_categ_intercept[j][..., n_axis], Lambda_categ_var[j]]) \
                                    for j in range(nb_categ)]

                w = w_s.reshape(*k, order = 'C')
                new_k_idx_grid = np.ix_(*k_to_keep[:new_L])

                # If a layer is deleted, sum over the last components of the paths
                if L > new_L:
                    deleted_dims = tuple(range(L)[new_L:])
                    w_s = w[new_k_idx_grid].sum(deleted_dims).flatten(order = 'C')
                else:
                    w_s = w[new_k_idx_grid].flatten(order = 'C')

                w_s /= w_s.sum()

                # Refresh the classes: TO RECHECK
                #idx_to_sum = tuple(set(range(1, L + 1)) - set([clustering_layer + 1]))
                #ps_y_tmp = ps_y.reshape(numobs, *k, order = 'C').sum(idx_to_sum)
                #np.argmax(ps_y_tmp[:, k_to_keep[0]], axis = 1)

                k = [len(k_to_keep[l]) for l in range(new_L)]
                r = [len(r_to_keep[l]) for l in range(new_L + 1)]

                k_aug = k + [1]
                S = np.array([np.prod(k_aug[l:]) for l in range(new_L + 1)])
                L = new_L

                patience = 0

                # Identifiability conditions
                H = diagonal_cond(H, psi)
                Ez, AT = compute_z_moments(w_s, eta, H, psi)
                eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)
                del(Ez)

                print('New architecture:')
                print('k', k)
                print('r', r)
                print('L', L)
                print('S', S)
                print("w_s", len(w_s))

        prev_lik = deepcopy(new_lik)
        it_num = it_num + 1
        print(likelihood)
        print(silhouette)

    out['likelihood'] = likelihood
    out['silhouette'] = silhouette

    return(out)
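# A minimal usage sketch for M1DGMM (not taken from the original source). It assumes y is a
# (numobs x p) numpy array of mixed variables, that var_distrib and nj have been built as in the
# preparation scripts further below, and that the init dict comes from an initialisation routine
# such as dim_reduce_init; the exact initialisation call and the chosen hyperparameters are
# assumptions, not part of the original code.
n_clusters = 3
r = [2, 1]                    # latent dimensions of the two DGMM layers
k = [n_clusters]              # one mixture layer with n_clusters components
init = dim_reduce_init(pd.DataFrame(y), n_clusters, k, r, nj, var_distrib, seed = None)
out = M1DGMM(y, n_clusters, r, k, init, var_distrib, nj,
             it = 30, eps = 1E-05, maxstep = 100, seed = 1, perform_selec = True)
print(out['classes'][:10])    # hard cluster assignments
print(out['likelihood'])      # likelihood path over the MCEM iterations
print(out['Ez.y'].shape)      # continuous representation of the observations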
def dummies_gower(df):
    # Treat every column as categorical (cast to string) and return the Gower distance
    # matrix as a DataFrame carrying the same index as the input
    return pd.DataFrame(
        gower_matrix(df.applymap(str), cat_features = [True for v in df.columns])
    ).set_index(df.index)
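# Illustrative call for dummies_gower (not from the original source), assuming pandas is imported
# as pd and gower_matrix is available as above; videos_df is a purely hypothetical toy DataFrame.
# Every column is cast to string and treated as categorical, so the result is a symmetric
# DataFrame of Gower distances indexed like the input.
videos_df = pd.DataFrame({'channel': ['a', 'b', 'a'], 'category': ['music', 'music', 'news']})
dists = dummies_gower(videos_df)
print(dists.shape)        # (3, 3), values in [0, 1], zeros on the diagonal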
    metrics.adjusted_mutual_info_score,
]

for artist_gt in range(ARTIST_GT):
    data = pd.read_csv(
        "../data preprocessing/final_df_with_encodings_with_price_binned.csv",
        header=0, usecols=[*features, *encodings])
    filter_artist_gt(artist_gt)
    n_rows = len(data)

    mapping = {"(0, 250]": 0, "(250, 1250]": 1, "(1250, 4200]": 2}
    labels_true = [mapping[x] for x in data['price_binned']]

    results = []
    for name, features_ in feature_groups.items():
        result = [name]
        features_used = gower.gower_matrix(data[features_])
        cluster = SpectralClustering(NUMBER_OF_PRICE_BINS, affinity='precomputed',
                                     n_init=200, n_jobs=-1).fit(features_used)
        labels_pred = cluster.labels_
        result += [m(labels_true, labels_pred) for m in clustering_metrics]
        results.append(result)

    print(
        pd.DataFrame(
            results,
            columns=["Feature", "V-measure", "Adj. Rand Index"]
        ).sort_values(by='Adj. Rand Index', ascending=False).to_latex(
            f"../data preprocessing/artist_gt_{artist_gt}_ari.tex",
            index=False,
            caption=f"Number of rows {n_rows}"))
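# Hedged side note on the script above (not part of the original): scikit-learn's SpectralClustering
# with affinity='precomputed' expects a similarity (affinity) matrix, whereas gower.gower_matrix
# returns distances in [0, 1]. A minimal sketch of one possible conversion, assuming an RBF-style
# kernel with a hand-picked gamma; the function name and the gamma value are illustrative only.
import numpy as np
import gower
from sklearn.cluster import SpectralClustering

def spectral_from_gower(df, n_clusters, gamma=1.0, random_state=0):
    """Cluster mixed data by turning Gower distances into similarities first."""
    dist = gower.gower_matrix(df)          # pairwise Gower distances, shape (n, n)
    affinity = np.exp(-gamma * dist)       # higher similarity for smaller distance
    model = SpectralClustering(n_clusters=n_clusters, affinity='precomputed',
                               random_state=random_state)
    return model.fit_predict(affinity)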
#y = y.where(~nan_mask, np.nan)

nj, nj_bin, nj_ord, nj_categ = compute_nj(full_contra, var_distrib)
nb_cont = np.sum(var_distrib == 'continuous')
p_new = full_contra.shape[1]

# Cast each feature to the dtype matching its declared distribution
dtype = {full_contra.columns[j]: dtypes_dict[var_distrib[j]] for j in range(p_new)}
full_contra = full_contra.astype(dtype, copy=True)

# Feature category (cf)
cat_features = var_distrib == 'categorical'

# Defining the distance matrix
dm3 = gower_matrix(full_contra, cat_features = cat_features)

#===========================================#
# Hyperparameters
#===========================================#
n_clusters = 2
nb_pobs = 100  # Target for pseudo observations
r = np.array([2, 1])
numobs = len(full_contra)
k = [n_clusters]

seed = 1
init_seed = 2

eps = 1E-05
nj, nj_bin, nj_ord, nj_categ = compute_nj(y, var_distrib)
y_np = y.values
nb_cont = np.sum(var_distrib == 'continuous')
p_new = y.shape[1]

# Feature category (cf)
cf_non_enc = np.logical_or(vd_categ_non_enc == 'categorical',
                           vd_categ_non_enc == 'bernoulli')

# Non encoded version of the dataset:
y_nenc_typed = y_categ_non_enc.astype(object)
y_np_nenc = y_nenc_typed.values

# Defining distances over the non encoded features
dm = gower_matrix(y_nenc_typed, cat_features = cf_non_enc)

dtype = {y.columns[j]: np.float64 if (var_distrib[j] != 'bernoulli') and \
         (var_distrib[j] != 'categorical') else str for j in range(p_new)}
y = y.astype(dtype, copy=True)

#===========================================#
# Running the algorithm
#===========================================#
r = np.array([2, 1])
numobs = len(y)
k = [n_clusters]

seed = 1
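# Hedged continuation sketch (not part of the original script): M1DGMM accepts a precomputed Gower
# matrix through its dm argument, so the dm built above over the non-encoded features can be reused
# directly instead of being recomputed on the encoded data. The initialisation via dim_reduce_init
# is an assumption modelled on the other scripts in this document.
init = dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, seed = None)
out = M1DGMM(y_np, n_clusters, r, k, init, var_distrib, nj, seed = seed, dm = dm)
print(out['classes'])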
s1 = '1'
s2 = '1000'
n1 = 1
n2 = 1000

types = {
    'numbers': [n1, n2, 2, 2000],
    'strings': [s1, s2, '2', '2000'],
    'mixed': [n1, n2, s1, s2]
}

orders = {
    'same': lambda a: [a, a],
    '25%': lambda a: [a, [a[0], a[1], a[2], a[2]]],
    '25% 2': lambda a: [a, [a[0], a[0], a[2], a[3]]],
    '50%': lambda a: [a, [a[0], a[1], a[0], a[1]]],
    '50% 2': lambda a: [a, [a[0], a[2], a[1], a[3]]],
    '50% 3': lambda a: [a, [a[3], a[1], a[2], a[0]]],
    '75%': lambda a: [a, [a[0], a[0], a[0], a[0]]],
    '75% 2': lambda a: [a, [a[1], a[0], a[3], a[3]]],
    '100%': lambda a: [a, [a[3], a[2], a[1], a[0]]]
}

for k, a in types.items():
    print(k)
    for o, func in orders.items():
        print(o)
        b = func(a)
        X = pd.DataFrame(b)
        X['c'] = 'c'
        print(X)
        print(gower.gower_matrix(X))
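# Hedged sanity-check sketch (not part of the original test): whatever the column ordering tried
# above, a Gower matrix should be symmetric, have a zero diagonal and stay within [0, 1].
import numpy as np
import pandas as pd
import gower

X = pd.DataFrame({'num': [1, 1000, 2, 2000],
                  'cat': ['1', '1000', '2', '2000'],
                  'c': 'c'})
D = gower.gower_matrix(X)
assert np.allclose(D, D.T, atol=1e-6)            # symmetric
assert np.allclose(np.diag(D), 0.0, atol=1e-6)   # zero self-distance
assert (D >= 0).all() and (D <= 1 + 1e-6).all()  # bounded Gower distances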
#print(set(discrete_k))
#test[col] = pd.cut(test[col], bins).map(lambda x: x.mid).astype(float)
le.fit(pd.concat([test[col], train[col]]))
train[col] = le.transform(train[col])
k_dict[col] = deepcopy(le)

nj, nj_bin, nj_ord, nj_categ = compute_nj(train, var_distrib)
nb_cont = np.sum(var_distrib == 'continuous')
p_new = train.shape[1]
train_np = train.values

# Defining distances over the features
cat_features = pd.Series(var_distrib).isin(['categorical', 'bernoulli']).to_list()
dm = gower_matrix(train.astype(object), cat_features = cat_features)

dtype = {train.columns[j]: np.float64 if (var_distrib[j] != 'bernoulli') and \
         (var_distrib[j] != 'categorical') else str for j in range(p_new)}
train = train.astype(dtype, copy=True)

numobs = len(train)

#*****************************************************************
# Run MIAMI
#*****************************************************************

prince_init = dim_reduce_init(train, 2, k, r, nj, var_distrib, seed = None, \
                              use_famd = True)
out = MIAMI(train_np, 'auto', r, k, prince_init, var_distrib, nj, authorized_ranges, nb_pobs, it, \
            eps, maxstep, seed, perform_selec = False, dm = dm, max_patience = 0)