# Mu @ lambda binomial
#======================================
stat_bin(out['lambda_bin'], out['mu'][0][0].T, nj_bin)[0]
stat_bin(out['lambda_bin'], out['mu'][0][1].T, nj_bin)[0]

#======================================
# Variables contribution
#======================================
# !!! TO DO: Compare with the true association matrix

# Vars contributions
vc = vars_contributions(complete_y, out['Ez.y'], assoc_thr = 0.0, \
                        title = 'Contribution of the variables to the latent dimensions',\
                        storage_path = None)
s = cosine_similarity(vc, dense_output=True)

vc2 = vars_contributions(completed_y2, out2['Ez.y'], assoc_thr = 0.0, \
                         title = 'Contribution of the variables to the latent dimensions',\
                         storage_path = None)
s2 = cosine_similarity(vc2, dense_output=True)

vc_full = vars_contributions(full_contra, out_full['Ez.y'], assoc_thr = 0.0, \
                             title = 'Contribution of the variables to the latent dimensions',\
                             storage_path = None)
s_full = cosine_similarity(vc_full, dense_output=True)
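
#======================================
# Comparison with the empirical association matrix (sketch)
#======================================
# A minimal sketch for the TO DO above, not part of the original script. It assumes that
# `complete_y` is a DataFrame, that `cat_features` lists its categorical columns (as in the
# Pima comparison further below) and that its columns are ordered as in `vc`. It contrasts
# the cosine-similarity matrix `s` with the associations computed directly from the data.
import numpy as np
from dython.nominal import compute_associations

empirical_assoc = compute_associations(complete_y, nominal_columns = cat_features)
gap = np.abs(empirical_assoc.abs().values - np.abs(s))
print('Mean absolute gap between learned and empirical associations:', gap.mean())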
def MIAMI(y, n_clusters, r, k, init, var_distrib, nj, authorized_ranges,\
          target_nb_pseudo_obs = 500, it = 50, \
          eps = 1E-05, maxstep = 100, seed = None, perform_selec = True,\
          dm = [], max_patience = 1): # dm: Hack to remove

    ''' Generate pseudo-observations lying in the authorized ranges from a trained M1DGMM
    y (numobs x p ndarray): The observations containing mixed variables
    n_clusters (int): The number of clusters to look for in the data
    r (list): The dimension of latent variables through the first 2 layers
    k (list): The number of components of the latent Gaussian mixture layers
    init (dict): The initialisation parameters for the algorithm
    var_distrib (p 1darray): An array containing the types of the variables in y
    nj (p 1darray): For binary/count data: the maximum value that the variable can take.
                    For ordinal data: the number of different existing categories for each variable
    authorized_ranges (2 x n_possibilities x p ndarray): The lower (index 0) and upper (index 1)
                    bounds defining the acceptance regions of the pseudo-observations
    target_nb_pseudo_obs (int): The number of pseudo-observations to generate
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increases by less than eps then the algorithm stops
    maxstep (int): The maximum number of optimisation steps for each variable
    seed (int): The random state seed to set (only for numpy-generated data for the moment)
    perform_selec (bool): Whether to perform architecture selection or not
    dm (np array): The distance matrix of the observations. If not given, M1DGMM computes it
    max_patience (int): The patience parameter passed to M1DGMM
    ------------------------------------------------------------------------------------------------
    returns (dict): The trained M1DGMM quantities, the generated pseudo-observations ('y_all')
                    and the share of generated points kept by the acceptance rule
    '''

    # !!! Hack
    cols = y.columns

    # Formatting
    if not isinstance(y, np.ndarray): y = np.asarray(y)

    assert len(k) < 2 # Not implemented for deeper MDGMM for the moment

    out = M1DGMM(y, n_clusters, r, k, init, var_distrib, nj, it,\
                 eps, maxstep, seed, perform_selec = perform_selec,\
                 dm = dm, max_patience = max_patience, use_silhouette = True)

    # Compute the associations
    vars_contributions(pd.DataFrame(y, columns = cols), out['Ez.y'], assoc_thr = 0.0, \
                       title = 'Contribution of the variables to the latent dimensions',\
                       storage_path = None)

    # Unpacking the model from the M1DGMM output
    p = y.shape[1]
    k = out['best_k']
    r = out['best_r']
    mu = out['mu'][0]
    sigma = out['sigma'][0]
    w = out['best_w_s']

    #eta = out['eta'][0]
    #Ez_y = out['Ez.y']

    lambda_bin = np.array(out['lambda_bin'])
    lambda_ord = out['lambda_ord']
    lambda_categ = out['lambda_categ']
    lambda_cont = np.array(out['lambda_cont'])

    nj_bin = nj[pd.Series(var_distrib).isin(['bernoulli', 'binomial'])].astype(int)
    nj_ord = nj[var_distrib == 'ordinal'].astype(int)
    nj_categ = nj[var_distrib == 'categorical'].astype(int)

    y_std = y[:, var_distrib == 'continuous'].astype(float).std(axis = 0, keepdims = True)

    nb_points = 200

    # Constraints block
    '''
    is_constrained = np.isfinite(authorized_ranges).any(1)[0]
    is_min_constrained = np.isfinite(authorized_ranges[0])[0]
    is_max_constrained = np.isfinite(authorized_ranges[1])[0]

    is_continuous = (var_distrib == 'continuous') | (var_distrib == 'binomial')

    min_unconstrained_cont = is_continuous & ~is_min_constrained
    max_unconstrained_cont = is_continuous & ~is_max_constrained

    authorized_ranges[0] = np.where(min_unconstrained_cont, np.min(y, 0), authorized_ranges[0])
    authorized_ranges[1] = np.where(max_unconstrained_cont, np.max(y, 0), authorized_ranges[1])
    '''
    #from scipy.stats import norm
    '''
    #==============================================
    # Constraints determination
    #==============================================

    # Force to stay in the support for binomial and continuous variables
    #authorized_ranges = np.expand_dims(np.stack([[-np.inf,np.inf] for var in var_distrib]).T, 1)
    #authorized_ranges[:, 0, 8] = [0, 0] # Of more than 60 years old
    #authorized_ranges[:, 0, 0] = [-np.inf, np.inf] # Of more than 60 years old

    # Look for the constrained variables
    #authorized_ranges[:,:,0] = np.array([[-np.inf],[np.inf]])
    is_constrained = np.isfinite(authorized_ranges).any(1)[0]
    #bbox = np.dstack([Ez_y.min(0), Ez_y.max(0)])
    #bbox * np.array([0.6, 1.4])

    proba_min = 1E-3
    proba = proba_min
    epsilon = 1E-12

    best_A = []
    best_b = []

    is_solution = True
    while is_solution:
        b = [] #np.array([])
        A = [] #np.array([[]]).reshape((0, r[0]))
        bbox = np.array([[-10, 10]] * r[0]) # !!! To fix

        alpha = 1 - proba
        q = norm.ppf(1 - alpha / 2)

        #=========================================
        # Store the constraints for each datatype
        #=========================================
        for j in range(p):
            if is_constrained[j]:
                bounds_j = authorized_ranges[:,:,j]

                # The index of the variable among the variables of the same type
                idx_among_type = (var_distrib[:j] == var_distrib[j]).sum()

                if var_distrib[j] == 'continuous':
                    # Lower bound
                    lb_j = bounds_j[0] / y_std[0, idx_among_type] - lambda_cont[idx_among_type, 0] + q
                    A.append(- lambda_cont[idx_among_type, 1:])
                    b.append(- lb_j)

                    # Upper bound
                    ub_j = bounds_j[1] / y_std[0, idx_among_type] - lambda_cont[idx_among_type, 0] - q
                    A.append(lambda_cont[idx_among_type, 1:])
                    b.append(ub_j)

                elif var_distrib[j] == 'binomial':
                    idx_among_type = ((var_distrib[:j] == 'bernoulli') | (var_distrib[:j] == 'binomial')).sum()

                    # Lower bound
                    lb_j = bounds_j[0]
                    lb_j = logit(lb_j / nj_bin[idx_among_type]) - lambda_bin[idx_among_type, 0]
                    A.append(- lambda_bin[idx_among_type, 1:])
                    b.append(- lb_j)

                    # Upper bound
                    ub_j = bounds_j[1]
                    ub_j = logit(ub_j / nj_bin[idx_among_type]) - lambda_bin[idx_among_type, 0]
                    A.append(lambda_bin[idx_among_type, 1:])
                    b.append(ub_j)

                elif var_distrib[j] == 'bernoulli':
                    idx_among_type = ((var_distrib[:j] == 'bernoulli') | (var_distrib[:j] == 'binomial')).sum()

                    assert bounds_j[0] == bounds_j[1] # !!! To improve

                    # Lower bound
                    lb_j = proba if bounds_j[0] == 1 else 0 + epsilon
                    lb_j = logit(lb_j / nj_bin[idx_among_type]) - lambda_bin[idx_among_type, 0]
                    A.append(- lambda_bin[idx_among_type, 1:])
                    b.append(- lb_j)

                    # Upper bound
                    ub_j = 1 - epsilon if bounds_j[0] == 1 else 1 - proba
                    ub_j = logit(ub_j / nj_bin[idx_among_type]) - lambda_bin[idx_among_type, 0]
                    A.append(lambda_bin[idx_among_type, 1:])
                    b.append(ub_j)

                elif var_distrib[j] == 'categorical':
                    continue
                    assert bounds_j[0] == bounds_j[1] # !!! To improve
                    modality_idx = int(bounds_j[0][0])

                    # Define the probability to draw the modality of interest to proba
                    pi = np.full(nj_categ[idx_among_type],\
                                 (1 - proba) / (nj_categ[idx_among_type] - 1))
                    # For the inversion of the softmax a constant C = 0 is taken:
                    pi[modality_idx] = proba

                    lb_j = np.log(pi) - lambda_categ[idx_among_type][:, 0]

                    # -1 Mask
                    mask = np.ones((nj_categ[idx_among_type], 1))
                    mask[modality_idx] = -1

                    A.append(lambda_categ[idx_among_type][:, 1:] * mask)
                    b.append(lb_j * mask[:, 0])

                elif var_distrib[j] == 'ordinal':
                    assert bounds_j[0] == bounds_j[1] # !!! To improve
                    modality_idx = int(bounds_j[0][0])
                    RuntimeError('Not implemented for the moment')

        #=========================================
        # Try if the solution is feasible
        #=========================================
        try:
            points, interior_point, hs = solve_convex_set(np.reshape(A, (-1, r[0]), order = 'C'),\
                                                          np.hstack(b), bbox)
            # If yes store the new constraints
            best_A = deepcopy(A)
            best_b = deepcopy(b)
            proba = np.min([1.05 * proba, 0.8])
            if proba >= 0.8:
                is_solution = False
        except QhullError:
            is_solution = False

    best_A = np.reshape(best_A, (-1, r[0]), order = 'C')
    best_b = np.hstack(best_b)
    points, interior_point, hs = solve_convex_set(best_A, best_b, bbox)
    polygon = Polygon(points)
    '''

    #=======================================================
    # Data augmentation part
    #=======================================================

    # Create pseudo-observations iteratively:
    nb_pseudo_obs = 0

    y_new_all = []
    zz = []

    total_nb_obs_generated = 0
    while nb_pseudo_obs <= target_nb_pseudo_obs:

        #===================================================
        # Generate a batch of latent variables (try)
        #===================================================
        '''
        # Simulate points in the polygon
        pts = generate_random(nb_points, polygon)
        pts = np.array([np.array([p.x, p.y]) for p in pts])

        # Compute their density and resample them
        pts_density = fz(pts, mu, sigma, w)
        pts_density = pts_density / pts_density.sum(keepdims = True) # Normalize the pdfs
        idx = np.random.choice(np.arange(nb_points), size = target_nb_pseudo_obs,\
                               p = pts_density, replace=True)
        z = pts[idx]
        '''

        #===================================================
        # Generate a batch of latent variables
        #===================================================
        # Draw some z^{(1)} | Theta using z^{(1)} | s, Theta
        z = np.zeros((nb_points, r[0]))

        z0_s = multivariate_normal(size = (nb_points, 1), \
                                   mean = mu.flatten(order = 'C'), cov = block_diag(*sigma))
        z0_s = z0_s.reshape(nb_points, k[0], r[0], order = 'C')

        comp_chosen = np.random.choice(k[0], nb_points, p = w / w.sum())
        for m in range(nb_points): # Dirty loop for the moment
            z[m] = z0_s[m, comp_chosen[m]]

        #===================================================
        # Draw pseudo-observations
        #===================================================
        y_bin_new = []
        y_categ_new = []
        y_ord_new = []
        y_cont_new = []

        y_bin_new.append(draw_new_bin(lambda_bin, z, nj_bin))
        y_categ_new.append(draw_new_categ(lambda_categ, z, nj_categ))
        y_ord_new.append(draw_new_ord(lambda_ord, z, nj_ord))
        y_cont_new.append(draw_new_cont(lambda_cont, z))

        # Stack the quantities
        y_bin_new = np.vstack(y_bin_new)
        y_categ_new = np.vstack(y_categ_new)
        y_ord_new = np.vstack(y_ord_new)
        y_cont_new = np.vstack(y_cont_new)

        # "Destandardize" the continuous data
        y_cont_new = y_cont_new * y_std

        # Put them in the right order and append them to y
        type_counter = {'count': 0, 'ordinal': 0,\
                        'categorical': 0, 'continuous': 0}

        y_new = np.full((nb_points, y.shape[1]), np.nan)

        # Quite dirty:
        for j, var in enumerate(var_distrib):
            if (var == 'bernoulli') or (var == 'binomial'):
                y_new[:, j] = y_bin_new[:, type_counter['count']]
                type_counter['count'] = type_counter['count'] + 1
            elif var == 'ordinal':
                y_new[:, j] = y_ord_new[:, type_counter[var]]
                type_counter[var] = type_counter[var] + 1
            elif var == 'categorical':
                y_new[:, j] = y_categ_new[:, type_counter[var]]
                type_counter[var] = type_counter[var] + 1
            elif var == 'continuous':
                y_new[:, j] = y_cont_new[:, type_counter[var]]
                type_counter[var] = type_counter[var] + 1
            else:
                raise ValueError(var, 'Type not implemented')

        #===================================================
        # Acceptance rule
        #===================================================

        # Check that each variable is in the good range
        y_new_exp = np.expand_dims(y_new, 1)
        total_nb_obs_generated += len(y_new)

        mask = np.logical_and(y_new_exp >= authorized_ranges[0][np.newaxis],\
                              y_new_exp <= authorized_ranges[1][np.newaxis])

        # Keep an observation if it lies in at least one of the authorized range possibilities
        mask = np.any(mask.mean(2) == 1, axis = 1)
        y_new = y_new[mask]

        y_new_all.append(y_new)
        nb_pseudo_obs = len(np.concatenate(y_new_all))
        zz.append(z[mask])

        #print(nb_pseudo_obs)

    # Keep target_nb_pseudo_obs pseudo-observations
    y_new_all = np.concatenate(y_new_all)
    y_new_all = y_new_all[:target_nb_pseudo_obs]

    #y_all = np.vstack([y, y_new_all])
    share_kept_pseudo_obs = len(y_new_all) / total_nb_obs_generated

    out['zz'] = zz
    out['y_all'] = y_new_all
    out['share_kept_pseudo_obs'] = share_kept_pseudo_obs

    return (out)
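
#======================================
# Example usage of MIAMI (sketch)
#======================================
# A minimal usage sketch, not part of the original file. It assumes that `y` (a mixed-type
# DataFrame), `n_clusters`, `r`, `k`, `prince_init`, `var_distrib`, `nj`, `it`, `eps`,
# `maxstep` and `seed` are defined as in the test scripts below; none of these objects are
# created here. The unconstrained ranges line reuses the construction already shown in the
# commented-out block above.
import numpy as np

# Shape (2, n_possibilities, p): index 0 = lower bounds, index 1 = upper bounds
authorized_ranges = np.expand_dims(np.stack([[-np.inf, np.inf] for _ in var_distrib]).T, 1)

out = MIAMI(y, n_clusters, r, k, prince_init, var_distrib, nj, authorized_ranges,
            target_nb_pseudo_obs = 500, it = it, eps = eps, maxstep = maxstep,
            seed = seed, perform_selec = False)

pseudo_obs = out['y_all']                        # (target_nb_pseudo_obs x p) generated data
acceptance_rate = out['share_kept_pseudo_obs']   # share of generated points kept by the rule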
'''
out = M1DGMM(y_np, 'auto', r, k, prince_init, var_distrib, nj, it,\
             eps, maxstep, seed, perform_selec = False)

m, pred = misc(labels_oh, out['classes'], True)
print(m)
print(confusion_matrix(labels_oh, pred))
print(silhouette_score(dm, pred, metric = 'precomputed'))

# Plot of the latent representation of the observations and contributions of the variables
y.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',\
             'exang', 'oldpeak', 'slope', 'ca', 'thal']

obs_representation(out['classes'], out['Ez.ys'], title = 'Latent representation of the observations')
vars_contributions(y, out['Ez.ys'], assoc_thr = 0.0)
density_representation(out, is_3D = False)

# Plot the final groups
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

colors = ['green', 'red']

fig = plt.figure(figsize=(8, 8))
plt.scatter(out['Ez.ys'][:, 0], out['Ez.ys'][:, 1], c = pred,\
            cmap = matplotlib.colors.ListedColormap(colors))
cb = plt.colorbar()
## Look for the z and y mapping
complete_y = complete_y.reset_index(drop = True)
zz = out['Ez.y']

## Individual variables
var = 'WifeRelig'
fig, ax = plt.subplots()
for g in np.unique(complete_y[var]):
    ix = np.where(complete_y[var] == g)
    ax.scatter(zz[ix, 0], zz[ix, 1], label = g, s = 7)
ax.legend()
ax.set_title(var + ' zz')
plt.show()

comb = (complete_y['WifeEduc'] == 3.0) & (complete_y['WifeRelig'] == 1.0) & (complete_y['HusbEduc'] == 3.0)

fig, ax = plt.subplots()
for g in np.unique(comb):
    ix = np.where(comb == g)
    ax.scatter(zz[ix, 0], zz[ix, 1], label = g, s = 7)
ax.legend()
ax.set_title('WifeEduc == 3 & HusbEduc == 3 & WifeRelig == 1')
plt.show()

plt.scatter(zz[:, 0], zz[:, 1], c = complete_y['HusbEduc'].astype(float))

vars_contributions(full_contra, zz, assoc_thr = 0.0, \
                   title = 'Contribution of the variables to the latent dimensions',\
                   storage_path = None)
#=============================
# Comparing associations structure
#=============================
import seaborn as sns
from dython.nominal import compute_associations, associations
from sklearn.metrics.pairwise import cosine_similarity

original_assoc = compute_associations(full_pima, nominal_columns = cat_features)
associations(full_pima, nominal_columns = cat_features)

Ez = out2['Ez.y']
vc = vars_contributions(completed_y2, Ez, assoc_thr = 0.0, \
                        title = 'Contribution of the variables to the latent dimensions',\
                        storage_path = None)
assoc = cosine_similarity(vc, dense_output=True)

labels = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
          'BMI', 'D.P. Function', 'Age', 'Outcome']

fig, axn = plt.subplots(1, 2, sharex=True, sharey=True, figsize = (12, 10))
cbar_ax = fig.add_axes([.91, .3, .03, .4])

# Assumed completion of the plotting calls: empirical associations on the left,
# model-based associations on the right, with a shared colorbar
sns.heatmap(original_assoc.abs(), ax=axn[0], cbar=True, cbar_ax=cbar_ax,
            xticklabels=labels, yticklabels=labels, vmin=0, vmax=1)
sns.heatmap(np.abs(assoc), ax=axn[1], cbar=False,
            xticklabels=labels, yticklabels=labels, vmin=0, vmax=1)
def MI2AMI(y, n_clusters, r, k, init, var_distrib, nj,\
           nan_mask, target_nb_pseudo_obs = 500, it = 50, \
           eps = 1E-05, maxstep = 100, seed = None, perform_selec = True,\
           dm = [], max_patience = 1): # dm: Hack to remove

    ''' Complete the missing values using a trained M1DGMM
    y (numobs x p ndarray): The observations containing mixed variables
    n_clusters (int): The number of clusters to look for in the data
    r (list): The dimension of latent variables through the first 2 layers
    k (list): The number of components of the latent Gaussian mixture layers
    init (dict): The initialisation parameters for the algorithm
    var_distrib (p 1darray): An array containing the types of the variables in y
    nj (p 1darray): For binary/count data: the maximum value that the variable can take.
                    For ordinal data: the number of different existing categories for each variable
    nan_mask (ndarray): A mask array equal to True when the observation value is missing, False otherwise
    target_nb_pseudo_obs (int): The number of pseudo-observations to generate
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increases by less than eps then the algorithm stops
    maxstep (int): The maximum number of optimisation steps for each variable
    seed (int): The random state seed to set (only for numpy-generated data for the moment)
    perform_selec (bool): Whether to perform architecture selection or not
    dm (np array): The distance matrix of the observations. If not given, M1DGMM computes it
    max_patience (int): The patience parameter passed to M1DGMM
    ------------------------------------------------------------------------------------------------
    returns (dict): The predicted classes, the likelihood through the EM steps, a continuous
                    representation of the data and the completed observations ('completed_y')
    '''

    # !!! Hack
    cols = y.columns

    # Formatting
    if not isinstance(nan_mask, np.ndarray): nan_mask = np.asarray(nan_mask)
    if not isinstance(y, np.ndarray): y = np.asarray(y)

    assert len(k) < 2 # Not implemented for deeper MDGMM for the moment

    # Keep complete observations only
    complete_y = y[~np.isnan(y.astype(float)).any(1)]
    completed_y = deepcopy(y)

    out = M1DGMM(complete_y, 'auto', r, k, init, var_distrib, nj, it,\
                 eps, maxstep, seed, perform_selec = perform_selec,\
                 dm = dm, max_patience = max_patience, use_silhouette = True)

    # Compute the associations
    vc = vars_contributions(pd.DataFrame(complete_y, columns = cols), out['Ez.y'], assoc_thr = 0.0, \
                            title = 'Contribution of the variables to the latent dimensions',\
                            storage_path = None)

    # Unpacking the model from the M1DGMM output
    #p = y.shape[1]
    k = out['best_k']
    r = out['best_r']
    mu = out['mu'][0]

    lambda_bin = np.array(out['lambda_bin'])
    lambda_ord = out['lambda_ord']
    lambda_categ = out['lambda_categ']
    lambda_cont = np.array(out['lambda_cont'])

    nj_bin = nj[pd.Series(var_distrib).isin(['bernoulli', 'binomial'])].astype(int)
    nj_ord = nj[var_distrib == 'ordinal'].astype(int)
    nj_categ = nj[var_distrib == 'categorical'].astype(int)

    nb_cont = np.sum(var_distrib == 'continuous')
    nb_bin = np.sum(var_distrib == 'binomial')

    y_std = complete_y[:, var_distrib == 'continuous'].astype(float).std(axis = 0, keepdims = True)

    cat_features = var_distrib != 'categorical'

    # Compute the associations between variables and use them as weights for the optimisation
    assoc = cosine_similarity(vc, dense_output=True)
    np.fill_diagonal(assoc, 0.0)
    assoc = np.abs(assoc)
    weights = assoc / assoc.sum(1, keepdims=True)

    #==============================================
    # Optimisation sandbox
    #==============================================

    # Define the observation generated by the center of each cluster
    cluster_obs = [impute(mu[kk, :, 0], var_distrib, lambda_bin, nj_bin, lambda_categ, nj_categ,\
                          lambda_ord, nj_ord, lambda_cont, y_std) for kk in range(k[0])]

    # Use only the observed variables as references
    types = {'bin': ['bernoulli', 'binomial'], 'categ': ['categorical'],\
             'cont': ['continuous'], 'ord': 'ordinal'}

    # Gradient optimisation
    nan_indices = np.where(nan_mask.any(1))[0]
    imputed_y = np.zeros_like(y)

    numobs = y.shape[0]

    #************************************
    # Linear constraint to stay in the support of continuous variables
    #************************************
    lb = np.array([])
    ub = np.array([])
    A = np.array([[]]).reshape((0, r[0]))

    if nb_bin > 0:
        ## Corrected Binomial bounds (ub is actually +inf)
        bin_indices = var_distrib[np.logical_or(var_distrib == 'bernoulli', var_distrib == 'binomial')]
        binomial_indices = bin_indices == 'binomial'

        lb_bin = np.nanmin(y[:, var_distrib == 'binomial'], 0)
        lb_bin = logit(lb_bin / nj_bin[binomial_indices]) - lambda_bin[binomial_indices, 0]
        ub_bin = np.nanmax(y[:, var_distrib == 'binomial'], 0)
        ub_bin = logit(ub_bin / nj_bin[binomial_indices]) - lambda_bin[binomial_indices, 0]
        A_bin = lambda_bin[binomial_indices, 1:]

        ## Concatenate the constraints
        lb = np.concatenate([lb, lb_bin])
        ub = np.concatenate([ub, ub_bin])
        A = np.concatenate([A, A_bin], axis = 0)

    if nb_cont > 0:
        ## Corrected Gaussian bounds
        lb_cont = np.nanmin(y[:, var_distrib == 'continuous'], 0) / y_std[0] - lambda_cont[:, 0]
        ub_cont = np.nanmax(y[:, var_distrib == 'continuous'], 0) / y_std[0] - lambda_cont[:, 0]
        A_cont = lambda_cont[:, 1:]

        ## Concatenate the constraints
        lb = np.concatenate([lb, lb_cont])
        ub = np.concatenate([ub, ub_cont])
        A = np.concatenate([A, A_cont], axis = 0)

    lc = LinearConstraint(A, lb, ub, keep_feasible = True)

    zz = []
    fun = []

    for i in range(numobs):
        if i in nan_indices:

            # Design the nan masks for the optimisation process
            nan_mask_i = nan_mask[i]
            weights_i = weights[nan_mask_i].mean(0)

            # Look for the best starting point
            cluster_dist = [error(y[i, ~nan_mask_i], obs[~nan_mask_i],\
                                  cat_features[~nan_mask_i], weights_i)\
                            for obs in cluster_obs]
            z02 = mu[np.argmin(cluster_dist), :, 0]

            # Formatting
            vars_i = {type_alias: np.where(~nan_mask_i[np.isin(var_distrib, vartype)])[0] \
                      for type_alias, vartype in types.items()}

            complete_categ = [l for idx, l in enumerate(lambda_categ) if idx in vars_i['categ']]
            complete_ord = [l for idx, l in enumerate(lambda_ord) if idx in vars_i['ord']]

            opt = minimize(stat_all, z02, \
                           args = (y[i, ~nan_mask_i], var_distrib[~nan_mask_i],\
                                   weights_i[~nan_mask_i],\
                                   lambda_bin[vars_i['bin']], nj_bin[vars_i['bin']],\
                                   complete_categ,\
                                   nj_categ[vars_i['categ']],\
                                   complete_ord,\
                                   nj_ord[vars_i['ord']],\
                                   lambda_cont[vars_i['cont']], y_std[:, vars_i['cont']]),
                           tol = eps, method = 'trust-constr', jac = grad_stat,\
                           constraints = lc, options = {'maxiter': 1000})

            z = opt.x
            zz.append(z)
            fun.append(opt.fun)

            imputed_y[i] = impute(z, var_distrib, lambda_bin, nj_bin, lambda_categ, nj_categ,\
                                  lambda_ord, nj_ord, lambda_cont, y_std)
        else:
            imputed_y[i] = y[i]

    completed_y = np.where(nan_mask, imputed_y, y)

    out['completed_y'] = completed_y
    out['zz'] = zz
    out['fun'] = fun

    return (out)
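
#======================================
# Example usage of MI2AMI (sketch)
#======================================
# A minimal usage sketch, not part of the original file. It assumes that `y` is a DataFrame
# containing np.nan for the missing entries and that `n_clusters`, `r`, `k`, `prince_init`,
# `var_distrib`, `nj`, `it`, `eps`, `maxstep` and `seed` are defined as in the test scripts
# above; none of these objects are created here.
nan_mask = y.isna().values

out = MI2AMI(y, n_clusters, r, k, prince_init, var_distrib, nj, nan_mask,
             it = it, eps = eps, maxstep = maxstep, seed = seed, perform_selec = False)

completed_y = out['completed_y']   # y with the missing entries imputed
latent_points = out['zz']          # latent coordinates found for each incomplete observation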