def compute_chsi(H, psi, mu_s, sigma_s):
    ''' Compute chsi as defined in equation (8) of the DGMM paper

    H (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda
        parameters for each layer
    psi (list of nb_layers elements of shape (K_l x r_l-1, r_l-1)): Psi
        parameters for each layer
    mu_s (list of nd-arrays): The means of the Gaussians starting at each
        layer (kept for interface symmetry, not read by this function)
    sigma_s (list of nd-arrays): The covariance matrices of the Gaussians
        starting at each layer
    --------------------------------------------------------------------------
    returns (list of ndarray): The chsi parameters for all paths starting at
        each layer
    '''
    nb_layers = len(H)
    nb_comp = [len(H_l) for H_l in H]

    def _loading_precision(layer):
        # Lambda^T Psi^{-1} Lambda, one matrix per component of the layer
        H_l = H[layer]
        return t(H_l, (0, 2, 1)) @ pinv(psi[layer]) @ H_l

    chsi = [0] * nb_layers

    # The last layer only depends on the covariance of the top Gaussian
    chsi[-1] = pinv(pinv(sigma_s[-1]) + _loading_precision(-1))

    # Remaining layers: one chsi per path starting at the layer
    for layer in range(nb_layers - 1):
        # Each component is duplicated once per path of the layers above
        nb_paths_above = np.prod(nb_comp[layer + 1:])
        precision = np.repeat(_loading_precision(layer), nb_paths_above,
                              axis=0)

        # The paths of the next layer are replicated for every component
        next_cov = np.tile(sigma_s[layer + 1], (nb_comp[layer], 1, 1))
        chsi[layer] = pinv(pinv(next_cov) + precision)

    return chsi
def draw_z_s(mu_s, sigma_s, eta, M):
    ''' Draw from f(z^{l} | s) for all s in Omega and return the centered
    and non-centered draws

    mu_s (list of nd-arrays): The means of the Gaussians starting at each
        layer
    sigma_s (list of nd-arrays): The covariance matrices of the Gaussians
        starting at each layer
    eta (list of nb_layers elements of shape (K_l x r_{l-1}, 1)): mu
        parameters for each layer
    M (list of int): The number of MC to draw on each layer
    -------------------------------------------------------------------------
    returns (tuple of 2 lists of ndarrays): the draws z^{l} | s for every
        layer, and the draws minus eta for every layer but the last one
    '''
    last_layer = len(mu_s) - 1
    dims = [mu_l.shape[1] for mu_l in mu_s]
    nb_paths = [mu_l.shape[0] for mu_l in mu_s]

    z_s, zc_s = [], []  # raw draws / centered draws

    for layer in range(last_layer + 1):
        # All the paths of a layer are drawn at once from a single Gaussian
        # with a block-diagonal covariance
        draws = multivariate_normal(
            mean=mu_s[layer].flatten(order='C'),
            cov=block_diag(*sigma_s[layer]),
            size=(M[layer], 1))
        draws = draws.reshape(M[layer], nb_paths[layer], dims[layer],
                              order='C')
        z_s.append(t(draws, (0, 2, 1)))

        # The last layer is already centered
        if layer == last_layer:
            continue

        eta_per_path = np.repeat(t(eta[layer], (2, 0, 1)),
                                 nb_paths[layer + 1], axis=1)
        zc_s.append(draws - eta_per_path)

    return z_s, zc_s
def identifiable_estim_DGMM(eta_old, H_old, psi_old, Ez, AT):
    ''' Enforce identifiability conditions for DGMM estimators

    eta_old (list of nb_layers elements of shape (K_l x r_{l-1}, 1)): mu
        estimators of the previous iteration for each layer
    H_old (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda
        estimators of the previous iteration for each layer
    psi_old (list of nb_layers elements of shape (K_l x r_l-1, r_l-1)): Psi
        estimators of the previous iteration for each layer
    Ez (list of ndarrays): E(z^{(l)}) for all l
    AT (list of ndarrays): Var(z^{(l)})^{-1/2 T} for all l
    -------------------------------------------------------------------------
    returns (tuple of length 3): "identifiable" estimators of eta, Lambda
        and Psi (1st condition)
    '''
    nb_layers = len(eta_old)
    eta_new = [[] for _ in range(nb_layers)]
    H_new = [[] for _ in range(nb_layers)]
    psi_new = [[] for _ in range(nb_layers)]

    for layer in range(nb_layers - 1, -1, -1):
        # Every estimator of the layer is rescaled by the pseudo-inverse of AT
        rescaling = pinv(AT[layer])

        centered_eta = eta_old[layer] - Ez[layer]
        eta_new[layer] = rescaling @ centered_eta
        H_new[layer] = rescaling @ H_old[layer]
        psi_new[layer] = rescaling @ psi_old[layer] @ t(rescaling, (0, 2, 1))

    return eta_new, H_new, psi_new
def compute_rho(eta, H, psi, mu_s, sigma_s, z_c, chsi):
    ''' Compute rho as defined in equation (8) of the DGMM paper

    eta (list of nb_layers elements of shape (K_l x r_{l-1}, 1)): mu
        parameters for each layer (not read by this function)
    H (list of nb_layers elements of shape (K_l x r_{l-1}, r_l)): Lambda
        parameters for each layer
    psi (list of nb_layers elements of shape (K_l x r_{l-1}, r_{l-1})): Psi
        parameters for each layer
    mu_s (list of nd-arrays): The means of the Gaussians starting at each
        layer
    sigma_s (list of nd-arrays): The covariance matrices of the Gaussians
        starting at each layer
    z_c (list of nd-arrays) z^{(l)} - eta^{(l)} for each layer.
    chsi (list of nd-arrays): The chsi parameters for each layer
    -----------------------------------------------------------------------
    returns (list of ndarrays): The rho parameters for all paths starting
        at each layer
    '''
    nb_comp = [len(H_l) for H_l in H]
    nb_comp_aug = nb_comp + [1]

    rho = []
    for layer in range(len(H)):
        tiling = (nb_comp[layer], 1, 1)

        # Contribution of the Gaussian starting at the next layer
        next_cov = np.tile(sigma_s[layer + 1], tiling)
        next_mean = np.tile(mu_s[layer + 1], tiling)
        prior_term = (pinv(next_cov) @ next_mean)[n_axis]

        # Contribution of the centered draws of the current layer
        H_psi_inv = t(H[layer], (0, 2, 1)) @ pinv(psi[layer])
        H_psi_inv = np.repeat(H_psi_inv, np.prod(nb_comp_aug[layer + 1:]),
                              axis=0)
        data_term = H_psi_inv[n_axis] @ z_c[layer][..., n_axis]

        rho.append(chsi[layer][n_axis] @ (data_term + prior_term))

    return rho
def draw_z2_z1s(chsi, rho, M, r):
    ''' Draw from f(z^{l+1} | z^{l}, s, Theta)

    chsi (list of nd-arrays): The chsi parameters for all paths starting at
        each layer
    rho (list of ndarrays): The rho parameters for all paths starting at
        each layer
    M (list of int): The number of MC to draw on each layer
    r (list of int): The dimension of each layer
    ---------------------------------------------------------------------------
    returns (list of nd-arrays): z^{l+1} | z^{l}, s, Theta for all (l,s),
        one array of shape (M[l], M[l + 1], S[l], r[l + 1]) per layer
    '''
    nb_paths = [chsi_l.shape[0] for chsi_l in chsi]

    z2_z1s = []
    for layer, chsi_l in enumerate(chsi):
        n_cur, n_next = M[layer], M[layer + 1]
        dim_next = r[layer + 1]

        draws_l = np.zeros((n_next, n_cur, nb_paths[layer], dim_next))

        for path in range(nb_paths[layer]):
            # The M[l] conditioning points of a path share the same
            # covariance, hence the repeated block-diagonal structure
            path_mean = rho[layer][:, path].flatten(order='C')
            path_cov = block_diag(
                *np.repeat(chsi_l[path][n_axis], n_cur, axis=0))

            flat_draws = multivariate_normal(size=n_next, mean=path_mean,
                                             cov=path_cov)
            draws_l[:, :, path] = flat_draws.reshape(n_next, n_cur, dim_next,
                                                     order='C')

        # Put the axis of the conditioning draws first
        z2_z1s.append(t(draws_l, (1, 0, 2, 3)))

    return z2_z1s
def identifiable_estim_DDGMM(eta_old, H_old, psi_old, Ez, AT):
    ''' Ensure that the latent variables are centered reduced
    (1st DGMM identifiability condition)

    eta_old (list of nb_layers elements of shape (K_l x r_{l-1}, 1)): mu
        estimators of the previous iteration for each layer
    H_old (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda
        estimators of the previous iteration for each layer
    psi_old (list of nb_layers elements of shape (K_l x r_l-1, r_l-1)): Psi
        estimators of the previous iteration for each layer
    Ez (list of ndarrays): E(z^{(l)}) for each layer
    AT (list of ndarrays): Var(z^{(l)})^{-1/2 T} for each layer
    -------------------------------------------------------------------------
    returns (tuple of length 3): "DDGMM identifiable" estimators of eta,
        Lambda and Psi
    '''
    nb_layers = len(eta_old)
    eta_new = [[] for _ in range(nb_layers)]
    H_new = [[] for _ in range(nb_layers)]
    psi_new = [[] for _ in range(nb_layers)]

    layer = nb_layers
    while layer > 0:
        layer -= 1
        scale = pinv(AT[layer])
        scale_T = t(scale, (0, 2, 1))

        # Identifiability: rescale Psi on both sides, Lambda and the
        # centered eta on the left
        psi_new[layer] = scale @ psi_old[layer] @ scale_T
        H_new[layer] = scale @ H_old[layer]
        eta_new[layer] = scale @ (eta_old[layer] - Ez[layer])

    return eta_new, H_new, psi_new
def linorm2D(S, nit):
    """Power iteration

    Runs a power iteration on S S.T starting from a random unit row vector
    and returns the inverse of the norm of the last iterate multiplied by S.

    Parameters
    ----------
    S: array
        2-D operator for which we seek the Lipschitz constant
    nit: int
        maximum number of iterations (at least 1)

    Returns
    -------
    float
        1 / ||x0 S|| for the last iterate x0, used as the inverse of the
        Lipschitz constant estimate

    Raises
    ------
    ValueError
        if nit is smaller than 1 (no iterate would be computed) or if S is
        not 2-dimensional
    """
    if nit < 1:
        raise ValueError('nit must be at least 1')

    n1, _ = np.shape(S)

    # Random unit-norm starting row vector
    x0 = np.random.rand(1, n1)
    x0 = x0 / np.sqrt(np.sum(x0**2))

    for _ in range(nit):
        x = np.dot(x0, S)
        xn = np.sqrt(np.sum(x**2))
        xp = x / xn

        y = np.dot(xp, S.T)
        yn = np.sqrt(np.sum(y**2))

        # Stop when the new direction no longer improves on the previous
        # one. The original called the non-existent `np.t`; the transpose
        # of x0 is what is needed to form the scalar product.
        if yn < np.dot(y, x0.T).item():
            break
        x0 = y / yn

    return 1. / xn
def compute_z_moments(w_s, eta_old, H_old, psi_old):
    ''' Compute the first moment of the latent variables and a Cholesky
    factor of their variance

    w_s (ndarray of length s1): The path probabilities for all s in S1
    eta_old (list of nb_layers elements of shape (K_l x r_{l-1}, 1)): eta
        estimators of the previous iteration for each layer
    H_old (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda
        estimators of the previous iteration for each layer
    psi_old (list of nb_layers elements of shape (K_l x r_l-1, r_l-1)): Psi
        estimators of the previous iteration for each layer
    -------------------------------------------------------------------------
    returns (tuple of length 2): E(z^{(l)}) for each layer and the output of
        `cholesky` applied to Var(z^{(l)}) for each layer
    raises RuntimeError: if `ensure_psd` fails on a layer variance
    '''
    k = [eta.shape[0] for eta in eta_old]
    L = len(eta_old)

    Ez = [[] for l in range(L)]
    AT = [[] for l in range(L)]

    # One axis per layer, so that marginal component weights are plain sums
    w_reshaped = w_s.reshape(*k, order='C')

    for l in reversed(range(L)):
        # Marginal weight of each component of layer l
        idx_to_sum = tuple(set(range(L)) - set([l]))
        wl = w_reshaped.sum(idx_to_sum)[..., n_axis, n_axis]

        # Compute E(z^{(l)})
        Ezl = (wl * eta_old[l]).sum(0, keepdims=True)
        Ez[l] = Ezl

        # Var(z) = E(z z^T) - E(z) E(z)^T
        etaTeta = eta_old[l] @ t(eta_old[l], (0, 2, 1))
        HlHlT = H_old[l] @ t(H_old[l], (0, 2, 1))
        E_zlzlT = (wl * (HlHlT + psi_old[l] + etaTeta)).sum(0, keepdims=True)
        var_zl = E_zlzlT - Ezl @ t(Ezl, (0, 2, 1))

        try:
            var_zl = ensure_psd([var_zl])[0]  # Numeric stability check
        except Exception as err:
            # Only genuine errors are converted: a bare except would also
            # turn KeyboardInterrupt / SystemExit into a RuntimeError
            print(var_zl)
            raise RuntimeError('Var z1 was not psd') from err

        AT[l] = cholesky(var_zl)

    return Ez, AT
def compute_path_params(eta, H, psi):
    ''' Compute the gaussian parameters for each path

    eta (list of nb_layers elements of shape (K_l x r_{l-1}, 1)): mu
        parameters for each layer
    H (list of nb_layers elements of shape (K_l x r_{l-1}, r_l)): Lambda
        parameters for each layer
    psi (list of nb_layers elements of shape (K_l x r_{l-1}, r_{l-1})): Psi
        parameters for each layer
    --------------------------------------------------------------------------
    returns (tuple of len 2): The updated parameters mu_s and sigma_s for
        all s in Omega (nb_layers + 1 entries each)
    '''
    # Model dimensions
    nb_layers = len(H)
    nb_comp = [len(H_l) for H_l in H]
    # The layer above the last one has a single component
    nb_comp_aug = nb_comp + [1]
    dims = [H[0].shape[1]] + [H_l.shape[2] for H_l in H]

    mu_s = [0] * (nb_layers + 1)
    sigma_s = [0] * (nb_layers + 1)

    # The top Gaussian is standard: zero mean and identity covariance
    mu_s[-1] = np.zeros((1, dims[-1], 1))  # Swap k and r later
    sigma_s[-1] = np.eye(dims[-1])[n_axis]

    # Propagate the parameters from the top layer down to the first one
    for layer in range(nb_layers - 1, -1, -1):
        nb_paths_above = np.prod(nb_comp_aug[layer + 1:])
        tiling = (nb_comp[layer], 1, 1)

        H_paths = np.repeat(H[layer], nb_paths_above, axis=0)
        eta_paths = np.repeat(eta[layer], nb_paths_above, axis=0)
        psi_paths = np.repeat(psi[layer], nb_paths_above, axis=0)

        upper_mean = np.tile(mu_s[layer + 1], tiling)
        upper_cov = np.tile(sigma_s[layer + 1], tiling)

        mu_s[layer] = eta_paths + H_paths @ upper_mean
        sigma_s[layer] = H_paths @ upper_cov @ t(H_paths, (0, 2, 1)) \
            + psi_paths

    return mu_s, sigma_s
def draw_z_s_all_network(mu_s_c, sigma_s_c, mu_s_d, sigma_s_d, yc, eta_c,
                         eta_d, S_1L, L, M):
    ''' Draw z^{(l)h} from both heads and then from the tail

    mu_s_* (list of nd-arrays): The means of the Gaussians starting at each
        layer of the head *
    sigma_s_* (list of nd-arrays): The covariance matrices of the Gaussians
        of the head *
    yc (n x p_continuous): The continuous data
    eta_* (list of nb_layers elements of shape (K_l x r_{l-1}, 1)): mu
        parameters for each layer of head *
    S_1L (dict): The number of paths starting at each layer
    L (dict): The number of layers on each head or tail
    M (dict): The number of MC points to draw on each head or tail
    --------------------------------------------------------------------------
    returns (tuple of length 4): The latent variables existing in the
        network: draws and centered draws for the continuous head, then for
        the discrete head

    NOTE(review): draw_z_s is called here with a fifth positional argument
    (False / True); confirm that the draw_z_s in scope accepts it.
    '''
    nb_layers_c = L['c']
    nb_layers_d = L['d']

    #============================
    # Continuous head.
    #============================
    # The first z of the continuous head is the data itself: it is not
    # resimulated, only the layers above it are drawn
    upper_z_c, upper_zc_c = draw_z_s(mu_s_c[1:], sigma_s_c[1:],
                                     eta_c[1:], M['c'][1:], False)

    # The data are replicated along a new trailing axis
    yc_rep = np.repeat(yc[..., n_axis], S_1L['c'][0], -1)
    z_s_c = [yc_rep] + upper_z_c

    # Center the replicated data with the eta of the first layer
    eta_rep = np.repeat(eta_c[0], S_1L['c'][1], axis=0)
    yc_centered_rep = t(yc_rep - eta_rep.T, (0, 2, 1))
    zc_s_c = [yc_centered_rep] + upper_zc_c

    #============================
    # Discrete head.
    #============================
    head_z_d, head_zc_d = draw_z_s(mu_s_d[:nb_layers_d],
                                   sigma_s_d[:nb_layers_d],
                                   eta_d[:nb_layers_d],
                                   M['d'][:nb_layers_d], True)

    #============================
    # Common tail
    #============================
    # The samples of the tail are shared by both heads
    tail_start = nb_layers_c + 1
    z_s_d = head_z_d + z_s_c[tail_start:]
    zc_s_d = head_zc_d + zc_s_c[tail_start:]

    return z_s_c, zc_s_c, z_s_d, zc_s_d
def log_py_zM_cont_j(lambda_cont_j, y_cont_j, zM, k):
    ''' Compute log p(y_j | zM, s1 = k1) of the jth continuous variable

    lambda_cont_j ( (r + 1) 1darray): Coefficients of the continuous
        distributions in the GLLVM layer (intercept first)
    y_cont_j (numobs 1darray): The values of the jth continuous variable
    zM (M x r x k ndarray): M Monte Carlo copies of z for each component k1
        of the mixture
    k (int): The number of components of the mixture (not read here)
    --------------------------------------------------------------
    returns (M x numobs x k ndarray): -0.5 * (log(2 pi) + (y - eta)^2) for
        each draw, observation and component
    '''
    nb_draws, dim = zM.shape[0], zM.shape[1]

    # Linear predictor: slopes applied to z, then the constant is added
    slopes = lambda_cont_j[1:].reshape(1, dim, 1, order='C')
    intercept = lambda_cont_j[0].reshape(1, 1, 1)
    linear_pred = np.transpose(zM, (0, 2, 1)) @ slopes
    linear_pred = linear_pred + intercept

    # One copy of the observations per Monte Carlo draw
    y_rep = np.repeat(y_cont_j[np.newaxis], axis=0, repeats=nb_draws)
    y_rep = np.expand_dims(y_rep, 1)

    squared_residuals = (y_rep - linear_pred)**2
    return t(-0.5 * (np.log(2 * np.pi) + squared_residuals), (0, 2, 1))
def M1DGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \
          eps = 1E-05, maxstep = 100, seed = None, perform_selec = True,\
          dm = [], max_patience = 1, use_silhouette = True):# dm small hack to remove
    ''' Fit a Generalized Linear Mixture of Latent Variables Model (GLMLVM)

    y (numobs x p ndarray): The observations containing mixed variables
    n_clusters (int or 'auto'): The number of clusters to look for in the
        data. With 'auto' the first layer is used as clustering layer
    r (list): The dimension of latent variables through the first 2 layers
    k (list): The number of components of the latent Gaussian mixture layers
    init (dict): The initialisation parameters for the algorithm
    var_distrib (p 1darray): An array containing the types of the variables
        in y
    nj (p 1darray): For binary/count data: The maximum values that the
        variable can take. For ordinal data: the number of different existing
        categories for each variable
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increase by less than eps then the
        algorithm stops
    maxstep (int): The maximum number of optimisation step for each variable
    seed (int): The random state seed to set (Only for numpy generated data
        for the moment)
    perform_selec (Bool): Whether to perform architecture selection or not
    dm (ndarray): Precomputed distance matrix used for the silhouette score.
        If empty, the Gower matrix of y is computed
    max_patience (int): The iterations go on while the number of consecutive
        iterations without likelihood increase is at most max_patience
    use_silhouette (Bool): If True use the silhouette as quality criterion
        (best for clustering) else use the likelihood (best for data
        augmentation).
    --------------------------------------------------------------------------
    returns (dict): The predicted classes, the likelihood through the EM
        steps and a continuous representation of the data
    raises ValueError: if a variable type is not understood or if n_clusters
        is neither numeric nor 'auto'
    '''

    prev_lik = - 1E16
    best_lik = -1E16

    best_sil = -1
    new_sil = -1

    tol = 0.01
    patience = 0
    is_looking_for_better_arch = False

    # Initialize the parameters
    eta = deepcopy(init['eta'])
    psi = deepcopy(init['psi'])
    lambda_bin = deepcopy(init['lambda_bin'])
    lambda_ord = deepcopy(init['lambda_ord'])
    lambda_cont = deepcopy(init['lambda_cont'])
    lambda_categ = deepcopy(init['lambda_categ'])

    H = deepcopy(init['H'])
    w_s = deepcopy(init['w_s']) # Probability of path s' through the network for all s' in Omega

    numobs = len(y)
    likelihood = []
    silhouette = []
    it_num = 0
    ratio = 1000
    # Seed numpy's global generator. The original assigned the seed to
    # `np.random.seed`, which replaced the function and seeded nothing.
    np.random.seed(seed)
    out = {} # Store the full output

    # Dispatch variables between categories
    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',var_distrib == 'binomial')]
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',var_distrib == 'binomial')].astype(int)
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal']
    nj_ord = nj[var_distrib == 'ordinal'].astype(int)
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical'].astype(int)
    nb_categ = len(nj_categ)

    y_cont = y[:, var_distrib == 'continuous'].astype(float)
    nb_cont = y_cont.shape[1]

    # Set y_cont standard error to 1
    y_cont = y_cont / y_cont.std(axis = 0, keepdims = True)

    L = len(k)
    k_aug = k + [1]
    S = np.array([np.prod(k_aug[l:]) for l in range(L + 1)])
    M = M_growth(1, r, numobs)

    assert nb_bin + nb_ord + nb_cont + nb_categ > 0
    if nb_bin + nb_ord + nb_cont + nb_categ != len(var_distrib):
        raise ValueError('Some variable types were not understood,'
                         ' existing types are: continuous, categorical,'
                         ' ordinal, binomial and bernoulli')

    # Compute the Gower matrix
    if len(dm) == 0:
        cat_features = np.logical_or(var_distrib == 'categorical', var_distrib == 'bernoulli')
        dm = gower_matrix(y, cat_features = cat_features)

    # Do not stop the iterations if there are some iterations left or if the likelihood is increasing
    # or if we have not reached the maximum patience and if a new architecture was looked for
    # in the previous iteration
    while ((it_num < it) & (ratio > eps) & (patience <= max_patience)) | is_looking_for_better_arch:
        print(it_num)

        # The clustering layer is the one used to perform the clustering
        # i.e. the layer l such that k[l] == n_clusters
        if not(isnumeric(n_clusters)):
            if n_clusters == 'auto':
                clustering_layer = 0
            else:
                raise ValueError('Please enter an int or "auto" for n_clusters')
        else:
            assert (np.array(k) == n_clusters).any()
            clustering_layer = np.argmax(np.array(k) == n_clusters)

        #####################################################################################
        ################################# S step ############################################
        #####################################################################################

        #=====================================================================
        # Draw from f(z^{l} | s, Theta) for all s in Omega
        #=====================================================================

        mu_s, sigma_s = compute_path_params(eta, H, psi)
        sigma_s = ensure_psd(sigma_s)
        z_s, zc_s = draw_z_s(mu_s, sigma_s, eta, M)

        #========================================================================
        # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1
        #========================================================================

        chsi = compute_chsi(H, psi, mu_s, sigma_s)
        chsi = ensure_psd(chsi)
        rho = compute_rho(eta, H, psi, mu_s, sigma_s, zc_s, chsi)

        # In the following z2 and z1 will denote z^{l+1} and z^{l} respectively
        z2_z1s = draw_z2_z1s(chsi, rho, M, r)

        #=======================================================================
        # Compute the p(y| z1) for all variable categories
        #=======================================================================

        py_zl1 = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord, \
                        lambda_categ, y_categ, nj_categ, y_cont, lambda_cont, z_s[0])

        #========================================================================
        # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s
        #========================================================================

        zl1_ys = draw_zl1_ys(z_s, py_zl1, M)

        #####################################################################################
        ################################# E step ############################################
        #####################################################################################

        #=====================================================================
        # Compute conditional probabilities used in the appendix of asta paper
        #=====================================================================

        pzl1_ys, ps_y, p_y = E_step_GLLVM(z_s[0], mu_s[0], sigma_s[0], w_s, py_zl1)

        #=====================================================================
        # Compute p(z^{(l)}| s, y). Equation (5) of the paper
        #=====================================================================

        pz2_z1s = fz2_z1s(t(pzl1_ys, (1, 0, 2)), z2_z1s, chsi, rho, S)
        pz_ys = fz_ys(t(pzl1_ys, (1, 0, 2)), pz2_z1s)

        #=====================================================================
        # Compute MFA expectations
        #=====================================================================

        Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys = \
            E_step_DGMM(zl1_ys, H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, S)

        ###########################################################################
        ############################ M step #######################################
        ###########################################################################

        #=======================================================
        # Compute MFA Parameters
        #=======================================================

        w_s = np.mean(ps_y, axis = 0)
        eta, H, psi = M_step_DGMM(Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys, ps_y, H, k)

        #=======================================================
        # Identifiability conditions
        #=======================================================

        # Update eta, H and Psi values
        H = diagonal_cond(H, psi)
        Ez, AT = compute_z_moments(w_s, eta, H, psi)
        eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

        del(Ez)

        #=======================================================
        # Compute GLLVM Parameters
        #=======================================================

        lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)

        lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)

        lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)

        lambda_cont = cont_params_GLLVM(y_cont, lambda_cont, ps_y, pzl1_ys, z_s[0], AT[0],\
                     tol = tol, maxstep = maxstep)

        ###########################################################################
        ################## Clustering parameters updating #########################
        ###########################################################################

        new_lik = np.sum(np.log(p_y))
        likelihood.append(new_lik)
        # The silhouette stored here is the one computed at the previous
        # iteration (new_sil is only refreshed below)
        silhouette.append(new_sil)
        ratio = abs((new_lik - prev_lik)/prev_lik)

        # Marginalise the path probabilities over every layer but the
        # clustering one (axis 0 of the reshaped array is the observations)
        idx_to_sum = tuple(set(range(1, L + 1)) - set([clustering_layer + 1]))
        psl_y = ps_y.reshape(numobs, *k, order = 'C').sum(idx_to_sum)

        temp_class = np.argmax(psl_y, axis = 1)
        try:
            new_sil = silhouette_score(dm, temp_class, metric = 'precomputed')
        except ValueError:
            new_sil = -1

        # Store the params according to the silhouette or likelihood
        is_better = (best_sil < new_sil) if use_silhouette else (best_lik < new_lik)

        if is_better:
            z = (ps_y[..., n_axis] * Ez_ys[clustering_layer]).sum(1)
            best_sil = deepcopy(new_sil)
            classes = deepcopy(temp_class)

            # Store the output
            out['classes'] = deepcopy(classes)
            out['best_z'] = deepcopy(z_s[0])
            out['Ez.y'] = z
            out['best_k'] = deepcopy(k)
            out['best_r'] = deepcopy(r)
            out['best_w_s'] = deepcopy(w_s)
            out['lambda_bin'] = deepcopy(lambda_bin)
            out['lambda_ord'] = deepcopy(lambda_ord)
            out['lambda_categ'] = deepcopy(lambda_categ)
            out['lambda_cont'] = deepcopy(lambda_cont)

            out['eta'] = deepcopy(eta)
            out['mu'] = deepcopy(mu_s)
            out['sigma'] = deepcopy(sigma_s)

            out['psl_y'] = deepcopy(psl_y)
            out['ps_y'] = deepcopy(ps_y)

        # Refresh the classes only if they provide a better explanation of the data
        # NOTE(review): best_lik is set to prev_lik, not new_lik - confirm
        # this is intended
        if best_lik < new_lik:
            best_lik = deepcopy(prev_lik)

        if prev_lik < new_lik:
            patience = 0
            M = M_growth(it_num + 2, r, numobs)
        else:
            patience += 1

        ###########################################################################
        ######################## Parameter selection  #############################
        ###########################################################################
        min_nb_clusters = 2

        if isnumeric(n_clusters): # To change when add multi mode
            is_not_min_specif = not(np.all(np.array(k) == n_clusters) & np.array_equal(r, [2,1]))
        else:
            is_not_min_specif = not(np.all(np.array(k) == min_nb_clusters) & np.array_equal(r, [2,1]))

        is_looking_for_better_arch = look_for_simpler_network(it_num) & perform_selec & is_not_min_specif

        if is_looking_for_better_arch:
            r_to_keep = r_select(y_bin, y_ord, y_categ, y_cont, zl1_ys, z2_z1s, w_s)

            # If r_l == 0, delete the last l + 1: layers
            new_L = np.sum([len(rl) != 0 for rl in r_to_keep]) - 1

            k_to_keep = k_select(w_s, k, new_L, clustering_layer, not(isnumeric(n_clusters)))

            is_L_unchanged = (L == new_L)
            is_r_unchanged = np.all([len(r_to_keep[l]) == r[l] for l in range(new_L + 1)])
            is_k_unchanged = np.all([len(k_to_keep[l]) == k[l] for l in range(new_L)])

            is_selection = not(is_r_unchanged & is_k_unchanged & is_L_unchanged)

            assert new_L > 0

            if is_selection:

                eta = [eta[l][k_to_keep[l]] for l in range(new_L)]
                eta = [eta[l][:, r_to_keep[l]] for l in range(new_L)]

                H = [H[l][k_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, r_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, :, r_to_keep[l + 1]] for l in range(new_L)]

                psi = [psi[l][k_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, r_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, :, r_to_keep[l]] for l in range(new_L)]

                if nb_bin > 0:
                    # Add the intercept:
                    bin_r_to_keep = np.concatenate([[0], np.array(r_to_keep[0]) + 1])
                    lambda_bin = lambda_bin[:, bin_r_to_keep]

                if nb_ord > 0:
                    # Intercept coefficients handling is a little more complicated here
                    lambda_ord_intercept = [lambda_ord_j[:-r[0]] for lambda_ord_j in lambda_ord]
                    Lambda_ord_var = np.stack([lambda_ord_j[-r[0]:] for lambda_ord_j in lambda_ord])
                    Lambda_ord_var = Lambda_ord_var[:, r_to_keep[0]]
                    lambda_ord = [np.concatenate([lambda_ord_intercept[j], Lambda_ord_var[j]])\
                                  for j in range(nb_ord)]

                # To recheck
                if nb_cont > 0:
                    # Add the intercept:
                    cont_r_to_keep = np.concatenate([[0], np.array(r_to_keep[0]) + 1])
                    lambda_cont = lambda_cont[:, cont_r_to_keep]

                if nb_categ > 0:
                    lambda_categ_intercept = [lambda_categ[j][:, 0]  for j in range(nb_categ)]
                    Lambda_categ_var = [lambda_categ_j[:,-r[0]:] for lambda_categ_j in lambda_categ]
                    # Select the kept dimensions among the latent
                    # coefficients only. The original indexed the full
                    # matrices (intercept column included), which shifted
                    # the selection by one column.
                    Lambda_categ_var = [Lambda_categ_var_j[:, r_to_keep[0]] for Lambda_categ_var_j in Lambda_categ_var]

                    lambda_categ = [np.hstack([lambda_categ_intercept[j][..., n_axis], Lambda_categ_var[j]])\
                                   for j in range(nb_categ)]

                w = w_s.reshape(*k, order = 'C')
                new_k_idx_grid = np.ix_(*k_to_keep[:new_L])

                # If layer deletion, sum the last components of the paths
                if L > new_L:
                    deleted_dims = tuple(range(L)[new_L:])
                    w_s = w[new_k_idx_grid].sum(deleted_dims).flatten(order = 'C')
                else:
                    w_s = w[new_k_idx_grid].flatten(order = 'C')

                w_s /= w_s.sum()

                # Refresh the classes: TO RECHECK
                #idx_to_sum = tuple(set(range(1, L + 1)) - set([clustering_layer + 1]))
                #ps_y_tmp = ps_y.reshape(numobs, *k, order = 'C').sum(idx_to_sum)
                #np.argmax(ps_y_tmp[:, k_to_keep[0]], axis = 1)

                k = [len(k_to_keep[l]) for l in range(new_L)]
                r = [len(r_to_keep[l]) for l in range(new_L + 1)]

                k_aug = k + [1]
                S = np.array([np.prod(k_aug[l:]) for l in range(new_L + 1)])
                L = new_L

                patience = 0

                # Identifiability conditions
                H = diagonal_cond(H, psi)
                Ez, AT = compute_z_moments(w_s, eta, H, psi)
                eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

                del(Ez)

                print('New architecture:')
                print('k', k)
                print('r', r)
                print('L', L)
                print('S',S)
                print("w_s", len(w_s))

        prev_lik = deepcopy(new_lik)
        it_num = it_num + 1
        print(likelihood)
        print(silhouette)

    out['likelihood'] = likelihood
    out['silhouette'] = silhouette

    return(out)
def E_step_DGMM_t(H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1st, S_1L, L, k_1L):
    ''' Compute the expectations of the E step for all DGMM layers of the
    common tail

    H (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda
        parameters for each layer
    z_s (list of nd-arrays): zl | s^l for all s^l and all l.
    zc_s (list of nd-arrays): (zl | s^l) - eta{k_l}^{(l)} for all s^l and
        all l.
    z2_z1s (list of ndarrays): z^{(l + 1)}| z^{(l)}, s
    pz_ys (list of ndarrays): p(z^{l} | y, s)
    pz2_z1st (list of ndarrays): p(z^{(l)}| z^{(l-1)}, y)
    S_1L (dict): The number of paths starting at each layer in the network
        (only the 't' entry is read here)
    L (dict): The number of layers on each head or tail (only the 't' entry
        is read here)
    k_1L (dict): The number of component on each layer including the common
        layers (only the 't' entry is read here)
    ------------------------------------------------------------
    returns (tuple of 4 lists of ndarrays): E(z^{(l)} | y, s),
        E(z^{(l)}z^{(l+1)T} | y, s), E(z^{(l+1)}z^{(l+1)T} | y, s),
        E(e | y, s) with e = z^{(l)} - eta{k_l}^{(l)} - Lambda @ z^{(l + 1)}
    '''

    Ez_ys = []
    E_z1z2T_ys = []
    E_z2z2T_ys = []
    EeeT_ys = []

    kt = k_1L['t']

    # One pass per pair of consecutive tail layers (l, l + 1)
    for l in range(L['t'] - 1):

        #===============================================
        # Broadcast the quantities to the right shape
        #===============================================

        # Two singleton axes are inserted after the first one, then the
        # draws are tiled S_1L['t'][0] // S_1L['t'][l] times along the
        # fourth axis
        z1_s = z_s[l].transpose((0, 2, 1))
        z1_s = np.expand_dims(np.expand_dims(z1_s, 1), 2)
        z1_s = np.tile(z1_s, (1, 1, 1, S_1L['t'][0] // S_1L['t'][l], 1))

        # Same broadcasting for the centered draws (no transposition here)
        z1c_s = np.expand_dims(np.expand_dims(zc_s[l], 1), 2)
        z1c_s = np.tile(z1c_s, (1, 1, 1, S_1L['t'][0] // S_1L['t'][l], 1))

        # Same broadcasting for the draws of the next layer
        z2_s = t(z_s[l + 1], (0, 2, 1))
        z2_s = np.expand_dims(np.expand_dims(z2_s, 1), 2)
        z2_s = np.tile(z2_s, (1, 1, 1, S_1L['t'][0] // S_1L['t'][l + 1], 1))

        # A trailing axis is added so that the densities broadcast against
        # the draws
        pz1_ys = pz_ys[l][..., n_axis]
        pz2_ys = pz_ys[l + 1][..., n_axis]
        pz2_z1s = np.expand_dims(np.expand_dims(pz2_z1st[l], 2), 2)[n_axis]

        # E(z^{l} | y, s) is only appended for the first layer: for the
        # following layers it was appended as E(z^{l + 1} | y, s) during
        # the previous pass
        if l == 0:
            Ez_ys_l = (pz1_ys * z1_s[n_axis]).sum(1)
            Ez_ys.append(Ez_ys_l)

        # Lambda is tiled over the components of the layers before l and
        # repeated S_1L['t'][l + 1] times, then given three leading axes
        H_formated = np.tile(H[l], (np.prod(kt[:l]), 1, 1))
        H_formated = np.repeat(H_formated, S_1L['t'][l + 1], axis=0)
        H_formated = H_formated[n_axis, n_axis, n_axis]

        #===============================================
        # Compute the expectations
        #===============================================

        #=========================================================
        # E(z^{l + 1} | z^{l}, s) = sum_M^{l + 1} z^{l + 1}
        #=========================================================

        # Mean over axis 1 of the conditional draws
        E_z2_z1s = z2_z1s[l].mean(1)
        E_z2_z1s = np.tile(E_z2_z1s, (1, S_1L['t'][0] // S_1L['t'][l], 1))

        #=========================================================
        # E(z^{l + 1}z^{l + 1}^T | z^{l}, s)
        #=========================================================

        # Outer product of each conditional draw with itself, averaged
        # over axis 1
        E_z2z2T_z1s = (z2_z1s[l][..., n_axis] @ \
                      np.expand_dims(z2_z1s[l], 3)).mean(1)
        E_z2z2T_z1s = np.tile(E_z2z2T_z1s, (1, S_1L['t'][0] // S_1L['t'][l], 1, 1))

        # Create a new axis for the information coming from the discrete head
        E_z2_z1s = np.expand_dims(np.expand_dims(E_z2_z1s, 1), 1)
        E_z2z2T_z1s = np.expand_dims(np.expand_dims(E_z2z2T_z1s, 1), 1)

        #==========================================================
        # E(z^{l + 1} | y, s) = integral_z^l [ p(z^l | y, s) * E(z^{l + 1} | z^l, s) ]
        #==========================================================

        E_z2_ys_l = (pz2_ys * z2_s[n_axis]).sum(1)
        Ez_ys.append(E_z2_ys_l)

        #=========================================================
        # E(z^{l}z^{l + 1}^T | y, s)
        #=========================================================

        E_z1z2T_ys_l = (pz1_ys[..., n_axis] * \
                       (z1_s[..., n_axis] @ np.expand_dims(E_z2_z1s, 4))[n_axis]).sum(1)
        E_z1z2T_ys.append(E_z1z2T_ys_l)

        #=========================================================
        # E(z^{l + 1}z^{l + 1}^T | y, s)
        #=========================================================

        E_z2z2T_ys_l = (pz1_ys[..., n_axis] * E_z2z2T_z1s[n_axis]).sum(1)
        E_z2z2T_ys.append(E_z2z2T_ys_l)

        #=========================================================
        # E[((z^l - eta^l) - Lambda z^{l + 1})((z^l - eta^l) - Lambda z^{l + 1})^T | y, s]
        #=========================================================

        # Joint weights: product of pz_ys[l] and the broadcast pz2_z1st[l]
        pz1z2_ys = np.expand_dims(pz_ys[l], 2) * pz2_z1s
        pz1z2_ys = pz1z2_ys[..., n_axis, n_axis]

        # Residual e = (z^l - eta^l) - Lambda z^{l + 1}; the transposition
        # moves the last axis of the product first so that it broadcasts
        # against the centered draws
        e = np.expand_dims(z1c_s, 1) - t(H_formated @ z2_s[..., n_axis], (5, 0, 1, 2, 3, 4))
        e = e[..., n_axis]
        eeT = e @ t(e, (0, 1, 2, 3, 4, 6, 5))
        eeT = eeT[n_axis]

        # Weighted sum of e e^T over axes 1 and 2
        EeeT_ys_l = (pz1z2_ys * eeT).sum((1, 2))
        EeeT_ys.append(EeeT_ys_l)

    return Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys
def DDGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \
          eps = 1E-05, maxstep = 100, seed = None, perform_selec = True):
    ''' Fit a Generalized Linear Mixture of Latent Variables Model (GLMLVM)
    on discrete data with an MCEM loop (MC draws, E step, M step), keeping
    the classes with the best silhouette score and optionally pruning the
    architecture (dimensions r, components k, number of layers).

    y (numobs x p ndarray): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    r (list): The dimension of latent variables through the first 2 layers
    k (list): The number of components of the latent Gaussian mixture layers
    init (dict): The initialisation parameters for the algorithm. The keys
                 read here are 'eta', 'psi', 'lambda_bin', 'lambda_ord',
                 'lambda_categ', 'H' and 'w_s'
    var_distrib (p 1darray): An array containing the types of the variables in y
    nj (p 1darray): For binary/count data: The maximum values that the variable can take.
                    For ordinal data: the number of different existing categories for each variable
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increase by less than eps then the algorithm stops
    maxstep (int): The maximum number of optimisation step for each variable
    seed (int): The random state seed to set (Only for numpy generated data for the moment)
    perform_selec (Bool): Whether to perform architecture selection or not
    ------------------------------------------------------------------------------------------------
    returns (dict): The predicted classes ('classes'), the likelihood through
                    the EM steps ('likelihood'), a continuous representation
                    of the data ('z') and the last selected architecture
                    ('best_r', 'best_k')
    '''

    prev_lik = -1E16
    best_lik = -1E16
    tol = 0.01
    max_patience = 1
    patience = 0

    best_k = deepcopy(k)
    best_r = deepcopy(r)

    # Start strictly below the fallback score (-1) used when the silhouette
    # cannot be computed, so that the first iteration always defines
    # `classes` and `z` (same convention as in MDGMM).
    best_sil = -1.1

    # Initialize the parameters
    eta = deepcopy(init['eta'])
    psi = deepcopy(init['psi'])
    lambda_bin = deepcopy(init['lambda_bin'])
    lambda_ord = deepcopy(init['lambda_ord'])
    lambda_categ = deepcopy(init['lambda_categ'])

    H = deepcopy(init['H'])
    # Probability of path s' through the network for all s' in Omega
    w_s = deepcopy(init['w_s'])

    numobs = len(y)
    likelihood = []
    it_num = 0
    ratio = 1000
    # Seed numpy's global generator. This must be a call: assigning to
    # np.random.seed would replace the seeding function itself.
    np.random.seed(seed)

    # Dispatch variables between categories
    is_bin = np.logical_or(var_distrib == 'bernoulli', var_distrib == 'binomial')
    y_bin = y[:, is_bin]
    nj_bin = nj[is_bin].astype(int)
    nb_bin = len(nj_bin)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical'].astype(int)
    nb_categ = len(nj_categ)

    y_ord = y[:, var_distrib == 'ordinal']
    nj_ord = nj[var_distrib == 'ordinal'].astype(int)
    nb_ord = len(nj_ord)

    L = len(k)
    k_aug = k + [1]
    # S[l]: number of paths starting at layer l
    S = np.array([np.prod(k_aug[l:]) for l in range(L + 1)])
    M = M_growth(1, r, numobs)

    assert nb_ord + nb_bin + nb_categ > 0

    # Compute the Gower matrix
    cat_features = np.logical_or(var_distrib == 'categorical',
                                 var_distrib == 'bernoulli')
    dm = gower_matrix(y, cat_features=cat_features)

    while (it_num < it) & ((ratio > eps) | (patience <= max_patience)):
        print(it_num)

        # The clustering layer is the one used to perform the clustering
        # i.e. the layer l such that k[l] == n_clusters
        clustering_layer = np.argmax(np.array(k) == n_clusters)

        #####################################################################################
        ################################# S step ############################################
        #####################################################################################

        #=====================================================================
        # Draw from f(z^{l} | s, Theta) for all s in Omega
        #=====================================================================
        mu_s, sigma_s = compute_path_params(eta, H, psi)
        sigma_s = ensure_psd(sigma_s)
        z_s, zc_s = draw_z_s(mu_s, sigma_s, eta, M)

        #========================================================================
        # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1
        #========================================================================
        chsi = compute_chsi(H, psi, mu_s, sigma_s)
        chsi = ensure_psd(chsi)
        rho = compute_rho(eta, H, psi, mu_s, sigma_s, zc_s, chsi)

        # In the following z2 and z1 will denote z^{l+1} and z^{l} respectively
        z2_z1s = draw_z2_z1s(chsi, rho, M, r)

        #=======================================================================
        # Compute the p(y| z1) for all variable categories
        #=======================================================================
        py_zl1 = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord,
                        lambda_categ, y_categ, nj_categ, z_s[0])

        #========================================================================
        # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s
        #========================================================================
        zl1_ys = draw_zl1_ys(z_s, py_zl1, M)

        #####################################################################################
        ################################# E step ############################################
        #####################################################################################

        #=====================================================================
        # Compute conditional probabilities used in the appendix of asta paper
        #=====================================================================
        pzl1_ys, ps_y, p_y = E_step_GLLVM(z_s[0], mu_s[0], sigma_s[0], w_s, py_zl1)

        #=====================================================================
        # Compute p(z^{(l)}| s, y). Equation (5) of the paper
        #=====================================================================
        pz2_z1s = fz2_z1s(t(pzl1_ys, (1, 0, 2)), z2_z1s, chsi, rho, S)
        pz_ys = fz_ys(t(pzl1_ys, (1, 0, 2)), pz2_z1s)

        #=====================================================================
        # Compute MFA expectations
        #=====================================================================
        Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys = \
            E_step_DGMM(zl1_ys, H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, S)

        ###########################################################################
        ############################ M step #######################################
        ###########################################################################

        #=======================================================
        # Compute MFA Parameters
        #=======================================================
        w_s = np.mean(ps_y, axis=0)
        eta, H, psi = M_step_DGMM(Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys,
                                  ps_y, H, k)

        #=======================================================
        # Identifiability conditions
        #=======================================================
        # Update eta, H and Psi values
        H = diagonal_cond(H, psi)
        Ez, AT = compute_z_moments(w_s, eta, H, psi)
        eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

        del Ez

        #=======================================================
        # Compute GLLVM Parameters
        #=======================================================
        # We optimize each column separately as it is faster than all column jointly
        # (and more relevant with the independence hypothesis)
        lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y, pzl1_ys,
                                      z_s[0], AT[0], tol = tol, maxstep = maxstep)

        lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y, pzl1_ys,
                                      z_s[0], AT[0], tol = tol, maxstep = maxstep)

        lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y,
                                          pzl1_ys, z_s[0], AT[0],
                                          tol = tol, maxstep = maxstep)

        ###########################################################################
        ################## Clustering parameters updating #########################
        ###########################################################################
        new_lik = np.sum(np.log(p_y))
        likelihood.append(new_lik)
        ratio = (new_lik - prev_lik) / abs(prev_lik)
        print(likelihood)

        # Marginalise the path probabilities over every layer but the
        # clustering one (axis 0 holds the observations)
        idx_to_sum = tuple(set(range(1, L + 1)) - set([clustering_layer + 1]))
        psl_y = ps_y.reshape(numobs, *k, order='C').sum(idx_to_sum)

        temp_class = np.argmax(psl_y, axis=1)
        try:
            new_sil = silhouette_score(dm, temp_class, metric='precomputed')
        except ValueError:
            # Fallback score when the silhouette cannot be computed
            new_sil = -1

        print('Silhouette score:', new_sil)
        if best_sil < new_sil:
            # Keep the classes and the continuous representation of the
            # best partition seen so far
            z = (ps_y[..., n_axis] * Ez_ys[clustering_layer]).sum(1)
            best_sil = deepcopy(new_sil)
            classes = deepcopy(temp_class)

            plt.figure(figsize=(8, 8))
            plt.scatter(z[:, 0], z[:, 1])
            plt.show()

        # Keep track of the best likelihood reached so far
        if best_lik < new_lik:
            best_lik = deepcopy(new_lik)

        if prev_lik < new_lik:
            patience = 0
            M = M_growth(it_num + 2, r, numobs)
        else:
            patience += 1

        ###########################################################################
        ######################## Parameter selection  #############################
        ###########################################################################
        is_not_min_specif = not (np.all(np.array(k) == n_clusters)
                                 & np.array_equal(r, [2, 1]))

        if look_for_simpler_network(it_num) & perform_selec & is_not_min_specif:
            r_to_keep = r_select(y_bin, y_ord, y_categ, zl1_ys, z2_z1s, w_s)

            # If r_l == 0, delete the last l + 1: layers
            new_L = np.sum([len(rl) != 0 for rl in r_to_keep]) - 1

            k_to_keep = k_select(w_s, k, new_L, clustering_layer)

            is_L_unchanged = L == new_L
            is_r_unchanged = np.all(
                [len(r_to_keep[l]) == r[l] for l in range(new_L + 1)])
            is_k_unchanged = np.all(
                [len(k_to_keep[l]) == k[l] for l in range(new_L)])

            is_selection = not (is_r_unchanged & is_k_unchanged & is_L_unchanged)

            assert new_L > 0

            if is_selection:
                # Keep only the selected components / dimensions of the
                # DGMM parameters
                eta = [eta[l][k_to_keep[l]] for l in range(new_L)]
                eta = [eta[l][:, r_to_keep[l]] for l in range(new_L)]

                H = [H[l][k_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, r_to_keep[l]] for l in range(new_L)]
                H = [H[l][:, :, r_to_keep[l + 1]] for l in range(new_L)]

                psi = [psi[l][k_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, r_to_keep[l]] for l in range(new_L)]
                psi = [psi[l][:, :, r_to_keep[l]] for l in range(new_L)]

                if nb_bin > 0:
                    # Add the intercept:
                    bin_r_to_keep = np.concatenate(
                        [[0], np.array(r_to_keep[0]) + 1])
                    lambda_bin = lambda_bin[:, bin_r_to_keep]

                if nb_ord > 0:
                    # Intercept coefficients handling is a little more complicated here
                    lambda_ord_intercept = [lambda_ord_j[:-r[0]]
                                            for lambda_ord_j in lambda_ord]
                    Lambda_ord_var = np.stack(
                        [lambda_ord_j[-r[0]:] for lambda_ord_j in lambda_ord])
                    Lambda_ord_var = Lambda_ord_var[:, r_to_keep[0]]
                    lambda_ord = [np.concatenate([lambda_ord_intercept[j],
                                                  Lambda_ord_var[j]])
                                  for j in range(nb_ord)]

                if nb_categ > 0:
                    lambda_categ_intercept = [lambda_categ[j][:, 0]
                                              for j in range(nb_categ)]
                    Lambda_categ_var = [lambda_categ_j[:, -r[0]:]
                                        for lambda_categ_j in lambda_categ]
                    # Select the kept dimensions among the loadings only:
                    # indexing the full matrix would count the intercept
                    # column and shift the selection by one.
                    Lambda_categ_var = [Lambda_categ_var_j[:, r_to_keep[0]]
                                        for Lambda_categ_var_j in Lambda_categ_var]

                    lambda_categ = [np.hstack([lambda_categ_intercept[j][..., n_axis],
                                               Lambda_categ_var[j]])
                                    for j in range(nb_categ)]

                w = w_s.reshape(*k, order='C')
                new_k_idx_grid = np.ix_(*k_to_keep[:new_L])

                # If layer deletion, sum the last components of the paths
                if L > new_L:
                    deleted_dims = tuple(range(L)[new_L:])
                    w_s = w[new_k_idx_grid].sum(deleted_dims).flatten(order='C')
                else:
                    w_s = w[new_k_idx_grid].flatten(order='C')

                # Renormalise the remaining path probabilities
                w_s /= w_s.sum()

                k = [len(k_to_keep[l]) for l in range(new_L)]
                r = [len(r_to_keep[l]) for l in range(new_L + 1)]

                k_aug = k + [1]
                S = np.array([np.prod(k_aug[l:]) for l in range(new_L + 1)])
                L = new_L

                patience = 0

                best_r = deepcopy(r)
                best_k = deepcopy(k)

                # Identifiability conditions
                H = diagonal_cond(H, psi)
                Ez, AT = compute_z_moments(w_s, eta, H, psi)
                eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

                print('New architecture:')
                print('k', k)
                print('r', r)
                print('L', L)
                print('S', S)
                print("w_s", len(w_s))

        prev_lik = deepcopy(new_lik)
        it_num = it_num + 1

    out = dict(likelihood = likelihood, classes = classes, z = z, \
               best_r = best_r, best_k = best_k)
    return (out)
def E_step_DGMM(zl1_ys, H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, S):
    ''' Compute the expectations of the E step for all DGMM layers

    zl1_ys ((M1, numobs, r1, S1) nd-array): z^{(1)} | y, s
    H (list of nb_layers elements of shape (K_l x r_l-1, r_l)): Lambda
                                                parameters for each layer
    z_s (list of nd-arrays): zl | s^l for all s^l and all l.
    zc_s (list of nd-arrays): (zl | s^l) - eta{k_l}^{(l)} for all s^l and all l.
    z2_z1s (list of ndarrays): z^{(l + 1)}| z^{(l)}, s
    pz_ys (list of ndarrays): p(z^{l} | y, s)
    pz2_z1s (list of ndarrays): p(z^{(l)}| z^{(l-1)}, y)
    S (list of int): The number of paths starting at each layer
    ------------------------------------------------------------
    returns (tuple of lists of ndarrays): E(z^{(l)} | y, s),
            E(z^{(l)}z^{(l+1)T} | y, s), E(z^{(l+1)}z^{(l+1)T} | y, s),
            E(ee^T | y, s) with e = z^{(l)} - eta{k_l}^{(l)} - Lambda @ z^{(l + 1)}
    '''

    nb_layers = len(H)
    nb_comps = [H_l.shape[0] for H_l in H]

    # The first-layer expectation is the MC average of the draws z^{(1)} | y, s
    Ez_ys = [t(np.mean(zl1_ys, axis=0), (0, 2, 1))]
    E_z1z2T_ys = []
    E_z2z2T_ys = []
    EeeT_ys = []

    for l in range(nb_layers):
        # Number of times the layer-l quantities are tiled
        # (product of the component counts of the layers below l)
        nb_prev_paths = np.prod(nb_comps[:l])
        # Tiling factors bringing layer l and layer l + 1 path axes to S[0]
        rep_l = S[0] // S[l]
        rep_next = S[0] // S[l + 1]

        #=====================================================
        # Broadcast the quantities to the right shape
        #=====================================================
        z1 = t(z_s[l], (0, 2, 1))[..., n_axis]
        z1 = np.tile(z1, (1, nb_prev_paths, 1, 1))  # To recheck when L > 3
        z1_centered = np.tile(zc_s[l], (1, nb_prev_paths, 1))

        z2 = np.tile(t(z_s[l + 1], (0, 2, 1)), (1, rep_next, 1))[..., n_axis]

        # Weights p(z^l | y, s) with one and two trailing broadcast axes
        w1 = pz_ys[l][..., n_axis]
        w1_mat = w1[..., n_axis]

        Lambda = np.tile(H[l], (nb_prev_paths, 1, 1))
        Lambda = np.repeat(Lambda, S[l + 1], axis=0)[n_axis]

        #=====================================================
        # MC moments of z^{l + 1} | z^{l}, s (mean over the draws, axis 1)
        #=====================================================
        draws = z2_z1s[l]

        # E(z^{l + 1} | z^{l}, s)
        m1 = np.tile(draws.mean(1), (1, rep_l, 1))

        # E(z^{l + 1}z^{l + 1}^T | z^{l}, s)
        outer = draws[..., n_axis] @ draws[:, :, :, n_axis]
        m2 = np.tile(outer.mean(1), (1, rep_l, 1, 1))

        #=====================================================
        # Integrate over z^l with the weights p(z^l | y, s)
        #=====================================================
        # E(z^{l + 1} | y, s)
        Ez_ys.append((w1 * m1[n_axis]).sum(1))

        # E(z^{l}z^{l + 1}T | y, s)
        cross = z1 @ m1[:, :, n_axis]
        E_z1z2T_ys.append((w1_mat * cross[n_axis]).sum(1))

        # E(z^{l + 1}z^{l + 1}T | y, s)
        E_z2z2T_ys.append((w1_mat * m2[n_axis]).sum(1))

        #=====================================================
        # E[((z^l - eta^l) - Lambda z^{l + 1})((z^l - eta^l) - Lambda z^{l + 1})^T | y, s]
        #=====================================================
        joint = pz_ys[l][:, :, n_axis] * pz2_z1s[l + 1][n_axis]
        joint = joint[..., n_axis, n_axis]

        resid = z1_centered[:, n_axis] - t(Lambda @ z2, (3, 0, 1, 2))
        resid = resid[..., n_axis]
        resid_outer = resid @ np.swapaxes(resid, 3, 4)

        EeeT_ys.append((joint * resid_outer[n_axis]).sum((1, 2)))

    return Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys
def MDGMM(y, n_clusters, r, k, init, var_distrib, nj, it = 50, \
          eps = 1E-05, maxstep = 100, seed = None, perform_selec = True):
    ''' Fit a Generalized Linear Mixture of Latent Variables Model (GLMLVM)
    on mixed data: a discrete head and a continuous head are joined by a
    common tail, and fitted with an MCEM loop (MC draws, E step, M step)
    with optional architecture selection.

    y (numobs x p ndarray): The observations containing mixed variables
    n_clusters (int or str): The number of clusters to look for in the data
                             or the use mode of the MDGMM ('auto' or 'multi')
    r (dict): The dimension of latent variables through the first 2 layers
              (keys 'c', 'd' and 't')
    k (dict): The number of components of the latent Gaussian mixture layers
              (keys 'c', 'd' and 't')
    init (dict): The initialisation parameters for the algorithm
    var_distrib (p 1darray): An array containing the types of the variables in y
    nj (p 1darray): For binary/count data: The maximum values that the variable can take.
                    For ordinal data: the number of different existing categories for each variable
                    For categorical data: the number of different existing categories for each variable
    it (int): The maximum number of MCEM iterations of the algorithm
    eps (float): If the likelihood increase by less than eps then the algorithm stops
    maxstep (int): The maximum number of optimisation step for each variable
    seed (int): The random state seed to set (Only for numpy generated data for the moment)
    perform_selec (Bool): Whether to perform architecture selection or not
    ------------------------------------------------------------------------------------------------
    returns (dict): The predicted classes ('classes'), the likelihood through
                    the EM steps ('likelihood'), a continuous representation
                    of the data ('z') and the last selected architecture
                    ('best_r', 'best_k')
    raises ValueError: if y holds no discrete or no continuous variable, or
                       if n_clusters is neither numeric, 'auto' nor 'multi'
    '''

    # Break the reference link
    k = deepcopy(k)
    r = deepcopy(r)

    best_k = deepcopy(k)
    best_r = deepcopy(r)

    # Add other checks for the other variables
    check_inputs(k, r)

    prev_lik = - 1E15
    best_lik = -1E15
    tol = 0.01
    max_patience = 1
    patience = 0

    #====================================================
    # Initialize the parameters
    #====================================================
    eta_c, eta_d, H_c, H_d, psi_c, psi_d = dispatch_dgmm_init(init)
    lambda_bin, lambda_ord, lambda_categ = dispatch_gllvm_init(init)
    w_s_c, w_s_d = dispatch_paths_init(init)

    numobs = len(y)
    likelihood = []
    it_num = 0
    ratio = 1000
    # Seed numpy's global generator. This must be a call: assigning to
    # np.random.seed would replace the seeding function itself.
    np.random.seed(seed)

    #====================================================
    # Dispatch variables between categories
    #====================================================
    is_bin = np.logical_or(var_distrib == 'bernoulli', var_distrib == 'binomial')
    y_bin = y[:, is_bin]
    nj_bin = nj[is_bin]
    nj_bin = nj_bin.astype(int)
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal']
    nj_ord = nj[var_distrib == 'ordinal']
    nj_ord = nj_ord.astype(int)
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical'].astype(int)
    nb_categ = len(nj_categ)

    yc = y[:, var_distrib == 'continuous']
    nb_cont = yc.shape[1]

    # Validate the inputs before scaling, so that the explicit messages
    # below are the ones reported when a kind of variable is missing
    # NOTE(review): the scaler presumably rejects an array without columns
    # - confirm against the scikit-learn version in use
    if nb_bin + nb_ord + nb_categ == 0:
        # Create the InputError class and change this
        raise ValueError('Input does not contain discrete variables, '
                         'consider using a regular DGMM')
    if nb_cont == 0:
        # Create the InputError class and change this
        raise ValueError('Input does not contain continuous values, '
                         'consider using a DDGMM')

    ss = StandardScaler()
    yc = ss.fit_transform(yc)

    # *_1L stands for quantities going through all the network (head + tail)
    k_1L, L_1L, L, bar_L, S_1L = nb_comps_and_layers(k)
    r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']}

    # -1.1 is strictly below the fallback silhouette (-1), so the first
    # iteration always stores classes
    is_multi = n_clusters == 'multi'
    best_sil = [-1.1 for l in range(L['t'] - 1)] if is_multi else -1.1
    new_sil = [-1.1 for l in range(L['t'] - 1)] if is_multi else -1.1

    if is_multi:
        # Best classes found so far for each tail layer. Created once here:
        # re-creating the list at each iteration would discard the classes
        # retained during the previous iterations.
        classes = [[] for l in range(L['t'] - 1)]

    M = M_growth(1, r_1L, numobs)

    # Compute the Gower matrix
    cat_features = np.logical_or(var_distrib == 'categorical',
                                 var_distrib == 'bernoulli')
    dm = gower_matrix(y, cat_features = cat_features)

    while (it_num < it) & ((ratio > eps) | (patience <= max_patience)):
        print(it_num)

        # The clustering layer is the one used to perform the clustering
        # i.e. the layer l such that k[l] == n_clusters
        if not(isnumeric(n_clusters)):
            if n_clusters == 'auto':
                clustering_layer = 0
            elif n_clusters == 'multi':
                clustering_layer = list(range(L['t'] - 1))
            else:
                raise ValueError('Please enter an int, auto or multi for n_clusters')
        else:
            assert (np.array(k['t']) == n_clusters).any()
            clustering_layer = np.argmax(np.array(k['t']) == n_clusters)

        #####################################################################################
        ################################# MC step ############################################
        #####################################################################################

        #=====================================================================
        # Draw from f(z^{l} | s, Theta) for both heads and tail
        #=====================================================================
        mu_s_c, sigma_s_c = compute_path_params(eta_c, H_c, psi_c)
        sigma_s_c = ensure_psd(sigma_s_c)

        mu_s_d, sigma_s_d = compute_path_params(eta_d, H_d, psi_d)
        sigma_s_d = ensure_psd(sigma_s_d)

        z_s_c, zc_s_c, z_s_d, zc_s_d = draw_z_s_all_network(
            mu_s_c, sigma_s_c, mu_s_d, sigma_s_d, yc, eta_c, eta_d, S_1L, L, M)

        #========================================================================
        # Draw from f(z^{l+1} | z^{l}, s, Theta) for l >= 1
        #========================================================================
        # Create wrapper as before and after
        chsi_c = compute_chsi(H_c, psi_c, mu_s_c, sigma_s_c)
        chsi_c = ensure_psd(chsi_c)
        rho_c = compute_rho(eta_c, H_c, psi_c, mu_s_c, sigma_s_c, zc_s_c, chsi_c)

        chsi_d = compute_chsi(H_d, psi_d, mu_s_d, sigma_s_d)
        chsi_d = ensure_psd(chsi_d)
        rho_d = compute_rho(eta_d, H_d, psi_d, mu_s_d, sigma_s_d, zc_s_d, chsi_d)

        # In the following z2 and z1 will denote z^{l+1} and z^{l} respectively
        z2_z1s_c, z2_z1s_d = draw_z2_z1s_network(chsi_c, chsi_d, rho_c,
                                                 rho_d, M, r_1L, L)

        #=======================================================================
        # Compute the p(y^D| z1) for all discrete variables
        #=======================================================================
        py_zl1_d = fy_zl1(lambda_bin, y_bin, nj_bin, lambda_ord, y_ord, nj_ord,
                          lambda_categ, y_categ, nj_categ, z_s_d[0])

        #========================================================================
        # Draw from p(z1 | y, s) proportional to p(y | z1) * p(z1 | s) for all s
        #========================================================================
        zl1_ys_d = draw_zl1_ys(z_s_d, py_zl1_d, M['d'])

        #####################################################################################
        ################################# E step ############################################
        #####################################################################################

        #=====================================================================
        # Compute quantities necessary for E steps of both heads and tail
        #=====================================================================
        # Discrete head quantities
        pzl1_ys_d, ps_y_d, py_d = E_step_GLLVM(z_s_d[0], mu_s_d[0],
                                               sigma_s_d[0], w_s_d, py_zl1_d)
        py_s_d = ps_y_d * py_d / w_s_d[n_axis]

        # Continuous head quantities
        ps_y_c, py_s_c, py_c = continuous_lik(yc, mu_s_c[0], sigma_s_c[0], w_s_c)

        pz_s_d = fz_s(z_s_d, mu_s_d, sigma_s_d)
        # NOTE(review): pz_s_c is not read afterwards in this function;
        # the call is kept as is
        pz_s_c = fz_s(z_s_c, mu_s_c, sigma_s_c)

        #=====================================================================
        # Compute p(z^{(l)}| s, y). Equation (5) of the paper
        #=====================================================================
        # Compute pz2_z1s_d and pz2_z1s_d for the tail indices whereas it is useless
        pz2_z1s_d = fz2_z1s(t(pzl1_ys_d, (1, 0, 2)), z2_z1s_d, chsi_d,
                            rho_d, S_1L['d'])
        pz_ys_d = fz_ys(t(pzl1_ys_d, (1, 0, 2)), pz2_z1s_d)

        pz2_z1s_c = fz2_z1s([], z2_z1s_c, chsi_c, rho_c, S_1L['c'])
        pz_ys_c = fz_ys([], pz2_z1s_c)

        pz2_z1s_t = fz2_z1s([], z2_z1s_c[bar_L['c']:], chsi_c[bar_L['c']:],
                            rho_c[bar_L['c']:], S_1L['t'])

        # Junction layer computations
        # Compute p(zC |s)
        py_zs_d = fy_zs(pz_ys_d, py_s_d)
        py_zs_c = fy_zs(pz_ys_c, py_s_c)

        # Compute p(zt | yC, yD, sC, SD)
        pzt_yCyDs = fz_yCyDs(py_zs_c, pz_ys_d, py_s_c, M, S_1L, L)

        #=====================================================================
        # Compute MFA expectations
        #=====================================================================
        # Discrete head.
        Ez_ys_d, E_z1z2T_ys_d, E_z2z2T_ys_d, EeeT_ys_d = \
            E_step_DGMM_d(zl1_ys_d, H_d, z_s_d, zc_s_d, z2_z1s_d, pz_ys_d,
                          pz2_z1s_d, S_1L['d'], L['d'])

        # Continuous head
        Ez_ys_c, E_z1z2T_ys_c, E_z2z2T_ys_c, EeeT_ys_c = \
            E_step_DGMM_c(H_c, z_s_c, zc_s_c, z2_z1s_c, pz_ys_c,
                          pz2_z1s_c, S_1L['c'], L['c'])

        # Junction layers
        Ez_ys_t, E_z1z2T_ys_t, E_z2z2T_ys_t, EeeT_ys_t = \
            E_step_DGMM_t(H_c[bar_L['c']:],
                          z_s_c[bar_L['c']:], zc_s_c[bar_L['c']:],
                          z2_z1s_c[bar_L['c']:],
                          pzt_yCyDs, pz2_z1s_t, S_1L, L, k_1L)

        # Error here for the first two terms: p(y^h | z^t, s^C) != p(y^h | z^t, s^{1C:L})
        pst_yCyD = fst_yCyD(py_zs_c, py_zs_d, pz_s_d, w_s_c, w_s_d, k_1L, L)

        ###########################################################################
        ############################ M step #######################################
        ###########################################################################

        #=======================================================
        # Compute DGMM Parameters
        #=======================================================
        # Discrete head
        w_s_d = np.mean(ps_y_d, axis = 0)
        eta_d_barL, H_d_barL, psi_d_barL = M_step_DGMM(
            Ez_ys_d, E_z1z2T_ys_d, E_z2z2T_ys_d, EeeT_ys_d, ps_y_d, H_d,
            k_1L['d'][:-1], L_1L['d'], r_1L['d'])

        # Add dispatching function here
        eta_d[:bar_L['d']] = eta_d_barL
        H_d[:bar_L['d']] = H_d_barL
        psi_d[:bar_L['d']] = psi_d_barL

        # Continuous head
        w_s_c = np.mean(ps_y_c, axis = 0)
        eta_c_barL, H_c_barL, psi_c_barL = M_step_DGMM(
            Ez_ys_c, E_z1z2T_ys_c, E_z2z2T_ys_c, EeeT_ys_c, ps_y_c, H_c,
            k_1L['c'][:-1], L_1L['c'] + 1, r_1L['c'])

        eta_c[:bar_L['c']] = eta_c_barL
        H_c[:bar_L['c']] = H_c_barL
        psi_c[:bar_L['c']] = psi_c_barL

        # Common tail
        eta_t, H_t, psi_t, Ezst_y = M_step_DGMM_t(
            Ez_ys_t, E_z1z2T_ys_t, E_z2z2T_ys_t, EeeT_ys_t,
            ps_y_c, ps_y_d, pst_yCyD, H_c[bar_L['c']:], S_1L, k_1L,
            L_1L, L, r_1L['t'])

        # The tail parameters are shared by both heads
        eta_d[bar_L['d']:] = eta_t
        H_d[bar_L['d']:] = H_t
        psi_d[bar_L['d']:] = psi_t

        eta_c[bar_L['c']:] = eta_t
        H_c[bar_L['c']:] = H_t
        psi_c[bar_L['c']:] = psi_t

        #=======================================================
        # Identifiability conditions
        #=======================================================
        w_s_t = np.mean(pst_yCyD, axis = 0)
        eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = network_identifiability(
            eta_d, H_d, psi_d, eta_c, H_c, psi_c, w_s_c, w_s_d, w_s_t, bar_L)

        #=======================================================
        # Compute GLLVM Parameters
        #=======================================================
        # We optimize each column separately as it is faster than all column jointly
        # (and more relevant with the independence hypothesis)
        lambda_bin = bin_params_GLLVM(y_bin, nj_bin, lambda_bin, ps_y_d,
                                      pzl1_ys_d, z_s_d[0], AT_d[0],
                                      tol = tol, maxstep = maxstep)

        lambda_ord = ord_params_GLLVM(y_ord, nj_ord, lambda_ord, ps_y_d,
                                      pzl1_ys_d, z_s_d[0], AT_d[0],
                                      tol = tol, maxstep = maxstep)

        lambda_categ = categ_params_GLLVM(y_categ, nj_categ, lambda_categ, ps_y_d,
                                          pzl1_ys_d, z_s_d[0], AT_d[0],
                                          tol = tol, maxstep = maxstep)

        ###########################################################################
        ################## Clustering parameters updating #########################
        ###########################################################################
        new_lik = np.sum(np.log(py_d) + np.log(py_c))
        likelihood.append(new_lik)
        ratio = (new_lik - prev_lik)/abs(prev_lik)

        if n_clusters == 'multi':
            temp_classes = []
            z_tail = []

            for l in clustering_layer:
                # Marginalise the tail path probabilities over every tail
                # layer but the current one (axis 0 holds the observations)
                idx_to_sum = tuple(set(range(1, L['t'] + 1)) -
                                   set([clustering_layer[l] + 1]))
                psl_y = pst_yCyD.reshape(numobs, *k['t'],
                                         order = 'C').sum(idx_to_sum)

                temp_class_l = np.argmax(psl_y, axis = 1)
                try:
                    sil_l = silhouette_score(dm, temp_class_l,
                                             metric = 'precomputed')
                except ValueError:
                    # Same fallback score as in the single-layer case
                    sil_l = -1

                temp_classes.append(temp_class_l)
                new_sil[l] = sil_l

            for l in range(L['t'] - 1):
                zl = Ezst_y[l].sum(1)
                z_tail.append(zl)

                if best_sil[l] < new_sil[l]:
                    # Update the quantity if the silhouette score is better
                    best_sil[l] = deepcopy(new_sil[l])
                    classes[l] = deepcopy(temp_classes[l])

                    if zl.shape[-1] == 3:
                        plot_3d(zl, classes[l])
                    elif zl.shape[-1] == 2:
                        plot_2d(zl, classes[l])

        else:
            idx_to_sum = tuple(set(range(1, L['t'] + 1)) -
                               set([clustering_layer + 1]))
            psl_y = pst_yCyD.reshape(numobs, *k['t'], order = 'C').sum(idx_to_sum)

            temp_classes = np.argmax(psl_y, axis = 1)
            try:
                new_sil = silhouette_score(dm, temp_classes, metric = 'precomputed')
            except ValueError:
                # Fallback score when the silhouette cannot be computed
                new_sil = -1

            z_tail = [Ezst_y[l].sum(1) for l in range(L['t'] - 1)]

            if best_sil < new_sil:
                # Update the quantity if the silhouette score is better
                zl = z_tail[clustering_layer]
                best_sil = deepcopy(new_sil)
                classes = deepcopy(temp_classes)

                if zl.shape[-1] == 3:
                    plot_3d(zl, classes)
                elif zl.shape[-1] == 2:
                    plot_2d(zl, classes)

        # Keep track of the best likelihood reached so far
        if best_lik < new_lik:
            best_lik = deepcopy(new_lik)

        if prev_lik < new_lik:
            patience = 0
            M = M_growth(it_num + 1, r_1L, numobs)
        else:
            patience += 1

        ###########################################################################
        ######################## Parameter selection  #############################
        ###########################################################################
        min_nb_clusters = 2
        is_not_min_specif = not(is_min_architecture_reached(k, r, min_nb_clusters))

        if look_for_simpler_network(it_num) & perform_selec & is_not_min_specif:
            # To add: selection according to categ
            r_to_keep = r_select(y_bin, y_ord, y_categ, yc, zl1_ys_d,
                                 z2_z1s_d[:bar_L['d']], w_s_d,
                                 z2_z1s_c[:bar_L['c']], z2_z1s_c[bar_L['c']:],
                                 n_clusters)

            # Check layer deletion
            is_c_layer_deletion = np.any([len(rl) == 0 for rl in r_to_keep['c']])
            is_d_layer_deletion = np.any([len(rl) == 0 for rl in r_to_keep['d']])

            is_head_layer_deletion = np.any([is_c_layer_deletion,
                                             is_d_layer_deletion])

            if is_head_layer_deletion:
                # Restart the algorithm
                if is_c_layer_deletion:
                    r['c'] = [len(rl) for rl in r_to_keep['c'][:-1]]
                    k['c'] = k['c'][:-1]

                if is_d_layer_deletion:
                    r['d'] = [len(rl) for rl in r_to_keep['d'][:-1]]
                    k['d'] = k['d'][:-1]

                # NOTE(review): the re-initialisation is called with
                # seed = None rather than with `seed` - confirm it is intended
                init = dim_reduce_init(pd.DataFrame(y), n_clusters, k, r, nj,
                                       var_distrib, seed = None)
                eta_c, eta_d, H_c, H_d, psi_c, psi_d = dispatch_dgmm_init(init)
                lambda_bin, lambda_ord, lambda_categ = dispatch_gllvm_init(init)
                w_s_c, w_s_d = dispatch_paths_init(init)

                # *_1L stands for quantities going through all the network (head + tail)
                k_1L, L_1L, L, bar_L, S_1L = nb_comps_and_layers(k)
                r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']}

                # End-of-iteration bookkeeping, repeated here because of
                # the `continue`
                M = M_growth(it_num + 1, r_1L, numobs)

                prev_lik = deepcopy(new_lik)
                it_num = it_num + 1
                print(likelihood)
                print('Restarting the algorithm')
                continue

            # Number of tail layers keeping at least one dimension
            new_Lt = np.sum([len(rl) != 0 for rl in r_to_keep['t']])

            k_to_keep = k_select(w_s_c, w_s_d, w_s_t, k, new_Lt,
                                 clustering_layer, n_clusters)

            is_selection = check_if_selection(r_to_keep, r, k_to_keep, k,
                                              L, new_Lt)

            assert new_Lt > 0 # > 1 ?
            if n_clusters == 'multi':
                assert new_Lt == L['t']

            if is_selection:
                # Part to change when update also number of layers on each head
                nb_deleted_layers_tail = L['t'] - new_Lt
                L['t'] = new_Lt
                L_1L = {keys: values - nb_deleted_layers_tail
                        for keys, values in L_1L.items()}

                eta_c, eta_d, H_c, H_d, psi_c, psi_d = dgmm_coeff_selection(
                    eta_c, H_c, psi_c, eta_d, H_d, psi_d, L, r_to_keep, k_to_keep)

                lambda_bin, lambda_ord, lambda_categ = gllvm_coeff_selection(
                    lambda_bin, lambda_ord, lambda_categ, r, r_to_keep)

                w_s_c, w_s_d = path_proba_selection(w_s_c, w_s_d, k,
                                                    k_to_keep, new_Lt)

                k = {h: [len(k_to_keep[h][l]) for l in range(L[h])]
                     for h in ['d', 't']}
                k['c'] = [len(k_to_keep['c'][l]) for l in range(L['c'] + 1)]

                r = {h: [len(r_to_keep[h][l]) for l in range(L[h])]
                     for h in ['d', 't']}
                r['c'] = [len(r_to_keep['c'][l]) for l in range(L['c'] + 1)]

                k_1L, _, L, bar_L, S_1L = nb_comps_and_layers(k)
                r_1L = {'c': r['c'] + r['t'], 'd': r['d'] + r['t'], 't': r['t']}

                patience = 0
                best_r = deepcopy(r)
                best_k = deepcopy(k)

                #=======================================================
                # Identifiability conditions
                #=======================================================
                eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = \
                    network_identifiability(eta_d, H_d, psi_d, eta_c, H_c,
                                            psi_c, w_s_c, w_s_d, w_s_t, bar_L)

                print('New architecture:')
                print('k', k)
                print('r', r)
                print('L', L)
                print('S_1L', S_1L)
                print("w_s_c", len(w_s_c))
                print("w_s_d", len(w_s_d))

        M = M_growth(it_num + 1, r_1L, numobs)

        prev_lik = deepcopy(new_lik)
        print(likelihood)
        print('Silhouette score:', new_sil)
        it_num = it_num + 1

    out = dict(likelihood = likelihood, classes = classes, \
               best_r = best_r, best_k = best_k)

    if n_clusters == 'multi':
        out['z'] = z_tail
    else:
        out['z'] = z_tail[clustering_layer]

    return(out)
def E_step_DGMM_c(H, z_s, zc_s, z2_z1s, pz_ys, pz2_z1s, Sc, Lc):
    '''
    Monte Carlo E step moments for every DGMM layer of the continuous head.

    For each layer l in 0, ..., Lc the draws of layers l and l + 1 are
    broadcast over the Sc[0] paths starting at the first layer, then averaged
    (and, for l > 0, weighted by the posterior draw probabilities) to obtain
    the conditional moments used by the M step.

    H (list of ndarrays): Lambda parameters for each layer. H[l].shape[0] is
        taken as the number of components k_l of layer l.
    z_s (list of ndarrays): z^{(l)} | s^l for all s^l and all l. Needs
        Lc + 2 entries; each entry has its last two axes swapped before use.
    zc_s (list of ndarrays): z^{(l)} | s^{(l)} - eta_{k_l}^{(l)} (centered
        draws) for all s^l and all l.
    z2_z1s (list of ndarrays): z^{(l + 1)} | z^{(l)}, s. Axis 1 is averaged
        out (presumably the Monte Carlo axis of layer l + 1 - to confirm
        against the function that draws it).
    pz_ys (list of ndarrays): p(z^{l} | y, s). Entries l - 1 and l are read
        on every pass, so at least Lc + 1 entries are required.
    pz2_z1s (list of ndarrays): p(z^{(l)} | z^{(l-1)}, y). Only read for
        l >= 1.
    Sc (list of int): The number of paths starting at each layer of the
        continuous head (Lc + 2 entries are read).
    Lc (int): The number of layers on the continuous head (used as
        range(Lc + 1)).
    ------------------------------------------------------------
    returns (tuple of lists of ndarrays):
        E(z^{(l)} | y, s) (Lc + 2 entries, the first being z_s[0] with its
        last two axes swapped),
        E(z^{(l)}z^{(l+1)T} | y, s),
        E(z^{(l+1)}z^{(l+1)T} | y, s),
        E(ee^T | y, s) with e = z^{(l)} - eta_{k_l}^{(l)} - Lambda @ z^{(l + 1)}
        (Lc + 1 entries each)
    '''

    n_comps = [H[layer].shape[0] for layer in range(Lc + 1)]
    swap_last = (0, 2, 1)

    # The first layer of the continuous head is the data: E(y | y, s) = y
    Ez_ys = [t(z_s[0], swap_last)]
    E_z1z2T_ys, E_z2z2T_ys, EeeT_ys = [], [], []

    for l in range(Lc + 1):
        is_first = (l == 0)

        #=========================================================
        # Broadcast every quantity over the Sc[0] paths
        #=========================================================

        z1 = t(z_s[l], swap_last)
        # Number of copies needed to go from Sc[l] paths to Sc[0] paths
        fan_out = Sc[0] // Sc[l]
        z1 = np.expand_dims(np.tile(z1, (1, fan_out, 1)), -1)

        # Product of the component counts of the layers above l
        # (np.prod of the empty slice is 1 for l = 0: nothing is duplicated)
        paths_above = np.prod(n_comps[:l])
        z1c = np.tile(zc_s[l], (1, paths_above, 1))

        z2 = t(z_s[l + 1], swap_last)
        z2 = np.expand_dims(np.tile(z2, (1, np.prod(n_comps[:l + 1]), 1)), -1)

        # Both neighbouring posteriors are looked up on every pass. For l = 0
        # the index l - 1 wraps around to the last entry of pz_ys, but that
        # value is not consumed by the first-layer branch below.
        w_prev = pz_ys[l - 1][..., n_axis]
        w_curr = pz_ys[l][..., n_axis]

        Lambda = np.tile(H[l], (paths_above, 1, 1))
        Lambda = np.repeat(Lambda, Sc[l + 1], axis=0)[n_axis]

        #=========================================================
        # MC estimates of E(z^{l + 1} | z^{l}, s) and
        # E(z^{l + 1}z^{l + 1}^T | z^{l}, s): average over axis 1
        #=========================================================

        draws = z2_z1s[l]
        mean_z2 = np.tile(draws.mean(1), (1, fan_out, 1))

        outer_z2 = (draws[..., n_axis] @ np.expand_dims(draws, 3)).mean(1)
        outer_z2 = np.tile(outer_z2, (1, fan_out, 1, 1))

        #=========================================================
        # Residuals e = (z^l - eta^l) - Lambda z^{l + 1} and ee^T
        #=========================================================

        resid = np.expand_dims(z1c, 1) - t(Lambda @ z2, (3, 0, 1, 2))
        resid = resid[..., n_axis]
        resid_outer = resid @ t(resid, (0, 1, 2, 4, 3))

        #=========================================================
        # Moments conditional on y and s
        #=========================================================

        if is_first:
            # z^{0} is observed: the moments given z^{0} are already the
            # moments given y
            Ez_next = mean_z2
            Ez2z2T = outer_z2

            # E(y z^{1}^T | y, s) = y @ E(z^{1} | y, s)
            # NOTE(review): the original author asked whether dropping the
            # sum here is correct - still to be confirmed.
            Ez1z2T = z1 @ np.expand_dims(Ez_next, axis=2)

            EeeT = (w_curr[..., n_axis] * resid_outer).sum(1)
        else:
            # Weight by p(z^{l} | y, s) and sum over axis 1
            Ez_next = (w_prev * mean_z2[n_axis]).sum(1)
            Ez2z2T = (w_prev[..., n_axis] * outer_z2[n_axis]).sum(1)

            # NOTE(review): flagged "to check" by the original author.
            cross = z1[n_axis] @ \
                np.expand_dims(np.expand_dims(Ez_next, axis=1), 3)
            Ez1z2T = (w_prev[..., n_axis] * cross).sum(1)

            # Joint weight p(z^{l} | y, s) * p(z^{l + 1} | z^{l}, y),
            # summed over axes 1 and 2
            joint = np.expand_dims(pz_ys[l - 1], 2) * pz2_z1s[l][n_axis]
            joint = joint[..., n_axis, n_axis]
            EeeT = (joint * resid_outer[n_axis]).sum((1, 2))

        Ez_ys.append(Ez_next)
        E_z2z2T_ys.append(Ez2z2T)
        E_z1z2T_ys.append(Ez1z2T)
        EeeT_ys.append(EeeT)

    return Ez_ys, E_z1z2T_ys, E_z2z2T_ys, EeeT_ys