def test_ucla_p_values(self, X_ucla, y_ucla):
    ol = OrderedLogit()
    ol.fit(X_ucla, y_ucla)
    expected_p_values_ = np.array(
        [8.087e-05, 8.435e-01, 1.812e-02, 4.696e-03, 9.027e-08])
    assert_allclose(ol.p_values_, expected_p_values_, rtol=0.01)

def test_ucla_z_values(self, X_ucla, y_ucla):
    ol = OrderedLogit()
    ol.fit(X_ucla, y_ucla)
    expected_z_values_ = np.array(
        [3.9418, -0.1974, 2.3632, 2.8272, 5.3453])
    assert_allclose(ol._compute_z_values(), expected_z_values_, rtol=0.01)

def test_ucla_coef(self, X_ucla, y_ucla):
    ol = OrderedLogit()
    ol.fit(X_ucla, y_ucla)
    expected_coef_ = np.array(
        [1.04769, -0.05879, 0.61594, 2.20391, 4.29936])
    assert_allclose(ol.coef_, expected_coef_, rtol=0.01)
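# Illustrative sketch (not part of the test suite): minimal usage of the
# OrderedLogit API exercised by the UCLA tests above, on synthetic data.
# The real fixtures X_ucla / y_ucla are supplied by pytest; the attribute
# layout assumed here (coef_ = slopes followed by cutpoints, se_, p_values_)
# is inferred from the expected values in the tests.
def _example_ordered_logit_usage():
    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 3))                     # three predictors
    latent = X @ np.array([1.0, -0.5, 0.7]) + rng.logistic(size=200)
    y = np.digitize(latent, bins=[-1.0, 1.0]) + 1     # ordered levels 1..3

    ol = OrderedLogit()
    ol.fit(X, y)
    print(ol.coef_)      # slope estimates followed by cutpoint estimates
    print(ol.se_)        # standard errors for each entry of coef_
    print(ol.p_values_)  # Wald p-values derived from coef_ and se_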
def dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, use_famd=False, seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space
    and determine the init coefficients in that space

    y (numobs x p ndarray): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    k (1d array): The number of components of the latent Gaussian mixture layers
    r (int): The dimension of latent variables
    nj (p 1darray): For binary/count data: the maximum value the variable can take.
                    For ordinal data: the number of existing categories for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y
    use_famd (bool): Whether to use the FAMD method (True) to initiate the first
                     continuous latent variable. Otherwise MCA is used.
    seed (None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------
    returns (dict): All initialisation parameters
    '''

    L = len(k)
    numobs = len(y)
    S = np.prod(k)

    #==============================================================
    # Dimension reduction performed with MCA
    #==============================================================

    if not isinstance(y, pd.DataFrame):
        raise TypeError('y should be a DataFrame for prince')

    if (np.array(var_distrib) == 'ordinal').all():
        print('PCA init')
        pca = prince.PCA(n_components=r[0], n_iter=3, rescale_with_mean=True,
                         rescale_with_std=True, copy=True, check_input=True,
                         engine='auto', random_state=seed)
        z1 = pca.fit_transform(y).values
    elif use_famd:
        famd = prince.FAMD(n_components=r[0], n_iter=3, copy=True,
                           check_input=False, engine='auto', random_state=seed)
        z1 = famd.fit_transform(y).values
    else:
        mca = prince.MCA(n_components=r[0], n_iter=3, copy=True,
                         check_input=False, engine='auto', random_state=seed)
        z1 = mca.fit_transform(y).values

    z = [z1]
    y = y.values

    #==============================================================
    # Set the shape parameters of each data type
    #==============================================================

    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',
                               var_distrib == 'binomial')].astype(int)
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',
                              var_distrib == 'binomial')]
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal'].astype(float).astype(int)
    nj_ord = nj[var_distrib == 'ordinal']
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical']
    nb_categ = len(nj_categ)

    # Rescale the continuous variables to unit standard deviation
    y_cont = y[:, var_distrib == 'continuous']
    y_cont = y_cont / np.std(y_cont.astype(float), axis=0, keepdims=True)
    nb_cont = y_cont.shape[1]

    #=======================================================
    # Determining the Gaussian Parameters
    #=======================================================

    init = {}

    eta = []
    H = []
    psi = []
    paths_pred = np.zeros((numobs, L))

    for l in range(L):
        params = get_MFA_params(z[l], k[l], r[l:])
        eta.append(params['eta'][..., n_axis])
        H.append(params['H'])
        psi.append(params['psi'])
        z.append(params['z_nextl'])
        paths_pred[:, l] = params['classes']

    paths, nb_paths = np.unique(paths_pred, return_counts=True, axis=0)
    paths, nb_paths = add_missing_paths(k, paths, nb_paths)

    w_s = nb_paths / numobs
    w_s = np.where(w_s == 0, 1E-16, w_s)

    # Check that all paths have been explored
    if len(paths) != S:
        raise RuntimeError(f'Real path length is {S} while the initial '
                           f'number of paths was only {len(paths)}')

    w_s = w_s.reshape(*k).flatten('C')

    #=============================================================
    # Enforcing identifiability constraints over the first layer
    #=============================================================

    H = diagonal_cond(H, psi)
    Ez, AT = compute_z_moments(w_s, eta, H, psi)
    eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

    init['eta'] = eta
    init['H'] = H
    init['psi'] = psi
    init['w_s'] = w_s  # Probabilities of each path through the network
    init['z'] = z

    # The clustering layer is the one used to perform the clustering,
    # i.e. the layer l such that k[l] == n_clusters
    clustering_layer = np.argmax(np.array(k) == n_clusters)
    init['classes'] = paths_pred[:, clustering_layer]

    #=======================================================
    # Determining the coefficients of the GLLVM layer
    #=======================================================

    # Determining lambda_bin coefficients
    lambda_bin = np.zeros((nb_bin, r[0] + 1))

    for j in range(nb_bin):
        Nj = np.max(y_bin[:, j])  # The support of the jth binomial is [1, Nj]

        if Nj == 1:  # The variable is Bernoulli rather than binomial
            yj = y_bin[:, j]
            z_new = z[0]
        else:  # Convert the binomial output into a Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], z[0])

        lr = LogisticRegression()

        if j < r[0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate([lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])

    ## Identifiability of the binary coefficients
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT[0][0]

    # Determining lambda_ord coefficients
    lambda_ord = []

    for j in range(nb_ord):
        Nj = len(np.unique(y_ord[:, j], axis=0))  # The support of the jth ordinal is [1, Nj]
        yj = y_ord[:, j]

        ol = OrderedLogit()
        ol.fit(z[0], yj)

        ## Identifiability of the ordinal coefficients
        beta_j = (ol.beta_.reshape(1, r[0]) @ AT[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining the coefficients of the continuous variables
    lambda_cont = np.zeros((nb_cont, r[0] + 1))

    for j in range(nb_cont):
        yj = y_cont[:, j]
        linr = LinearRegression()

        if j < r[0] - 1:
            linr.fit(z[0][:, :j + 1], yj)
            lambda_cont[j, :j + 2] = np.concatenate([[linr.intercept_], linr.coef_])
        else:
            linr.fit(z[0], yj)
            lambda_cont[j] = np.concatenate([[linr.intercept_], linr.coef_])

    ## Identifiability of the continuous coefficients
    lambda_cont[:, 1:] = lambda_cont[:, 1:] @ AT[0][0]

    # Determining lambda_categ coefficients
    lambda_categ = []

    for j in range(nb_categ):
        yj = y_categ[:, j]

        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(z[0], yj)

        ## Identifiability of the categorical coefficients
        beta_j = lr.coef_ @ AT[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_cont'] = lambda_cont
    init['lambda_categ'] = lambda_categ

    return init
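# Hedged sketch of the bin_to_bern conversion used above. The real helper is
# imported from the package utilities; this stand-in only illustrates the
# assumed contract: a binomial count yj in [0, Nj] becomes Nj Bernoulli
# pseudo-observations, and the latent scores z are repeated accordingly.
def _bin_to_bern_sketch(Nj, yj, z):
    y_bern = np.concatenate([np.concatenate([np.ones(c), np.zeros(Nj - c)])
                             for c in yj]).astype(int)  # Nj 0/1 rows per obs
    z_rep = np.repeat(z, Nj, axis=0)                    # keep shapes aligned
    return y_bern, z_rep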
def rl1_selection(y_bin, y_ord, y_categ, y_cont, zl1_ys, w_s):
    ''' Select the number of factors on the first latent discrete layer

    y_bin (n x p_bin ndarray): The binary and count data matrix
    y_ord (n x p_ord ndarray): The ordinal data matrix
    y_categ (n x p_categ ndarray): The categorical data matrix
    y_cont (n x p_cont ndarray): The continuous data matrix
    zl1_ys (k_1D x r_1D ndarray): The first layer latent variables
    w_s (list): The path probabilities starting from the first layer
    ------------------------------------------------------------------
    return (list of int): The dimensions to keep for the GLLVM layer
    '''

    M0 = zl1_ys.shape[0]
    numobs = zl1_ys.shape[1]
    r0 = zl1_ys.shape[2]
    S0 = zl1_ys.shape[3]

    nb_bin = y_bin.shape[1]
    nb_ord = y_ord.shape[1]
    nb_categ = y_categ.shape[1]
    nb_cont = y_cont.shape[1]

    PROP_ZERO_THRESHOLD = 0.25
    PVALUE_THRESHOLD = 0.10

    # Determine the dimensions that are weakest for binomial variables
    zero_coef_mask = np.zeros(r0)
    for j in range(nb_bin):
        for s in range(S0):
            Nj = int(np.max(y_bin[:, j]))  # The support of the jth binomial is [1, Nj]

            if Nj == 1:  # The variable is Bernoulli rather than binomial
                yj = y_bin[:, j]
                z = zl1_ys[:, :, :, s]
            else:  # Convert the binomial output into a Bernoulli output
                yj, z = bin_to_bern(Nj, y_bin[:, j], zl1_ys[:, :, :, s])

            # Put all the M0 points in a series
            X = z.flatten(order='C').reshape((M0 * numobs * Nj, r0), order='C')
            y_repeat = np.repeat(yj, M0).astype(int)  # Repeat rather than tile (to check)

            lr = LogisticRegression(penalty='l1', solver='saga')
            lr.fit(X, y_repeat)
            zero_coef_mask += (lr.coef_[0] == 0) * w_s[s]

    # Determine the dimensions that are weakest for ordinal variables
    for j in range(nb_ord):
        for s in range(S0):
            ol = OrderedLogit()
            X = zl1_ys[:, :, :, s].flatten(order='C').reshape((M0 * numobs, r0),
                                                              order='C')
            y_repeat = np.repeat(y_ord[:, j], M0).astype(int)  # Repeat rather than tile (to check)

            ol.fit(X, y_repeat)
            zero_coef_mask += np.array(ol.summary['p'] > PVALUE_THRESHOLD) * w_s[s]

    # Determine the dimensions that are weakest for categorical variables
    for j in range(nb_categ):
        for s in range(S0):
            z = zl1_ys[:, :, :, s]

            # Put all the M0 points in a series
            X = z.flatten(order='C').reshape((M0 * numobs, r0), order='C')
            y_repeat = np.repeat(y_categ[:, j], M0).astype(int)  # Repeat rather than tile (to check)

            lr = LogisticRegression(penalty='l1', solver='saga',
                                    multi_class='multinomial')
            lr.fit(X, y_repeat)
            zero_coef_mask += (lr.coef_[0] == 0) * w_s[s]

    # Determine the dimensions that are weakest for continuous variables
    for j in range(nb_cont):
        for s in range(S0):
            z = zl1_ys[:, :, :, s]

            # Put all the M0 points in a series
            X = z.flatten(order='C').reshape((M0 * numobs, r0), order='C')
            y_repeat = np.repeat(y_cont[:, j], M0)  # Repeat rather than tile (to check)

            linr = Lasso()
            linr.fit(X, y_repeat)
            # Lasso stores a 1D coef_ vector: compare it element-wise to zero
            # (indexing coef_[0] would compare a single scalar coefficient)
            zero_coef_mask += (linr.coef_ == 0) * w_s[s]

    # Voting: delete the dimensions that have been zeroed a majority of times
    zeroed_coeff_prop = zero_coef_mask / (nb_ord + nb_bin + nb_categ + nb_cont)

    # At least r1 = 2 dimensions are needed for the algorithm to work
    new_rl = np.sum(zeroed_coeff_prop <= PROP_ZERO_THRESHOLD)

    if new_rl < 2:
        dims_to_keep = np.argsort(zeroed_coeff_prop)[:2]
    else:
        dims_to_keep = list(set(range(r0)) -
                            set(np.where(zeroed_coeff_prop > PROP_ZERO_THRESHOLD)[0].tolist()))

    dims_to_keep = np.sort(dims_to_keep)

    return dims_to_keep
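# Toy illustration of the voting rule that ends rl1_selection: a dimension is
# dropped when its weighted zeroing proportion exceeds PROP_ZERO_THRESHOLD,
# but at least two dimensions are always kept (r >= 2 is required downstream).
# The proportions below are made up for the example.
def _example_rl1_voting():
    zeroed_coeff_prop = np.array([0.05, 0.60, 0.10, 0.40])
    keep = np.where(zeroed_coeff_prop <= 0.25)[0]
    if len(keep) < 2:
        keep = np.argsort(zeroed_coeff_prop)[:2]
    print(np.sort(keep))  # -> [0 2]: dimensions 1 and 3 are discarded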
def dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space
    and determine the init coefficients in that space

    y (numobs x p DataFrame): The data
    n_clusters (int or str): The number of clusters to look for in the data,
                             or one of 'auto' / 'multi'
    k (dict of lists): The number of components of each layer of the network
    r (dict of lists): The dimensions of the components of each layer of the network
    nj (p 1darray): For binary/count data: the maximum value the variable can take.
                    For ordinal and categorical data: the number of existing
                    categories for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y
    seed (None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------
    returns (dict): All initialisation parameters
    '''

    if not isinstance(y, pd.DataFrame):
        raise TypeError('y should be a DataFrame for prince')

    numobs = len(y)

    # Lengths of both heads and of the tail. L, bar_L and S might not be
    # homogeneous with the MDGMM notations
    bar_L = {'c': len(k['c']), 'd': len(k['d'])}
    L = {'c': len(k['c']), 'd': len(k['d']), 't': len(k['t']) - 1}

    # Paths of both heads and of the tail
    S = {'c': np.prod(k['c']), 'd': np.prod(k['d']), 't': np.prod(k['t'])}

    # Data of both heads
    yc = y.iloc[:, var_distrib == 'continuous'].values
    yd = y.iloc[:, var_distrib != 'continuous'].values

    #==============================================================
    # Dimension reduction performed with MCA on discrete data
    #==============================================================

    mca = prince.MCA(n_components=r['d'][0], n_iter=3, copy=True,
                     check_input=False, engine='auto', random_state=seed)
    z1D = mca.fit_transform(yd.astype(str)).values

    y = y.values

    # Be careful: the first z^c is the continuous data, whereas the first
    # z^d is the MCA-transformed data.

    #==============================================================
    # Set the shape parameters of each discrete data type
    #==============================================================

    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',
                               var_distrib == 'binomial')]
    y_bin = y_bin.astype(int)
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',
                              var_distrib == 'binomial')]
    nb_bin = len(nj_bin)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical']
    nb_categ = len(nj_categ)

    y_ord = y[:, var_distrib == 'ordinal']
    y_ord = y_ord.astype(int)
    nj_ord = nj[var_distrib == 'ordinal']
    nb_ord = len(nj_ord)

    ss = StandardScaler()
    yc = ss.fit_transform(yc)

    #=======================================================
    # Determining the Gaussian Parameters
    #=======================================================

    init = {}

    # Initialise both head quantities
    eta_d, H_d, psi_d, zd, paths_pred_d = init_head(z1D, k['d'], r['d'],
                                                    numobs, L['d'])
    eta_c, H_c, psi_c, zc, paths_pred_c = init_head(yc, k['c'], r['c'],
                                                    numobs, L['c'])

    # Initialisation of the common layers. The coefficients are those between
    # the last layer of both heads and the first junction layer
    eta_h_last, H_h_last, psi_h_last, paths_pred_h_last, zt_first = \
        init_junction_layer(r, k, zc, zd)

    eta_d.append(eta_h_last['d'])
    H_d.append(H_h_last['d'])
    psi_d.append(psi_h_last['d'])

    eta_c.append(eta_h_last['c'])
    H_c.append(H_h_last['c'])
    psi_c.append(psi_h_last['c'])

    paths_pred_d.append(paths_pred_h_last['d'])
    paths_pred_c.append(paths_pred_h_last['c'])

    zt = [zt_first]

    # Initialisation of the following common layers
    for l in range(L['t']):
        params = get_MFA_params(zt[l], k['t'][l], r['t'][l:])

        eta_c.append(params['eta'][..., n_axis])
        eta_d.append(params['eta'][..., n_axis])

        H_c.append(params['H'])
        H_d.append(params['H'])

        psi_c.append(params['psi'])
        psi_d.append(params['psi'])

        zt.append(params['z_nextl'])
        zc.append(params['z_nextl'])
        zd.append(params['z_nextl'])

        paths_pred_c.append(params['classes'])
        paths_pred_d.append(params['classes'])

    paths_pred_c = np.stack(paths_pred_c).T
    paths_c, nb_paths_c = np.unique(paths_pred_c, return_counts=True, axis=0)
    paths_c, nb_paths_c = add_missing_paths(k['c'] + k['t'][:-1], paths_c,
                                            nb_paths_c)

    paths_pred_d = np.stack(paths_pred_d).T
    paths_d, nb_paths_d = np.unique(paths_pred_d, return_counts=True, axis=0)
    paths_d, nb_paths_d = add_missing_paths(k['d'] + k['t'][:-1], paths_d,
                                            nb_paths_d)

    w_s_c = nb_paths_c / numobs
    w_s_c = np.where(w_s_c == 0, 1E-16, w_s_c)

    w_s_d = nb_paths_d / numobs
    w_s_d = np.where(w_s_d == 0, 1E-16, w_s_d)

    k_dt = np.concatenate([k['d'] + k['t']])
    w_s_t = w_s_d.reshape(*k_dt, order='C').sum(tuple(range(L['d'])))
    w_s_t = w_s_t.reshape(-1, order='C')

    # Check that all paths have been explored
    if (len(paths_c) != S['c'] * S['t']) | (len(paths_d) != S['d'] * S['t']):
        raise RuntimeError('Path initialisation failed')

    #=============================================================
    # Enforcing identifiability constraints over the first layer
    #=============================================================

    eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = network_identifiability(
        eta_d, H_d, psi_d, eta_c, H_c, psi_c, w_s_c, w_s_d, w_s_t, bar_L)

    init['c'] = {}
    init['c']['eta'] = eta_c
    init['c']['H'] = H_c
    init['c']['psi'] = psi_c
    init['c']['w_s'] = w_s_c  # Probabilities of each path through the network
    init['c']['z'] = zc

    init['d'] = {}
    init['d']['eta'] = eta_d
    init['d']['H'] = H_d
    init['d']['psi'] = psi_d
    init['d']['w_s'] = w_s_d  # Probabilities of each path through the network
    init['d']['z'] = zd

    # The clustering layer is the one used to perform the clustering,
    # i.e. the layer l such that k[l] == n_clusters
    if not isnumeric(n_clusters):
        if n_clusters == 'auto':
            # The first tail layer is the default clustering layer in auto mode
            clustering_layer = L['c']
        elif n_clusters == 'multi':
            clustering_layer = range(L['t'])
        else:
            raise ValueError('Please enter an int, "auto" or "multi" for n_clusters')
    else:
        kc_complete = k['c'] + k['t'][:-1]
        common_clus_layer_idx = (np.array(kc_complete) == n_clusters)
        common_clus_layer_idx[:L['c']] = False
        clustering_layer = np.argmax(common_clus_layer_idx)

        assert clustering_layer >= L['c']

    init['classes'] = paths_pred_c[:, clustering_layer]

    #=======================================================
    # Determining the coefficients of the GLLVM layer
    #=======================================================

    # Determining lambda_bin coefficients
    lambda_bin = np.zeros((nb_bin, r['d'][0] + 1))

    for j in range(nb_bin):
        Nj = int(np.max(y_bin[:, j]))  # The support of the jth binomial is [1, Nj]

        if Nj == 1:  # The variable is Bernoulli rather than binomial
            yj = y_bin[:, j]
            z_new = zd[0]
        else:  # Convert the binomial output into a Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], zd[0])

        lr = LogisticRegression()

        if j < r['d'][0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate([lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])

    ## Identifiability of the binary coefficients
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT_d[0][0]

    # Determining lambda_ord coefficients
    lambda_ord = []

    for j in range(nb_ord):
        yj = y_ord[:, j]

        ol = OrderedLogit()
        ol.fit(zd[0], yj)

        ## Identifiability of the ordinal coefficients
        beta_j = (ol.beta_.reshape(1, r['d'][0]) @ AT_d[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining lambda_categ coefficients
    lambda_categ = []

    for j in range(nb_categ):
        yj = y_categ[:, j]

        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(zd[0], yj)

        ## Identifiability of the categorical coefficients
        beta_j = lr.coef_ @ AT_d[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_categ'] = lambda_categ

    return init
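# Minimal numpy sketch of the identifiability correction shared by both
# dim_reduce_init variants: the slope block of every GLLVM coefficient matrix
# is multiplied by the first-layer transform (AT[0][0] / AT_d[0][0]), while
# the intercept column is left untouched. AT00 below is an arbitrary stand-in.
def _example_identifiability_rotation():
    rng = np.random.default_rng(1)
    r0 = 3
    AT00 = np.diag([0.5, 1.0, 2.0])               # stand-in for AT_d[0][0]
    lambda_bin = rng.normal(size=(4, r0 + 1))     # intercept + r0 slopes per var
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT00  # same operation as above
    return lambda_bin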
def test_ucla_se(self, X_ucla, y_ucla):
    ol = OrderedLogit()
    ol.fit(X_ucla, y_ucla)
    expected_se_ = np.array([0.2658, 0.2979, 0.2606, 0.7795, 0.8043])
    assert_allclose(ol.se_, expected_se_, rtol=0.01)
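# Hedged sketch (assumption): the reference values in these UCLA tests appear
# to match the classic UCLA IDRE ordinal logistic regression example
# (apply ~ pared + public + gpa). One independent way to regenerate comparable
# numbers is statsmodels' OrderedModel; note that statsmodels reports
# thresholds in an incremental parameterization, so only the slopes and the
# first cutpoint are directly comparable. The dataset URL is the historical
# one and may have moved.
def _regenerate_ucla_reference_values():
    import pandas as pd
    from statsmodels.miscmodels.ordinal_model import OrderedModel

    df = pd.read_stata('https://stats.idre.ucla.edu/stat/data/ologit.dta')
    model = OrderedModel(df['apply'], df[['pared', 'public', 'gpa']],
                         distr='logit')
    res = model.fit(method='bfgs', disp=False)
    print(res.params)  # slopes, then transformed thresholds
    print(res.bse)     # standard errors comparable to expected_se_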