    def test_ucla_p_values(self, X_ucla, y_ucla):
        ol = OrderedLogit()
        ol.fit(X_ucla, y_ucla)

        expected_p_values_ = np.array(
            [8.087e-05, 8.435e-01, 1.812e-02, 4.696e-03, 9.027e-08])
        assert_allclose(ol.p_values_, expected_p_values_, rtol=0.01)

    def test_ucla_z_values(self, X_ucla, y_ucla):
        ol = OrderedLogit()
        ol.fit(X_ucla, y_ucla)

        expected_z_values_ = np.array(
            [3.9418, -0.1974, 2.3632, 2.8272, 5.3453])
        assert_allclose(ol._compute_z_values(), expected_z_values_, rtol=0.01)

    def test_ucla_coef(self, X_ucla, y_ucla):
        ol = OrderedLogit()
        ol.fit(X_ucla, y_ucla)

        expected_coef_ = np.array(
            [1.04769, -0.05879, 0.61594, 2.20391, 4.29936])
        assert_allclose(ol.coef_, expected_coef_, rtol=0.01)

    def test_gradient(self):
        ol = OrderedLogit()
        X = np.array([[1.0], [1.0], [1.0]])
        y = np.array([1, 1, 2])
        coefficients = np.array([1.0, 1.0])
        ol.n_attributes = 1
        ol.n_classes = 2
        ol._prepare_y(y)

        expected = np.array([0.5, -0.5])
        assert_array_equal(ol._gradient(coefficients, X, y), expected)
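
The z- and p-values checked above follow from the fitted coefficients and standard errors; a minimal sketch of the usual two-sided Wald test (it reproduces the expected values in the tests, but is not taken from the OrderedLogit source):

import numpy as np
from scipy.stats import norm

coef = np.array([1.04769, -0.05879, 0.61594, 2.20391, 4.29936])   # ol.coef_
se = np.array([0.2658, 0.2979, 0.2606, 0.7795, 0.8043])           # ol.se_
z = coef / se                        # ~ [3.94, -0.20, 2.36, 2.83, 5.35]
p = 2 * (1 - norm.cdf(np.abs(z)))    # ~ [8.1e-05, 0.84, 1.8e-02, 4.7e-03, 9.0e-08]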
Example #5
def dim_reduce_init(y,
                    n_clusters,
                    k,
                    r,
                    nj,
                    var_distrib,
                    use_famd=False,
                    seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space and determine 
    the init coefficients in that space
    
    y (numobs x p ndarray): The observations containing categorical variables
    n_clusters (int): The number of clusters to look for in the data
    k (1d array): The number of components of the latent Gaussian mixture layers
    r (list of int): The dimension of the latent variables of each layer
    nj (p 1darray): For binary/count data: The maximum values that the variable can take. 
                    For ordinal data: the number of different existing categories for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y 
    use_famd (Bool): Whether to use the FAMD method (True) or not (False) to initialise 
                    the first continuous latent variable. Otherwise MCA is used.
    seed (int or None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------------------
    returns (dict): All initialisation parameters
    '''

    L = len(k)
    numobs = len(y)
    S = np.prod(k)

    #==============================================================
    # Dimension reduction performed with MCA
    #==============================================================

    if not isinstance(y, pd.DataFrame):
        raise TypeError('y should be a dataframe for prince')

    if (np.array(var_distrib) == 'ordinal').all():
        print('PCA init')

        pca = prince.PCA(n_components = r[0], n_iter=3, rescale_with_mean=True,\
            rescale_with_std=True, copy=True, check_input=True, engine='auto',\
                random_state = seed)
        z1 = pca.fit_transform(y).values

    elif use_famd:
        famd = prince.FAMD(n_components = r[0], n_iter=3, copy=True, check_input=False, \
                               engine='auto', random_state = seed)
        z1 = famd.fit_transform(y).values

    else:
        # Check input = False to remove
        mca = prince.MCA(n_components = r[0], n_iter=3, copy=True,\
                         check_input=False, engine='auto', random_state = seed)
        z1 = mca.fit_transform(y).values

    z = [z1]
    y = y.values

    #==============================================================
    # Set the shape parameters of each data type
    #==============================================================

    y_bin = y[:, np.logical_or(var_distrib == 'bernoulli',\
                               var_distrib == 'binomial')].astype(int)
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',\
                              var_distrib == 'binomial')]
    nb_bin = len(nj_bin)

    y_ord = y[:, var_distrib == 'ordinal'].astype(float).astype(int)
    nj_ord = nj[var_distrib == 'ordinal']
    nb_ord = len(nj_ord)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical']
    nb_categ = len(nj_categ)

    # Scale the continuous variables to unit standard deviation
    y_cont = y[:, var_distrib == 'continuous']

    # Before was np.float
    y_cont = y_cont / np.std(y_cont.astype(float), axis=0, keepdims=True)
    nb_cont = y_cont.shape[1]

    #=======================================================
    # Determining the Gaussian Parameters
    #=======================================================
    init = {}

    eta = []
    H = []
    psi = []
    paths_pred = np.zeros((numobs, L))

    for l in range(L):
        params = get_MFA_params(z[l], k[l], r[l:])
        eta.append(params['eta'][..., n_axis])
        H.append(params['H'])
        psi.append(params['psi'])
        z.append(params['z_nextl'])
        paths_pred[:, l] = params['classes']

    paths, nb_paths = np.unique(paths_pred, return_counts=True, axis=0)
    paths, nb_paths = add_missing_paths(k, paths, nb_paths)

    w_s = nb_paths / numobs
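    # Clip exactly-zero path probabilities to a tiny positive value so that no
    # path gets a strictly null weight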
    w_s = np.where(w_s == 0, 1E-16, w_s)

    # Check all paths have been explored
    if len(paths) != S:
        raise RuntimeError(f'The real number of paths is {S} while only '
                           f'{len(paths)} paths were found at initialisation')

    w_s = w_s.reshape(*k).flatten('C')

    #=============================================================
    # Enforcing identifiability constraints over the first layer
    #=============================================================

    H = diagonal_cond(H, psi)
    Ez, AT = compute_z_moments(w_s, eta, H, psi)
    eta, H, psi = identifiable_estim_DGMM(eta, H, psi, Ez, AT)

    init['eta'] = eta
    init['H'] = H
    init['psi'] = psi

    init['w_s'] = w_s  # Probabilities of each path through the network
    init['z'] = z

    # The clustering layer is the one used to perform the clustering
    # i.e. the layer l such that k[l] == n_clusters
    clustering_layer = np.argmax(np.array(k) == n_clusters)

    init['classes'] = paths_pred[:, clustering_layer]  # 0 To change with clustering_layer_idx

    #=======================================================
    # Determining the coefficients of the GLLVM layer
    #=======================================================

    # Determining lambda_bin coefficients.

    lambda_bin = np.zeros((nb_bin, r[0] + 1))

    for j in range(nb_bin):
        Nj = np.max(y_bin[:, j])  # The support of the jth binomial is [1, Nj]

        if Nj == 1:  # If the variable is Bernoulli not binomial
            yj = y_bin[:, j]
            z_new = z[0]
        else:  # If not, need to convert Binomial output to Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], z[0])

        lr = LogisticRegression()
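        # For the first r[0] - 1 binary variables, only the first j + 1 latent
        # dimensions are used (the remaining entries stay at zero), presumably
        # to keep a triangular pattern in the loading matrix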

        if j < r[0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate(
                [lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])

    ## Identifiability of bin coefficients
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT[0][0]

    # Determining lambda_ord coefficients
    lambda_ord = []

    for j in range(nb_ord):
        Nj = len(np.unique(
            y_ord[:, j], axis=0))  # The support of the jth ordinal is [1, Nj]
        yj = y_ord[:, j]

        ol = OrderedLogit()
        ol.fit(z[0], yj)

        ## Identifiability of ordinal coefficients
        beta_j = (ol.beta_.reshape(1, r[0]) @ AT[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining the coefficients of the continuous variables
    lambda_cont = np.zeros((nb_cont, r[0] + 1))

    for j in range(nb_cont):
        yj = y_cont[:, j]
        linr = LinearRegression()

        if j < r[0] - 1:
            linr.fit(z[0][:, :j + 1], yj)
            lambda_cont[j, :j + 2] = np.concatenate([[linr.intercept_],
                                                     linr.coef_])
        else:
            linr.fit(z[0], yj)
            lambda_cont[j] = np.concatenate([[linr.intercept_], linr.coef_])

    ## Identifiability of continuous coefficients
    lambda_cont[:, 1:] = lambda_cont[:, 1:] @ AT[0][0]

    # Determining lambda_categ coefficients
    lambda_categ = []

    for j in range(nb_categ):
        yj = y_categ[:, j]

        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(z[0], yj)

        ## Identifiability of categ coefficients
        beta_j = lr.coef_ @ AT[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_cont'] = lambda_cont
    init['lambda_categ'] = lambda_categ

    return init
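
bin_to_bern, used above to expand binomial responses before the Bernoulli logistic fits, is not shown here; a hedged, self-contained sketch of what such a conversion could look like (each observation with count y_j out of Nj trials becomes Nj Bernoulli rows, with its latent scores repeated):

import numpy as np

def bin_to_bern_sketch(Nj, yj, z):
    # yj[i] successes and Nj - yj[i] failures per observation,
    # latent scores repeated Nj times
    y_bern = np.concatenate([np.r_[np.ones(v), np.zeros(Nj - v)] for v in yj]).astype(int)
    z_rep = np.repeat(z, Nj, axis=0)
    return y_bern, z_rep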
Example #6
def rl1_selection(y_bin, y_ord, y_categ, y_cont, zl1_ys, w_s):
    ''' Selects the number of factors on the first latent discrete layer 
    y_bin (n x p_bin ndarray): The binary and count data matrix
    y_ord (n x p_ord ndarray): The ordinal data matrix
    y_categ (n x p_categ ndarray): The categorical data matrix
    y_cont (n x p_cont ndarray): The continuous data matrix
    zl1_ys (M0 x numobs x r_1D x S_1D ndarray): The first layer latent variables
    w_s (list): The path probabilities starting from the first layer
    ------------------------------------------------------------------
    return (list of int): The dimensions to keep for the GLLVM layer
    '''
    
    M0 = zl1_ys.shape[0]
    numobs = zl1_ys.shape[1] 
    r0 = zl1_ys.shape[2]
    S0 = zl1_ys.shape[3] 

    nb_bin = y_bin.shape[1]
    nb_ord = y_ord.shape[1]
    nb_categ = y_categ.shape[1]

    nb_cont = y_cont.shape[1]

            
    PROP_ZERO_THRESHOLD = 0.25
    PVALUE_THRESHOLD = 0.10
    
    # Determine the dimensions that are weakest for Binomial variables
    zero_coef_mask = np.zeros(r0)
    for j in range(nb_bin):
        for s in range(S0):
            Nj = int(np.max(y_bin[:,j])) # The support of the jth binomial is [1, Nj]
            
            if Nj ==  1:  # If the variable is Bernoulli not binomial
                yj = y_bin[:,j]
                z = zl1_ys[:,:,:,s]
            else: # If not, need to convert Binomial output to Bernoulli output
                yj, z = bin_to_bern(Nj, y_bin[:,j], zl1_ys[:,:,:,s])
        
            # Put all the M0 points in a series
            X = z.flatten(order = 'C').reshape((M0 * numobs * Nj, r0), order = 'C')
            y_repeat = np.repeat(yj, M0).astype(int) # Repeat rather than tile to check
            
            lr = LogisticRegression(penalty = 'l1', solver = 'saga')
            lr.fit(X, y_repeat)
            zero_coef_mask += (lr.coef_[0] == 0) * w_s[s]
    
    # Determine the dimensions that are weakest for Ordinal variables
    for j in range(nb_ord):
        for s in range(S0):
            ol = OrderedLogit()
            X = zl1_ys[:,:,:,s].flatten(order = 'C').reshape((M0 * numobs, r0), order = 'C')
            y_repeat = np.repeat(y_ord[:, j], M0).astype(int) # Repeat rather than tile to check
            
            ol.fit(X, y_repeat)
            zero_coef_mask += np.array(ol.summary['p'] > PVALUE_THRESHOLD) * w_s[s]
    
    # Determine the dimensions that are weakest for Categorical variables
    for j in range(nb_categ):
        for s in range(S0):
            z = zl1_ys[:,:,:,s]
                        
            # Put all the M0 points in a series
            X = z.flatten(order = 'C').reshape((M0 * numobs, r0), order = 'C')
            y_repeat = np.repeat(y_categ[:,j], M0).astype(int) # Repeat rather than tile to check
            
            lr = LogisticRegression(penalty = 'l1', solver = 'saga', \
                                    multi_class = 'multinomial')            
            lr.fit(X, y_repeat)  
            
            zero_coef_mask += (lr.coef_[0] == 0) * w_s[s]   
            
    # Determine the dimensions that are weakest for Continuous variables
    for j in range(nb_cont):
        for s in range(S0):
            z = zl1_ys[:,:,:,s]
                        
            # Put all the M0 points in a series
            X = z.flatten(order = 'C').reshape((M0 * numobs, r0), order = 'C')
            y_repeat = np.repeat(y_cont[:,j], M0) # Repeat rather than tile to check
            
            linr = Lasso()
            linr.fit(X, y_repeat)
            
            #coefs = np.concatenate([[linr.intercept_], linr.coef_])
            #zero_coef_mask += (coefs == 0) * w_s[s]   
            # Lasso's coef_ is already 1-D (unlike LogisticRegression's)
            zero_coef_mask += (linr.coef_ == 0) * w_s[s]

    # Voting: Delete the dimensions which have been zeroed a majority of times 
    zeroed_coeff_prop = zero_coef_mask / (nb_ord + nb_bin + nb_categ + nb_cont)
    
    # Need at least r1 = 2 for algorithm to work
    new_rl = np.sum(zeroed_coeff_prop <= PROP_ZERO_THRESHOLD)
    
    if new_rl < 2:
        dims_to_keep = np.argsort(zeroed_coeff_prop)[:2]
        
    else:
        dims_to_keep = list(set(range(r0))  - \
                        set(np.where(zeroed_coeff_prop > PROP_ZERO_THRESHOLD)[0].tolist()))
            
    dims_to_keep = np.sort(dims_to_keep)

    return dims_to_keep
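
A small self-contained example of the voting step at the end of rl1_selection: dimensions whose weighted proportion of zeroed coefficients exceeds PROP_ZERO_THRESHOLD are dropped, while at least two dimensions are always kept (the numbers below are illustrative only):

import numpy as np

zero_coef_mask = np.array([0.1, 1.7, 0.4, 2.9])    # accumulated over 4 variables
zeroed_coeff_prop = zero_coef_mask / 4              # [0.025, 0.425, 0.1, 0.725]
dims_to_keep = np.where(zeroed_coeff_prop <= 0.25)[0]
print(dims_to_keep)                                 # [0 2]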
Example #7
def dim_reduce_init(y, n_clusters, k, r, nj, var_distrib, seed=None):
    ''' Perform dimension reduction into a continuous r dimensional space and determine 
    the init coefficients in that space
    
    y (numobs x p ndarray): The data 
    k (dict of lists): The number of components of each layer of the network
    r (dict of lists): The dimensions of the latent variables of each layer of the network
    nj (p 1darray): For binary/count data: The maximum values that the variable can take. 
                    For ordinal data: the number of different existing categories for each variable
                    For categorical data: the number of different existing categories for each variable
    var_distrib (p 1darray): An array containing the types of the variables in y 
    seed (None): The random state seed to use for the dimension reduction
    ---------------------------------------------------------------------------------------
    returns (dict): All initialisation parameters
    '''

    if not isinstance(y, pd.DataFrame):
        raise TypeError('y should be a dataframe for prince')

    numobs = len(y)

    # Length of both heads and tail. L, bar_L and S might not be homogeneous
    # with the MDGMM notations
    bar_L = {'c': len(k['c']), 'd': len(k['d'])}
    L = {'c': len(k['c']), 'd': len(k['d']), 't': len(k['t']) - 1}

    # Paths of both heads and tail
    S = {'c': np.prod(k['c']), 'd': np.prod(k['d']), 't': np.prod(k['t'])}

    # Data of both heads
    yc = y.iloc[:, var_distrib == 'continuous'].values
    yd = y.iloc[:, var_distrib != 'continuous'].values

    #==============================================================
    # Dimension reduction performed with MCA on discrete data
    #==============================================================

    # Check input = False to remove
    mca = prince.MCA(n_components = r['d'][0], n_iter=3, copy=True,\
                     check_input=False, engine='auto', random_state = seed)
    z1D = mca.fit_transform(yd.astype(str)).values

    y = y.values

    # Be careful: the first z^c is the continuous data, whereas the first
    # z^d is the MCA-transformed data.

    #==============================================================
    # Set the shape parameters of each discrete data type
    #==============================================================

    y_bin = y[:,
              np.logical_or(var_distrib == 'bernoulli', var_distrib ==
                            'binomial')]
    y_bin = y_bin.astype(int)
    nj_bin = nj[np.logical_or(var_distrib == 'bernoulli',
                              var_distrib == 'binomial')]
    nb_bin = len(nj_bin)

    y_categ = y[:, var_distrib == 'categorical']
    nj_categ = nj[var_distrib == 'categorical']
    nb_categ = len(nj_categ)

    y_ord = y[:, var_distrib == 'ordinal']
    y_ord = y_ord.astype(int)
    nj_ord = nj[var_distrib == 'ordinal']
    nb_ord = len(nj_ord)

    ss = StandardScaler()
    yc = ss.fit_transform(yc)

    #=======================================================
    # Determining the Gaussian Parameters
    #=======================================================
    init = {}

    # Initialise both heads quantities
    eta_d, H_d, psi_d, zd, paths_pred_d = init_head(z1D, k['d'], r['d'],
                                                    numobs, L['d'])
    eta_c, H_c, psi_c, zc, paths_pred_c = init_head(yc, k['c'], r['c'], numobs,
                                                    L['c'])

    # Initialisation of the common layer. The coefficients are those between the last
    # Layer of both heads and the first junction layer
    eta_h_last, H_h_last, psi_h_last, paths_pred_h_last, zt_first = init_junction_layer(
        r, k, zc, zd)
    eta_d.append(eta_h_last['d'])
    H_d.append(H_h_last['d'])
    psi_d.append(psi_h_last['d'])

    eta_c.append(eta_h_last['c'])
    H_c.append(H_h_last['c'])
    psi_c.append(psi_h_last['c'])

    paths_pred_d.append(paths_pred_h_last['d'])
    paths_pred_c.append(paths_pred_h_last['c'])
    zt = [zt_first]

    # Initialisation of the following common layers
    for l in range(L['t']):
        params = get_MFA_params(zt[l], k['t'][l], r['t'][l:])
        eta_c.append(params['eta'][..., n_axis])
        eta_d.append(params['eta'][..., n_axis])

        H_c.append(params['H'])
        H_d.append(params['H'])

        psi_c.append(params['psi'])
        psi_d.append(params['psi'])

        zt.append(params['z_nextl'])
        zc.append(params['z_nextl'])
        zd.append(params['z_nextl'])

        paths_pred_c.append(params['classes'])
        paths_pred_d.append(params['classes'])

    paths_pred_c = np.stack(paths_pred_c).T
    paths_c, nb_paths_c = np.unique(paths_pred_c, return_counts=True, axis=0)
    paths_c, nb_paths_c = add_missing_paths(k['c'] + k['t'][:-1], paths_c,
                                            nb_paths_c)

    paths_pred_d = np.stack(paths_pred_d).T
    paths_d, nb_paths_d = np.unique(paths_pred_d, return_counts=True, axis=0)
    paths_d, nb_paths_d = add_missing_paths(k['d'] + k['t'][:-1], paths_d,
                                            nb_paths_d)

    w_s_c = nb_paths_c / numobs
    w_s_c = np.where(w_s_c == 0, 1E-16, w_s_c)

    w_s_d = nb_paths_d / numobs
    w_s_d = np.where(w_s_d == 0, 1E-16, w_s_d)

    k_dt = np.concatenate([k['d'] + k['t']])
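    # Marginalise the discrete-head path probabilities over the head layers to
    # obtain the probabilities of the tail paths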
    w_s_t = w_s_d.reshape(*k_dt, order='C').sum(tuple(range(L['d'])))
    w_s_t = w_s_t.reshape(-1, order='C')

    # Check that all paths have been explored
    if (len(paths_c) != S['c'] * S['t']) | (len(paths_d) != S['d'] * S['t']):
        raise RuntimeError('Path initialisation failed')

    #=============================================================
    # Enforcing identifiability constraints over the first layer
    #=============================================================

    eta_d, H_d, psi_d, AT_d, eta_c, H_c, psi_c, AT_c = network_identifiability(eta_d, \
                    H_d, psi_d, eta_c, H_c, psi_c, w_s_c, w_s_d, w_s_t, bar_L)

    init['c'] = {}
    init['c']['eta'] = eta_c
    init['c']['H'] = H_c
    init['c']['psi'] = psi_c
    init['c']['w_s'] = w_s_c  # Probabilities of each path through the network
    init['c']['z'] = zc

    init['d'] = {}
    init['d']['eta'] = eta_d
    init['d']['H'] = H_d
    init['d']['psi'] = psi_d
    init['d']['w_s'] = w_s_d  # Probabilities of each path through the network
    init['d']['z'] = zd

    # The clustering layer is the one used to perform the clustering
    # i.e. the layer l such that k[l] == n_clusters
    if not isnumeric(n_clusters):
        if n_clusters == 'auto':
            #n_clusters = k['t'][0]
            # First tail layer is the default clustering layer in auto mode
            clustering_layer = L['c']

        elif n_clusters == 'multi':
            clustering_layer = range(L['t'])

        else:
            raise ValueError(
                "n_clusters should be an int, 'auto' or 'multi'")
    else:
        kc_complete = k['c'] + k['t'][:-1]
        common_clus_layer_idx = (np.array(kc_complete) == n_clusters)
        common_clus_layer_idx[:L['c']] = False
        clustering_layer = np.argmax(common_clus_layer_idx)

        assert clustering_layer >= L['c']

    init['classes'] = paths_pred_c[:, clustering_layer]

    #=======================================================
    # Determining the coefficients of the GLLVM layer
    #=======================================================

    # Determining lambda_bin coefficients.
    lambda_bin = np.zeros((nb_bin, r['d'][0] + 1))

    for j in range(nb_bin):
        Nj = int(np.max(
            y_bin[:, j]))  # The support of the jth binomial is [1, Nj]

        if Nj == 1:  # If the variable is Bernoulli not binomial
            yj = y_bin[:, j]
            z_new = zd[0]
        else:  # If not, need to convert Binomial output to Bernoulli output
            yj, z_new = bin_to_bern(Nj, y_bin[:, j], zd[0])

        lr = LogisticRegression()

        if j < r['d'][0] - 1:
            lr.fit(z_new[:, :j + 1], yj)
            lambda_bin[j, :j + 2] = np.concatenate(
                [lr.intercept_, lr.coef_[0]])
        else:
            lr.fit(z_new, yj)
            lambda_bin[j] = np.concatenate([lr.intercept_, lr.coef_[0]])

    ## Identifiability of bin coefficients
    lambda_bin[:, 1:] = lambda_bin[:, 1:] @ AT_d[0][0]

    # Determining lambda_ord coefficients
    lambda_ord = []

    for j in range(nb_ord):
        #Nj = len(np.unique(y_ord[:,j], axis = 0))  # The support of the jth ordinal is [1, Nj]
        yj = y_ord[:, j]

        ol = OrderedLogit()
        ol.fit(zd[0], yj)

        ## Identifiability of ordinal coefficients
        beta_j = (ol.beta_.reshape(1, r['d'][0]) @ AT_d[0][0]).flatten()
        lambda_ord_j = np.concatenate([ol.alpha_, beta_j])
        lambda_ord.append(lambda_ord_j)

    # Determining lambda_categ coefficients
    lambda_categ = []

    for j in range(nb_categ):
        yj = y_categ[:, j]

        lr = LogisticRegression(multi_class='multinomial')
        lr.fit(zd[0], yj)

        ## Identifiability of categ coefficients
        beta_j = lr.coef_ @ AT_d[0][0]
        lambda_categ.append(np.hstack([lr.intercept_[..., n_axis], beta_j]))

    init['lambda_bin'] = lambda_bin
    init['lambda_ord'] = lambda_ord
    init['lambda_categ'] = lambda_categ

    return init
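
A small runnable check of the w_s_t marginalisation used above: summing the discrete-head path probabilities over the head layers leaves a proper distribution over the tail paths (the layer sizes below are illustrative only):

import numpy as np

k_d, k_t = [2, 3], [4, 2]                                  # illustrative head/tail layer sizes
w_s_d = np.random.dirichlet(np.ones(np.prod(k_d + k_t)))   # one weight per (head, tail) path
w_s_t = w_s_d.reshape(*(k_d + k_t), order='C').sum(axis=tuple(range(len(k_d))))
w_s_t = w_s_t.reshape(-1, order='C')
assert np.isclose(w_s_t.sum(), 1.0)
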
def sample_lor():
    lor = OrderedLogit(significance=0.9)
    lor.alpha_ = np.array([1, 1])
    lor.beta_ = np.array([1, 1, 1])
    lor._y_dict = {1: 1, 2: 2, 3: 7}
    lor.n_attributes = 3
    lor.n_classes = 3
    lor.N = 10
    lor.score_ = 0.9
    lor.se_ = np.array([1, 1, 1, 1, 1])
    lor.p_values_ = np.array([0, 0, 0, 0, 0])
    lor.attribute_names = pd.DataFrame(
        ['attribute_1', 'attribute_2', 'attribute_3'],
        columns=['attribute names'])
    return lor
    def test_ucla_se(self, X_ucla, y_ucla):
        ol = OrderedLogit()
        ol.fit(X_ucla, y_ucla)

        expected_se_ = np.array([0.2658, 0.2979, 0.2606, 0.7795, 0.8043])
        assert_allclose(ol.se_, expected_se_, rtol=0.01)