Example #1
def draw_random_binary(n, A):
    ''' Given a p x p square, lower-triangular coefficient matrix A, creates an n by p matrix of
    random binary vectors using Schafer's method: column i is Bernoulli with success probability
    invlogit(X[:, :i] . A[i, :i] + A[i, i]), so A[i, i] acts as the intercept. '''
    m, p = A.shape
    ones   = np.ones((n, 1))
    output = np.empty((n, p))
    for i in np.arange(0, p):
        # columns 0..i-1 are already drawn; the appended column of ones carries the intercept A[i, i]
        output[:, i] = npran.binomial(1, invlogit(np.dot(np.hstack((output[:, 0:i], ones)), A[i, 0:(i + 1)])))
    return output
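A minimal usage sketch (not part of the original listing); it assumes numpy is imported as np, numpy.random as npran, and that invlogit is the logistic function, for which scipy.special.expit is a reasonable stand-in:

import numpy as np
import numpy.random as npran
from scipy.special import expit as invlogit  # assumed stand-in for the module's invlogit

p = 5
# strictly lower-triangular coefficients plus intercepts of -1 on the diagonal
A = np.tril(npran.normal(0, 0.2, (p, p)), k=-1) + np.diag(-np.ones(p))
X = draw_random_binary(1000, A)
print(X.shape)         # (1000, 5)
print(X.mean(axis=0))  # marginal frequency of each binary column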
Example #3
def ising_X(p, n, A_base_diag=-1, A_sd=.2):
    """ Generate an n x p binary X from the Ising-style sequential logistic model """
    # random coefficient matrix; the diagonal (A_base_diag) supplies the intercepts
    A = npran.normal(0, A_sd, (p, p)) + np.diag(A_base_diag * np.ones(p))
    m, p = A.shape
    ones = np.ones((n, 1))
    X = np.empty((n, p))
    for i in np.arange(0, p):
        # only the lower-triangular part A[i, 0:i+1] is ever used, as in draw_random_binary
        X[:, i] = npran.binomial(
            1, invlogit(np.dot(np.hstack((X[:, 0:i], ones)), A[i, 0:(i + 1)])))
    return X
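A brief usage sketch under the same assumptions (np, npran, and an invlogit such as scipy.special.expit in scope):

X = ising_X(p=10, n=500)
print(X.shape)         # (500, 10)
print(X.mean(axis=0))  # column means; near invlogit(-1) ~ 0.27 when A_sd is small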
Example #4
def bern_y(X, p1, base_prob=.25, beta_sd=1):
    """ Draw a Bernoulli response from a logistic model on the first p1 columns of X """
    n, p = X.shape
    X_1 = X[:, :p1]
    v = 0
    # redraw beta and y until [X, y] is numerically full rank (smallest singular value above 1e-5)
    while v < 1E-5:
        beta = npran.randn(p1) * beta_sd
        if p1 > 0:
            eta = cutoff(np.dot(X_1, beta) + logit(base_prob))
            y = npran.binomial(1, invlogit(eta), n)
        else:
            y = npran.binomial(1, base_prob, n)
        v = np.min(nplin.svd(np.hstack((X, y[:, np.newaxis])))[1])
    return y
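A usage sketch for bern_y (not from the source). It assumes numpy as np, numpy.random as npran, numpy.linalg as nplin, logit/invlogit from scipy.special, and a hypothetical cutoff that simply clips the linear predictor; the module's own cutoff may behave differently:

import numpy as np
import numpy.random as npran
import numpy.linalg as nplin
from scipy.special import expit as invlogit, logit

def cutoff(eta, bound=20):
    # hypothetical stand-in: clip the linear predictor so invlogit stays away from exact 0/1
    return np.clip(eta, -bound, bound)

X = npran.binomial(1, 0.25, (200, 6)).astype(float)
y = bern_y(X, p1=3)   # only the first 3 columns carry signal
print(y.mean())       # overall event rate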
Example #6
def genXy_bern_X_norm_beta(seed, n, p1, pnull, x_prob=.25, base_prob=.25, beta_sd=1):
    """ The X are Bernoulli(x_prob): p1 predictive vars, pnull null vars. beta on the p1 vars is ~normal(0, beta_sd) and the intercept is logit(base_prob) """
    if seed is not None:
        npran.seed(seed)
    X_1    = npran.binomial(1, x_prob, (n, p1))
    X_null = npran.binomial(1, x_prob, (n, pnull))
    X      = np.concatenate((X_1, X_null), axis=1)
    beta   = npran.randn(p1) * beta_sd
    if p1 > 0:
        eta = cutoff(np.dot(X_1, beta) + logit(base_prob))
        y   = npran.binomial(1, invlogit(eta), n)
    else:
        y   = npran.binomial(1, base_prob, n)
    return X, y
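Usage sketch, with the same assumed helpers (logit, invlogit, and a clipping cutoff) in scope:

X, y = genXy_bern_X_norm_beta(seed=0, n=500, p1=5, pnull=20)
print(X.shape)   # (500, 25): 5 predictive columns followed by 20 null columns
print(y.mean())  # event rate, roughly base_prob shifted by the random signal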
Example #7
def genXy_binary_X_norm_beta(seed, n, p1, pnull, base_prob=.25, beta_sd=1, A_base_diag=-1, A_sd=.2):
    ''' X is binary from the Ising model, with the coefficients drawn from a normal. Y is binary, with beta's coefficients also from a normal '''
    if seed is not None:
        npran.seed(seed)
    p = p1 + pnull
    # same construction as ising_X: normal coefficients with A_base_diag intercepts on the diagonal
    A = npran.normal(0, A_sd, (p, p)) + np.diag(A_base_diag * np.ones(p))
    X = draw_random_binary(n, A)
    X_1    = X[:, :p1]
    X_null = X[:, p1:]
    beta   = npran.randn(p1) * beta_sd
    if p1 > 0:
        eta = cutoff(np.dot(X_1, beta) + logit(base_prob))
        y   = npran.binomial(1, invlogit(eta), n)
    else:
        y   = npran.binomial(1, base_prob, n)
    return X, y
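Usage sketch; this relies on draw_random_binary from Example #1 plus the same assumed logit/invlogit/cutoff helpers:

X, y = genXy_binary_X_norm_beta(seed=1, n=500, p1=5, pnull=20)
print(X.shape)   # (500, 25); columns beyond the first 5 are null
print(y.mean())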
Example #8
def genXy_given_X_norm_beta(seed, data, n, p1, pnull, base_prob=.25, beta_sd=1):
    ''' X is built by sampling n rows and p1 + pnull columns (with replacement) from the supplied data matrix. Y is binary, with beta's coefficients drawn from a normal '''
    if seed is not None:
        npran.seed(seed)
    p = p1 + pnull
    h, w = data.shape
    rows = npran.choice(h, n)
    X      = data[rows, :][:, npran.choice(w, p)]
    X_1    = X[:, :p1]
    X_null = X[:, p1:]
    beta   = npran.randn(p1) * beta_sd
    if p1 > 0:
        eta = cutoff(np.dot(X_1, beta) + logit(base_prob))
        y   = npran.binomial(1, invlogit(eta), n)
    else:
        y   = npran.binomial(1, base_prob, n)
    return X, y
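Usage sketch with a synthetic placeholder for data (a real binary matrix in practice) and the same assumed helpers; rows and columns are resampled with replacement:

data = npran.binomial(1, 0.3, (1000, 50))
X, y = genXy_given_X_norm_beta(seed=2, data=data, n=300, p1=4, pnull=16)
print(X.shape)   # (300, 20)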
Example #9
def dinvlogit(x):
    '''Derivative of the inverse logit (logistic) function at each point of vector x'''
    return invlogit(x) * (1 - invlogit(x))
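A quick finite-difference check (assuming invlogit is scipy.special.expit):

import numpy as np
from scipy.special import expit as invlogit

x = np.linspace(-4, 4, 9)
h = 1e-6
numeric = (invlogit(x + h) - invlogit(x - h)) / (2 * h)
print(np.max(np.abs(numeric - dinvlogit(x))))  # close to zero, on the order of 1e-10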
Example #10
    def _binary_knockoff(self):
        ''' Creates the new binary knockoffs: random multivariate Bernoulli draws which should have, in expectation,
        the same first two moments as X. Will only work if X is all binary '''

        self._derive_crossmoments()

        ####################################################
        # Get the data corresponding to the original x for the simulations
        ####################################################

        A = np.zeros((2 * self.p, 2 * self.p))

        # Simulate fresh x based on the original data
        if self.method == 'fresh_sim':
            # Fit the upper half of A on the actual data, making this easier
            A[1, 1] = logit(self.mu_lrg[0])
            for i in np.arange(0, self.p):
                # inject the parameters from the logit X_i ~ X_1 + ... + X_(i-1) + 1 into the ith row of A
                A[i, 0:(i + 1)] = sm.GLM(
                    self.X_orig[:, i],
                    np.hstack((self.X_orig[:, 0:i], np.ones((self.n, 1)))),
                    family=sm.families.Binomial()).fit().params

            # Then draw the X
            X_fix = draw_random_binary(self.MCsize, A[:self.p, :self.p])
            nMC = self.MCsize

        # just repeat X a bunch of times
        elif self.method == 'bootstrap':
            # Rather than simulate entirely new X at each stage, use a fixed set of X_1 ... X_(i-1)
            # This definitely makes sense for the original X vars (why simulate when we already have it), but possibly less sense for the knockoffs
            # To get the desired size of Monte Carlo simulation, replicate X until it has at least self.MCsize rows
            repl = np.max((self.MCsize // self.n, 1))
            X_fix = np.repeat(self.X_orig, repl, 0)
            nMC = X_fix.shape[0]

        elif self.method == 'approx':
            X_fix = self.X_orig

        ###################################################
        # Derive the remaining rows of A via Newton-Raphson
        ###################################################

        if self.method == 'approx':
            covinv = np.diag(self.M - np.outer(self.mu_lrg, self.mu_lrg))**-1
            upwt = 1
            X_tmp = np.hstack((self.X_orig, np.ones((self.n, 1)))).T / self.n
            for i in np.arange(self.p, 2 * self.p):
                m = np.append(self.M[i, 0:self.p], self.M[i, i])
                wt = np.diag(
                    np.append(np.append(np.ones(i - self.p), upwt),
                              np.ones(2 * self.p - i))) * np.diag(
                                  np.append(covinv[:self.p], covinv[i]))

                ps = cvx.Variable(self.n)
                objective = cvx.Minimize(cvx.norm(wt * (X_tmp * ps - m), 2))
                constraints = [0 <= ps, ps <= 1]
                prob = cvx.Problem(objective, constraints)
                prob.solve(solver=cvx.SCS, max_iters=100)

                X_fix = np.hstack((X_fix, ps.value))

            for j in range(2):
                # make sure order of variables is mixed up
                for i in np.arange(self.p,
                                   2 * self.p)[npran.permutation(self.p)]:
                    X_tmp = X_fix
                    X_tmp[:, i] = np.ones(self.n)
                    m = self.M[i, :]
                    wt = np.diag(
                        np.append(
                            np.append(np.ones(i - self.p), upwt),
                            np.ones(3 * self.p - i - 1))) * np.diag(covinv)

                    ps = cvx.Variable(self.n)
                    objective = cvx.Minimize(
                        cvx.norm(wt * (X_tmp.T / self.n * ps - m), 2))
                    constraints = [0 <= ps, ps <= 1]
                    prob = cvx.Problem(objective, constraints)
                    prob.solve(solver=cvx.SCS, max_iters=100)

                    X_fix[:, i] = ps.value

                    # draw the actual responses
                    X_fix[:, self.p:] = npran.binomial(1, cutoff(X_fix[:, self.p:]))

        if not self.method == 'approx':
            # Largely from 5.1 in Schafer, including notation

            # the current value and the derivatives are derived by simulation

            # sequence of proportions between 0 and 1 to deal with the case of an ill-conditioned hessian
            por_seq = np.arange(0, 1, .25)
            self.por = np.empty((1, 0))

            for i in np.arange(self.p, 2 * self.p):
                # Now, the Newton-Raphson steps
                # If the hessian becomes singular, we will relax the cross moment requirements, as described in Schafer 5.1 point 2
                # the idea is that the problem is relaxed until X_i is independent of all prior vars
                # as por increases, covariance drops
                # a is the row we are adding to A. Initialize with values as if independent of all other vars

                a = np.append(np.zeros(i), logit(self.mu_lrg[i]))
                X_fix = np.hstack((X_fix, np.ones((nMC, 1))))

                for por in por_seq:
                    # m are the cross moments we are trying to fit
                    m = (1 - por) * self.M[i, 0:(i + 1)] + por * self.M[
                        i, i] * np.append(np.diag(self.M)[0:i], 1)

                    # Minimize the actual difference vector
                    opt = root(self._vector_objective,
                               a,
                               args=(X_fix, m),
                               method='anderson',
                               options={
                                   'maxiter': (i * 2 + 150),
                                   'fatol': 1E-5,
                                   'jac_options': {
                                       'M': 20
                                   }
                               })

                    # update a to most recent estimate, even without convergence
                    a = opt.x

                    # Stop once optimal has been reached
                    if opt.success:
                        self.por = np.append(self.por, por)
                        if por > 0:
                            print "Variable %d relaxed by tau=%.2f" % (
                                i - self.p + 1, por)
                        break

                if not opt.success:
                    a = np.append(np.zeros(i), logit(self.mu_lrg[i]))
                    self.por = np.append(self.por, 1)
                    print "Variable %d fully relaxed" % (i - self.p + 1)

                # put a into A matrix, draw X_i for 'fixed' matrix, update X_out_fix
                A[i, 0:(i + 1)] = a
                X_fix[:, -1] = npran.binomial(1, invlogit(np.dot(X_fix, a)))

        # hang onto A
        self.A = A

        ##############################################
        # Wrapup and get X_ko
        ##############################################

        # If we freshly simulated x, we need to draw ~x based on x
        if self.method == 'fresh_sim':
            self.X_lrg = np.hstack((self.X_orig, np.empty((self.n, self.p))))
            for i in np.arange(self.p, 2 * self.p):
                # need to make sure the knockoff isn't uniformly 0 or 1
                count = 0
                j = 0
                while count == 0 or count == self.n:
                    # first five times we try regenerating
                    if j < 5:
                        self.X_lrg[:, i] = npran.binomial(
                            1,
                            invlogit(
                                np.dot(
                                    np.hstack((self.X_lrg[:, 0:i],
                                               np.ones((self.n, 1)))),
                                    A[i, 0:(i + 1)])))
                        if j > 0:
                            print "Knockoff regenerated to avoid constant value"
                    # otherwise, just randomly flip a few bits
                    else:
                        print "Random noise added to knockoff to avoid constant value"
                        self.X_lrg[:, i] = np.where(
                            npran.binomial(1, .01 * np.ones(self.n)),
                            1 - self.X_lrg[:, i], self.X_lrg[:, i])
                    count = np.sum(self.X_lrg[:, i])
                    j += 1

        elif self.method == 'bootstrap':
            # since we've been drawing the X along the way, can subset X_fix to get X_ko
            self.X_lrg = np.concatenate((self.X_orig, X_fix[0::repl, self.p:]),
                                        axis=1)

        elif self.method == 'approx':
            self.X_lrg = X_fix

        # Evaluate how close we are empirically to M
        self.M_distortion = nplin.norm(
            self.M[:, self.p:] - np.dot(self.X_lrg.T, self.X_lrg[:, self.p:]) /
            self.n) / nplin.norm(self.M[:, self.p:])

        self.emp_ko_corr = np.corrcoef(
            self.X_lrg, rowvar=0,
            bias=1)[:self.p, self.p:2 * self.p][np.identity(self.p) == 1]
        if np.sum(np.isnan(self.emp_ko_corr)) > 0:
            print "There were %d out of %d variables with missing correlation" % (
                np.sum(np.isnan(self.emp_ko_corr)), self.p)
    def _vector_objective(self, a, X_fix, m):
        return np.mean(invlogit(np.dot(X_fix, a))[:, np.newaxis] * X_fix, axis=0) - m
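For context, an illustrative sketch (not from the source): the root-finding step in _binary_knockoff searches for a coefficient row a such that the Monte Carlo estimate of E[invlogit(X_fix a) * X_fix] matches the target cross moments m, and _vector_objective returns exactly that residual. Evaluating it standalone, with made-up X_fix and a:

import numpy as np
import numpy.random as npran
from scipy.special import expit as invlogit  # assumed stand-in for invlogit

X_fix = np.hstack((npran.binomial(1, 0.3, (5000, 3)).astype(float),
                   np.ones((5000, 1))))      # three binary columns plus an intercept column
a = np.array([0.5, -0.2, 0.1, -1.0])         # candidate row of A
moments = np.mean(invlogit(X_fix.dot(a))[:, np.newaxis] * X_fix, axis=0)
print(moments)   # compared against the target vector m when solving for a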
Example #14
def plot_dataset(X0_l,
                 X0_h,
                 X,
                 cov_l,
                 cov_d,
                 K,
                 K_noiseless,
                 K_s,
                 f,
                 f_latent,
                 low_fidelity_error_inds=None,
                 trace_vals=None,
                 trace_high_only_vals=None,
                 post_mean_hf_label_regr=None,
                 is_legend_on=False):
    n_l = len(X0_l)
    n_h = len(X0_h)

    fig, ax = plt.subplots(figsize=(14, 4))

    ax.scatter(X0_h,
               f[n_l:n_l + n_h],
               s=40,
               color='black',
               label='$D_H$',
               zorder=2)

    if low_fidelity_error_inds is not None:
        low_fidelity_correct_inds = np.setdiff1d(np.arange(n_l),
                                                 low_fidelity_error_inds)
        ax.scatter(X0_l[low_fidelity_correct_inds],
                   f[low_fidelity_correct_inds],
                   s=20,
                   color='grey',
                   label='$D_L$',
                   marker='o',
                   facecolor='none')
        ax.scatter(X0_l[low_fidelity_error_inds],
                   f[low_fidelity_error_inds],
                   color='coral',
                   marker='x',
                   label='Errors in $D_L$')
    else:
        ax.scatter(X0_l,
                   f[:n_l],
                   s=20,
                   color='grey',
                   label='$D_L$',
                   marker='o',
                   facecolor='none')

    plt.hlines(0.5,
               0,
               3,
               linestyle=':',
               linewidth=1,
               color='grey',
               label='Class boundary')

    L_hf = np.linalg.cholesky(K.eval())
    alpha_hf = np.linalg.solve(L_hf.T, np.linalg.solve(L_hf, f_latent))
    post_mean_hf = invlogit(np.dot(K_s.T.eval(), alpha_hf))
    ax.plot(X,
            post_mean_hf,
            color='g',
            alpha=0.8,
            label='True $\sigma(f_H)$')

    if post_mean_hf_label_regr is not None:
        ax.plot(X,
                post_mean_hf_label_regr,
                color='gray',
                label='Regression',
                linestyle='--')

    if trace_high_only_vals is not None:
        ax.plot(X,
                invlogit(np.mean(logit(trace_high_only_vals), axis=0)),
                color='blue',
                label='$p(c(x_*)=1|D_H, x_*)$',
                linestyle='--')

    if trace_vals is not None:
        ax.plot(X,
                invlogit(np.mean(logit(trace_vals), axis=0)),
                color='darkblue',
                label='$p(c(x_*)=1|D_L, D_H, x_*)$')

    plt.xlabel('$\Omega$', fontsize=16)
    plt.ylabel('Class labels and predictions', fontsize=14)
    ax.set_xlim(0, 1)
    ax.set_ylim(-0.1, 1.1)
    if is_legend_on:
        ax.legend(bbox_to_anchor=(1, 1), loc=2, fontsize=14)
Example #15
 def _vector_objective(self, a, X_fix, m):
     return np.mean(invlogit(np.dot(X_fix, a))[:, np.newaxis] * X_fix,
                    axis=0) - m
Example #16
 def _new_obj(self, eta, X, m):
     return np.append(
         np.dot(X.T, invlogit(eta)) - m, np.zeros(X.shape[0] - X.shape[1]))
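For reference, an assumption-laden sketch (the data, targets, and dimensions below are made up): _new_obj pads the moment residuals np.dot(X.T, invlogit(eta)) - m with zeros so the output has the same length as the unknown eta, which allows a square root finder such as scipy.optimize.root to be applied. The padding can be seen directly:

import numpy as np
from scipy.special import expit as invlogit  # assumed stand-in

n, k = 200, 4
X = np.hstack((np.random.binomial(1, 0.4, (n, k - 1)).astype(float), np.ones((n, 1))))
m = X.T.dot(np.full(n, 0.3))      # hypothetical target moments
eta = np.zeros(n)                 # candidate latent values, one per observation

residual = np.append(X.T.dot(invlogit(eta)) - m, np.zeros(n - k))
print(residual.shape)             # (200,): k genuine residuals followed by n - k zeros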