Example #1
 def f(mixing,G0_standardized_val=G0_standardized_val,G1_standardized_val=G1_standardized_val,covar=covar,y=y,**kwargs):
     _mix(G, G0_standardized_val,G1_standardized_val,mixing)
     lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)
     result = lmm.findH2()
     if (resmin[0] is None) or (result['nLL']<resmin[0]['nLL']):
         resmin[0]=result
     return result['nLL']
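
Examples #1 and #2 show the objective that a mixing-weight search hands to a scalar optimizer: for each candidate mixing value the two kernels are re-mixed, an LMM is refit, and the negative log-likelihood is returned, with the best fit memoized in resmin. Below is a minimal, self-contained sketch of that pattern; the quadratic stand-in for the nLL surface is hypothetical, whereas the real objective would call _mix and fastLMM as above.

import numpy as np
from scipy.optimize import minimize_scalar

resmin = [None]  # memoize the best result seen so far, as in the examples

def f(mixing):
    # stand-in for: _mix(G, G0, G1, mixing); lmm = fastLMM(...); result = lmm.findH2()
    result = {'mixing': mixing, 'nLL': (mixing - 0.3) ** 2 + 1.0}  # hypothetical nLL surface
    if resmin[0] is None or result['nLL'] < resmin[0]['nLL']:
        resmin[0] = result
    return result['nLL']

# bounded scalar search over the mixing weight in [0, 1]
opt = minimize_scalar(f, bounds=(0.0, 1.0), method='bounded')
print(opt.x, resmin[0])  # optimizer's mixing weight and the memoized best result
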
Example #2
 def f(mixing,G0_standardized_val=G0_standardized_val,G1_standardized_val=G1_standardized_val,covar=covar,y=y,**kwargs):
     _mix(G, G0_standardized_val,G1_standardized_val,mixing)
     lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)
     result = lmm.findH2()
     if (resmin[0] is None) or (result['nLL']<resmin[0]['nLL']):
         resmin[0]=result
     return result['nLL']
Example #3
    def run_gwas(self):
        """
        invoke all steps in the right order
        """
        with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

            from fastlmm.inference.lmm_cov import LMM as fastLMM

            if self.train_pcs is None and self.train_snps is not None:
                assert self.mixing == 0.0
                G = self.train_snps
            elif self.train_pcs is not None and self.train_snps is None:
                assert self.mixing == 0.0
                G = self.train_pcs
            else:
                logging.info("concat pcs, mixing {0}".format(self.mixing))
                G = np.concatenate(
                    (np.sqrt(1.0 - self.mixing) * self.train_snps,
                     np.sqrt(self.mixing) * self.train_pcs), 1)

            #TODO: make sure low-rank case is handled correctly
            lmm = fastLMM(X=self.cov, Y=self.phen, G=G, K=None)

            if self.findh2:
                opt = lmm.findH2(nGridH2=100)
                h2 = opt['h2']
                assert self.delta is None, "either findh2 or set delta"
            else:
                h2 = 0.0
                assert self.delta is not None
                logging.info("using externally provided delta")

            res = lmm.nLLeval(h2=h2,
                              delta=self.delta,
                              dof=None,
                              scale=1.0,
                              penalty=0.0,
                              snps=self.test_snps)

            chi2stats = res['beta'] * res['beta'] / res['variance_beta']

            self.p_values = stats.chi2.sf(chi2stats, 1)[:, 0]
            self.p_values_F = stats.f.sf(
                chi2stats, 1,
                G.shape[0] - (lmm.linreg.D + 1))[:, 0]  # G.shape[0] is the number of individuals
            self.p_idx = np.argsort(self.p_values)
            self.sorted_p_values = self.p_values[self.p_idx]

            self.p_idx_F = np.argsort(self.p_values_F)
            self.sorted_p_values_F = self.p_values_F[self.p_idx_F]
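
The concatenation G = [sqrt(1 - mixing) * train_snps, sqrt(mixing) * train_pcs] used in run_gwas implies the kernel K = G G^T = (1 - mixing) * K_snps + mixing * K_pcs, i.e. a convex combination of the SNP and PC kernels. A quick numpy check of that identity, with random matrices standing in for self.train_snps and self.train_pcs:

import numpy as np

rng = np.random.RandomState(0)
train_snps = rng.randn(20, 50)   # stand-in for self.train_snps
train_pcs = rng.randn(20, 5)     # stand-in for self.train_pcs
mixing = 0.3

G = np.concatenate((np.sqrt(1.0 - mixing) * train_snps,
                    np.sqrt(mixing) * train_pcs), 1)

K = G.dot(G.T)
K_expected = (1.0 - mixing) * train_snps.dot(train_snps.T) + mixing * train_pcs.dot(train_pcs.T)
np.testing.assert_allclose(K, K_expected)  # the concatenation mixes the two kernels
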
Example #4
    def run_gwas(self):
        """
        invoke all steps in the right order
        """

        from fastlmm.inference.lmm_cov import LMM as fastLMM

        if self.train_pcs is None and self.train_snps is not None:
            assert self.mixing == 0.0
            G = self.train_snps        
        elif self.train_pcs is not None and self.train_snps is None:
            assert self.mixing == 0.0
            G = self.train_pcs
        else:
            logging.info("concat pcs, mixing {0}".format(self.mixing))
            G = np.concatenate((np.sqrt(1.0-self.mixing) * self.train_snps, np.sqrt(self.mixing) * self.train_pcs),1)

        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=self.cov, Y=self.phen, G=G, K=None)

        if self.findh2:
            opt = lmm.findH2(nGridH2=100)
            h2 = opt['h2']
            assert self.delta is None, "either findh2 or set delta"
        else:
            h2 = 0.0
            assert self.delta is not None
            logging.info("using externally provided delta")

        res = lmm.nLLeval(h2=h2, delta=self.delta, dof=None, scale=1.0, penalty=0.0, snps=self.test_snps)
        
        
        chi2stats = res['beta']*res['beta']/res['variance_beta']
        
        self.p_values = stats.chi2.sf(chi2stats,1)[:,0]
        self.p_values_F = stats.f.sf(chi2stats, 1, G.shape[0] - (lmm.linreg.D + 1))[:, 0]  # G.shape[0] is the number of individuals
        self.p_idx = np.argsort(self.p_values)        
        self.sorted_p_values = self.p_values[self.p_idx]

        self.p_idx_F = np.argsort(self.p_values_F)
        self.sorted_p_values_F = self.p_values_F[self.p_idx_F]
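
Both run_gwas variants turn the SNP effect estimates into Wald statistics, beta^2 / var(beta), and convert them to p-values with either a chi-square(1) or an F(1, N - D - 1) survival function. A small scipy sketch of that conversion, using made-up estimates (the numbers below are hypothetical, not taken from the examples):

import numpy as np
from scipy import stats

beta = np.array([[0.8], [0.1], [-0.5]])              # hypothetical SNP effect estimates
variance_beta = np.array([[0.04], [0.05], [0.06]])   # hypothetical estimator variances
N, D = 500, 3                                         # individuals and covariate dimension

chi2stats = beta * beta / variance_beta               # Wald statistics, one per SNP
p_chi2 = stats.chi2.sf(chi2stats, 1)[:, 0]            # asymptotic chi-square(1) p-values
p_F = stats.f.sf(chi2stats, 1, N - (D + 1))[:, 0]     # finite-sample F(1, N - D - 1) p-values
print(p_chi2, p_F)
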
Example #5
def _internal_single(G0_standardized, test_snps, pheno,covar, G1_standardized,
                 mixing, #!!test mixing and G1
                 h2, log_delta,
                 cache_file):

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0/(np.exp(log_delta)+1)

    covar = np.hstack((covar['vals'],np.ones((test_snps.iid_count, 1))))  #We always add 1's to the end.
    y =  pheno['vals']

    from pysnptools.standardizer import DiagKtoN

    assert mixing is None or 0.0 <= mixing <= 1.0

    if cache_file is not None and os.path.exists(cache_file):
        lmm = fastLMM(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data: #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
    else:
        # combine two kernels (normalize kernels to diag(K)=N)
        G0_standardized_val = DiagKtoN(G0_standardized.val.shape[0]).standardize(G0_standardized.val)
        G1_standardized_val = DiagKtoN(G1_standardized.val.shape[0]).standardize(G1_standardized.val)

        if mixing == 0.0 or G1_standardized.sid_count == 0:
            G = G0_standardized.val
        elif mixing == 1.0 or G0_standardized.sid_count == 0:
            G = G1_standardized.val
        else:
            G = np.empty((G0_standardized.iid_count,G0_standardized.sid_count+G1_standardized.sid_count))
            if mixing is None:
                mixing, h2 = _find_mixing(G, covar, G0_standardized_val, G1_standardized_val, h2, y)
            _mix(G, G0_standardized_val,G1_standardized_val,mixing)
        
        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)


    if h2 is None:
        result = lmm.findH2()
        h2 = result['h2']
    logging.info("h2={0}".format(h2))

    snps_read = test_snps.read().standardize()
    res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=snps_read.val)

    if cache_file is not None and not os.path.exists(cache_file):
        pstutil.create_directory_if_necessary(cache_file)
        np.savez(cache_file, lmm.U,lmm.S) #using np.savez instead of pickle because it seems to be faster to read and write


    beta = res['beta']
        
    chi2stats = beta*beta/res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    if G0_standardized is not None:
        assert G.shape[0] == lmm.U.shape[0]
    p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0] - 3)[:, 0]  # lmm.U.shape[0] is the number of individuals; 3 is the number of fixed effects (covariates + SNP)


    items = [
        ('SNP', snps_read.sid),
        ('Chr', snps_read.pos[:,0]), 
        ('GenDist', snps_read.pos[:,1]),
        ('ChrPos', snps_read.pos[:,2]), 
        ('PValue', p_values),
        ('SnpWeight', beta[:,0]),
        ('SnpWeightSE', np.sqrt(res['variance_beta'][:,0])),
        ('SnpFractVarExpl', np.sqrt(res['fraction_variance_explained_beta'][:,0])),
        ('Nullh2', np.zeros((snps_read.sid_count)) + h2)
    ]
    frame = pd.DataFrame.from_items(items)

    return frame
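
Example #5 converts an externally supplied log_delta into the heritability parameter via h2 = 1 / (exp(log_delta) + 1); Example #8 below uses the inverse mapping delta = (1 - h2) / h2. A short round-trip check of that reparameterization (the log_delta value is arbitrary):

import numpy as np

log_delta = 0.7                        # hypothetical externally provided value
h2 = 1.0 / (np.exp(log_delta) + 1)     # as in _internal_single above
delta = (1.0 - h2) / h2                # back to the delta parameterization
np.testing.assert_allclose(np.log(delta), log_delta)
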
Example #6
def _internal_single(G0_standardized, test_snps, pheno,covar, G1_standardized,
                 mixing, #!!test mixing and G1
                 external_log_delta, min_log_delta, max_log_delta):


    covar = np.hstack((covar['vals'],np.ones((test_snps.iid_count, 1))))  #We always add 1's to the end.
    y =  pheno['vals']

    

    assert 0.0 <= mixing <= 1.0
    
    # combine two kernels (normalize kernels to diag(K)=N)
    if mixing == 0.0:
        G0_standardized_val = 1./np.sqrt((G0_standardized.val**2).sum() / float(G0_standardized.val.shape[0])) * G0_standardized.val
        G = G0_standardized_val
    elif mixing == 1.0:
        G1_standardized_val = 1./np.sqrt((G1_standardized.val**2).sum() / float(G1_standardized.val.shape[0])) * G1_standardized.val
        G = G1_standardized_val
    else:
        assert G1_standardized.sid_count > 0, "If a nonzero mixing weight is given, G1 is required"
        logging.info("concat G1, mixing {0}".format(mixing))
        
        #TODO: make this efficient (write C-code to perform this operation in-place)!!
        G0_standardized_val = 1./np.sqrt((G0_standardized.val**2).sum() / float(G0_standardized.val.shape[0])) * G0_standardized.val
        G1_standardized_val = 1./np.sqrt((G1_standardized.val**2).sum() / float(G1_standardized.val.shape[0])) * G1_standardized.val
        
        #G = np.concatenate((np.sqrt(1.0-mixing) * G0_norm, np.sqrt(mixing) * G1_norm),1)
        G = np.concatenate((np.sqrt(1.0-mixing) * G0_standardized_val, np.sqrt(mixing) * G1_standardized_val),1)
        

    #TODO: make sure low-rank case is handled correctly
    from fastlmm.inference.lmm_cov import LMM as fastLMM
    lmm = fastLMM(X=covar, Y=y, G=G, K=None)

    if external_log_delta is None:
        result = lmm.find_log_delta(sid_count=1, min_log_delta=min_log_delta, max_log_delta=max_log_delta)
        external_log_delta = result['log_delta']
    internal_delta = np.exp(external_log_delta)
    logging.info("internal_delta={0}".format(internal_delta))
    logging.info("external_log_delta={0}".format(external_log_delta))

    snps_read = test_snps.read().standardize()
    res = lmm.nLLeval(delta=internal_delta, dof=None, scale=1.0, penalty=0.0, snps=snps_read.val)
    beta = res['beta']
        
    chi2stats = beta*beta/res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    p_values = stats.f.sf(chi2stats, 1, G.shape[0] - 3)[:, 0]  # G.shape[0] is the number of individuals; 3 is the number of fixed effects (covariates + SNP)


    items = [
                ('SNP', snps_read.sid),
                ('Chr', snps_read.pos[:,0]), 
                ('GenDist', snps_read.pos[:,1]),
                ('ChrPos', snps_read.pos[:,2]), 
                ('PValue', p_values),
                ('SnpWeight', beta[:,0]),
                ('SnpWeightSE', np.sqrt(res['variance_beta'][:,0])),
                ('NullLogDelta', np.zeros((snps_read.sid_count)) + external_log_delta)
            ]
    frame = pd.DataFrame.from_items(items)

    return frame
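
Example #6 normalizes each genotype matrix by 1 / sqrt(sum(G**2) / N), which is exactly the rescaling that makes trace(G G^T) = N, i.e. diag(K) summing to the number of individuals; pysnptools' DiagKtoN standardizer in Example #5 achieves the same thing. A numpy check of that property on a random stand-in matrix:

import numpy as np

rng = np.random.RandomState(1)
G = rng.randn(30, 200)                                   # stand-in genotype matrix (N x S)
norm_factor = 1.0 / np.sqrt((G ** 2).sum() / float(G.shape[0]))
G_standardized = norm_factor * G

K = G_standardized.dot(G_standardized.T)
np.testing.assert_allclose(np.trace(K), G.shape[0])      # trace(K) == N after normalization
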
Example #7
    def _getScalesPairwise(self, verbose=False, initDiagonal=False):
        """
		Internal function for parameter initialization
		Uses a single trait model for initializing variances and
		a pairwise model to initialize correlations
		"""
        var = sp.zeros((self.P, 2))

        if initDiagonal:
            #1. fit single trait model
            if verbose:
                print('.. fit single-trait model for initialization')
            vc = VarianceDecomposition(self.Y[:, 0:1])
            for term_i in range(self.n_randEffs):
                if term_i == self.noisPos:
                    vc.addRandomEffect(is_noise=True)
                else:
                    K = self.vd.getTerm(term_i).getK()
                    vc.addRandomEffect(K=K)
            scales0 = sp.sqrt(0.5) * sp.ones(2)

            for p in range(self.P):
                if verbose: print('   .. trait %d' % p)
                vc.setY(self.Y[:, p:p + 1])
                conv = vc.optimize(scales0=scales0)
                if not conv:
                    print('warning initialization not converged')
                var[p, :] = vc.getVarianceComps()[0, :]

        elif fastlmm_present:
            if verbose:
                print('.. fit single-trait model for initialization (using fastlmm)')
            for p in range(self.P):
                if verbose: print('   .. trait %d' % p)
                covariates = None
                for term_i in range(self.n_randEffs):
                    if term_i == self.noisPos:
                        pass
                    else:
                        K = self.vd.getTerm(term_i).getK()
                varY = sp.var(self.Y[:, p:p + 1])
                lmm = fastLMM(X=covariates, Y=self.Y[:, p:p + 1], G=None, K=K)
                opt = lmm.findH2(nGridH2=100)
                h2 = opt['h2']
                var[p, :] = h2 * varY
                var[p, self.noisPos] = (1.0 - h2) * varY
                #import ipdb;ipdb.set_trace()
        else:
            if verbose:
                print('.. random initialization of diagonal')
            var = sp.random.randn(var.shape[0], var.shape[1])
            var = var * var + 0.001
        #2. fit pairwise model
        if verbose:
            print('.. fit pairwise model for initialization')
        vc = VarianceDecomposition(self.Y[:, 0:2])
        for term_i in range(self.n_randEffs):
            if term_i == self.noisPos:
                vc.addRandomEffect(is_noise=True, trait_covar_type='freeform')
            else:
                K = self.vd.getTerm(term_i).getK()
                vc.addRandomEffect(K=K, trait_covar_type='freeform')
        rho_g = sp.ones((self.P, self.P))
        rho_n = sp.ones((self.P, self.P))
        for p1 in range(self.P):
            for p2 in range(p1):
                if verbose:
                    print('   .. fit pair (%d,%d)' % (p1, p2))
                vc.setY(self.Y[:, [p1, p2]])
                scales0 = sp.sqrt(
                    sp.array([
                        var[p1, 0], 1e-4, var[p2, 0], 1e-4, var[p1, 1], 1e-4,
                        var[p2, 1], 1e-4
                    ]))
                conv = vc.optimize(scales0=scales0)
                if not conv:
                    print('warning initialization not converged')
                Cg = vc.getTraitCovar(0)
                Cn = vc.getTraitCovar(1)
                rho_g[p1, p2] = Cg[0, 1] / sp.sqrt(Cg.diagonal().prod())
                rho_n[p1, p2] = Cn[0, 1] / sp.sqrt(Cn.diagonal().prod())
                rho_g[p2, p1] = rho_g[p1, p2]
                rho_n[p2, p1] = rho_n[p1, p2]
        #3. init
        Cg0 = rho_g * sp.dot(sp.sqrt(var[:, 0:1]), sp.sqrt(var[:, 0:1].T))
        Cn0 = rho_n * sp.dot(sp.sqrt(var[:, 1:2]), sp.sqrt(var[:, 1:2].T))
        offset_g = abs(sp.minimum(sp.linalg.eigh(Cg0)[0].min(), 0)) + 1e-4
        offset_n = abs(sp.minimum(sp.linalg.eigh(Cn0)[0].min(), 0)) + 1e-4
        Cg0 += offset_g * sp.eye(self.P)
        Cn0 += offset_n * sp.eye(self.P)
        Lg = sp.linalg.cholesky(Cg0)
        Ln = sp.linalg.cholesky(Cn0)
        Cg_params0 = sp.concatenate([Lg[:, p][:p + 1] for p in range(self.P)])
        Cn_params0 = sp.concatenate([Ln[:, p][:p + 1] for p in range(self.P)])
        scales0 = sp.concatenate(
            [Cg_params0, 1e-2 * sp.ones(1), Cn_params0, 1e-2 * sp.ones(1)])

        return scales0
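
The pairwise initialization in Examples #7 and #11 builds trait covariance matrices from per-trait variances and pairwise correlations, shifts the spectrum with a small diagonal offset so the matrices are safely positive definite, and then parameterizes them through their Cholesky factors. A self-contained numpy sketch of that step for a hypothetical 3-trait case (all numbers invented for illustration):

import numpy as np

P = 3
rng = np.random.RandomState(2)
var_g = np.abs(rng.randn(P)) + 0.1           # hypothetical per-trait genetic variances
rho_g = np.eye(P)
rho_g[0, 1] = rho_g[1, 0] = 0.9              # hypothetical pairwise genetic correlations
rho_g[0, 2] = rho_g[2, 0] = -0.95
rho_g[1, 2] = rho_g[2, 1] = 0.8

# covariance from correlations and variances (outer product of standard deviations)
Cg0 = rho_g * np.outer(np.sqrt(var_g), np.sqrt(var_g))

# shift the spectrum so the matrix is positive definite, then factorize
offset_g = abs(min(np.linalg.eigvalsh(Cg0).min(), 0)) + 1e-4
Cg0 += offset_g * np.eye(P)
Lg = np.linalg.cholesky(Cg0)                 # triangular factor used as the initialization
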
Example #8

####load GRM, covariates, and pheno
G0 = np.array(ro.r('G0<-npyLoad("/home/mccuem/norto253/WP_2Million/Elaine_v3/R_scripts/G0_mat.npy")'))
covar = np.array(ro.r('covar<-npyLoad("/home/mccuem/norto253/WP_2Million/Elaine_v3/R_scripts/covar_mat.npy")'))
y = np.array(ro.r('y<-npyLoad("/home/mccuem/norto253/WP_2Million/Elaine_v3/R_scripts/y_mat.npy")'))


#norm_factor = 1./np.sqrt((G0**2).sum() / float(G0.shape[0]))
#G0_standardized_val = norm_factor * G0
from pysnptools.standardizer import DiagKtoN
#G0_standardized_val = DiagKtoN(G0.shape[0]).standardize(G0)
G0_standardized_val = G0

from fastlmm.inference.lmm_cov import LMM as fastLMM
lmm = fastLMM(X=covar, Y=y, G=G0_standardized_val)
result = lmm.findH2()
#dir(lmm) #lists attributes

if result['h2'] > -1:
    h2 = result['h2']
else:
    h2 = result['h2'][0]

residual_var = 1-h2
delta = residual_var/h2
m1_df = sum(lmm.S/(lmm.S + delta))
m1 = result['nLL'][0]*-1

#nr, nc = G0_standardized_val.shape
#G0_standardized_val_vec = ro.FloatVector(G0_standardized_val.transpose().reshape((G0_standardized_val.size)))
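
Example #8 converts the estimated h2 into delta = (1 - h2) / h2 and then computes an effective model complexity from the spectrum of the genetic kernel as sum(S / (S + delta)): eigenvalues much larger than delta each contribute roughly one degree of freedom, eigenvalues much smaller contribute almost nothing. A small numpy illustration with a made-up spectrum:

import numpy as np

h2 = 0.4                                        # hypothetical heritability estimate
delta = (1.0 - h2) / h2                         # residual-to-genetic variance ratio

S = np.array([50.0, 10.0, 2.0, 0.01, 0.001])    # hypothetical eigenvalues (lmm.S)
effective_df = np.sum(S / (S + delta))
print(effective_df)                             # large eigenvalues count ~1 each, tiny ones ~0
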
Example #9
    def run_select(self, G0, G_bg, y, cov=None):
        """set up two kernel feature selection
    
        Parameters
        ----------
        G0 : numpy array of shape (num_ind, num_snps)
            Data matrix from which foreground snps will be selected

        G_bg : numpy array of shape (num_ind, num_snps)
            Data matrix containing background snps on which the selection will be conditioned

        y : numpy vector of shape (num_ind, )
            Vector of phenotypes

        cov : numpy array of shape (num_ind, num_covariates) or None
            Covariates to be used as fixed effects

        Returns
        -------
        best_k, feat_idx, best_mix, best_delta: tuple(int, np.array(int), float, float)
            best_k is the best number of SNPs selected,
            feat_idx is a np.array of integers denoting the indices of these snps,
            best_mix is the best mixing coefficient between foreground and background kernel,
            best_delta is the best regularization coefficient
        """

        num_ind = len(y)

        if cov is None:
            cov = np.ones((num_ind, 1))
        else:
            logging.info("normalizing covariates")
            cov = cov.copy()
            cov = 1. / np.sqrt((cov**2).sum() / float(cov.shape[0])) * cov
        cov.flags.writeable = False

        # normalize to diag(K) = N
        norm_factor = 1. / np.sqrt((G_bg**2).sum() / float(G_bg.shape[0]))

        # we copy in case G and G_bg are pointing to the same object
        G_bg = norm_factor * G_bg

        K_bg_full = G_bg.dot(G_bg.T)
        K_bg_full.flags.writeable = False

        # some asserts
        np.testing.assert_almost_equal(sum(np.diag(K_bg_full)), G_bg.shape[0])
        if self.debug:
            norm_factor_check = 1. / np.sqrt(G_bg.shape[1])
            np.testing.assert_array_almost_equal(norm_factor,
                                                 norm_factor_check,
                                                 decimal=1)

        for kfold_idx, (train_idx, test_idx) in enumerate(
                KFold(num_ind,
                      n_folds=self.n_folds,
                      random_state=self.random_state,
                      shuffle=True)):

            t0 = time.time()
            logging.info("running fold: %i" % kfold_idx)

            y_train = y.take(train_idx, axis=0)
            y_test = y.take(test_idx, axis=0)
            G0_train = G0.take(train_idx, axis=0)
            G0_test = G0.take(test_idx, axis=0)

            G_bg_train = G_bg.take(train_idx, axis=0)
            G_bg_test = G_bg.take(test_idx, axis=0)

            cov_train = cov.take(train_idx, axis=0)
            cov_test = cov.take(test_idx, axis=0)

            # write protect data
            y_train.flags.writeable = False
            y_test.flags.writeable = False
            G0_train.flags.writeable = False
            G0_test.flags.writeable = False
            G_bg_train.flags.writeable = False
            G_bg_test.flags.writeable = False
            cov_train.flags.writeable = False
            cov_test.flags.writeable = False

            # precompute background kernel
            K_bg_train = K_bg_full.take(train_idx, axis=0).take(train_idx,
                                                                axis=1)
            K_bg_train.flags.writeable = False

            if self.measure != "mse":
                K_bg_test = K_bg_full.take(test_idx, axis=0).take(test_idx,
                                                                  axis=1)
                K_bg_test.flags.writeable = False

            # rank features
            if self.order_by_lmm:
                logging.info("using linear mixed model to rank features")
                t0 = time.time()
                gwas = FastGwas(G_bg_train,
                                G0_train,
                                y_train,
                                delta=None,
                                train_pcs=None,
                                mixing=0.0,
                                cov=cov_train)
                gwas.run_gwas()
                _pval = gwas.p_values
                logging.info("time taken: %s" % (str(time.time() - t0)))
            else:
                logging.info("using linear regression to rank features")
                _F, _pval = lin_reg.f_regression_block(
                    lin_reg.f_regression_cov_alt,
                    G0_train,
                    y_train,
                    blocksize=10000,
                    C=cov_train)

            feat_idx = np.argsort(_pval)

            for k_idx, max_k in enumerate(self.grid_k):

                feat_idx_subset = feat_idx[0:max_k]
                G_fs_train = G0_train.take(feat_idx_subset, axis=1)
                G_fs_test = G0_test.take(feat_idx_subset, axis=1)

                # normalize to sum(diag)=N
                norm_factor = 1. / np.sqrt(
                    (G_fs_train**2).sum() / float(G_fs_train.shape[0]))

                G_fs_train *= norm_factor
                G_fs_test *= norm_factor

                G_fs_train.flags.writeable = False
                G_fs_test.flags.writeable = False

                # asserts
                if self.debug:
                    norm_factor_check = 1.0 / np.sqrt(max_k)
                    np.testing.assert_array_almost_equal(norm_factor,
                                                         norm_factor_check,
                                                         decimal=1)
                    np.testing.assert_almost_equal(
                        sum(np.diag(G_fs_train.dot(G_fs_train.T))),
                        G_fs_train.shape[0])

                logging.info("k: %i" % (max_k))

                # use LMM
                from fastlmm.inference.lmm_cov import LMM as fastLMM

                if G_bg_train.shape[1] <= G_bg_train.shape[0]:
                    lmm = fastLMM(X=cov_train,
                                  Y=y_train[:, np.newaxis],
                                  G=G_bg_train)
                else:
                    lmm = fastLMM(X=cov_train,
                                  Y=y_train[:, np.newaxis],
                                  K=K_bg_train)

                W = G_fs_train.copy()
                UGup, UUGup = lmm.rotate(W)

                i_up = np.zeros((G_fs_train.shape[1]), dtype=bool)
                i_G1 = np.ones((G_fs_train.shape[1]), dtype=bool)
                t0 = time.time()
                res = lmm.findH2_2K(nGridH2=10,
                                    minH2=0.0,
                                    maxH2=0.99999,
                                    i_up=i_up,
                                    i_G1=i_G1,
                                    UW=UGup,
                                    UUW=UUGup)
                logging.info("time taken for k=%i: %s" %
                             (max_k, str(time.time() - t0)))

                # recover a2 from alternate parameterization
                a2 = res["h2_1"] / float(res["h2"] + res["h2_1"])
                h2 = res["h2"] + res["h2_1"]
                delta = (1 - h2) / h2
                #res_cov = res

                # do final prediction using lmm.py
                from fastlmm.inference import LMM
                lmm = LMM(forcefullrank=False)
                lmm.setG(G0=G_bg_train, G1=G_fs_train, a2=a2)
                lmm.setX(cov_train)
                lmm.sety(y_train)

                # we take an additional step to estimate betas on covariates (not given from new model)
                res = lmm.nLLeval(delta=delta, REML=True)

                # predict on test set
                lmm.setTestData(Xstar=cov_test,
                                G0star=G_bg_test,
                                G1star=G_fs_test)
                out = lmm.predictMean(beta=res["beta"], delta=delta)

                mse = mean_squared_error(y_test, out)
                logging.info("mse: %f" % (mse))

                self.mse[kfold_idx, k_idx] = mse

                self.mixes[kfold_idx, k_idx] = a2
                self.deltas[kfold_idx, k_idx] = delta

                if self.measure != "mse":
                    K_test_test = a2 * G_fs_test.dot(
                        G_fs_test.T) + (1.0 - a2) * K_bg_test
                    ll = lmm.nLLeval_test(y_test,
                                          res["beta"],
                                          sigma2=res["sigma2"],
                                          delta=delta,
                                          Kstar_star=K_test_test,
                                          robust=True)

                    if self.debug:
                        ll2 = lmm.nLLeval_test(y_test,
                                               res["beta"],
                                               sigma2=res["sigma2"],
                                               delta=delta,
                                               Kstar_star=None,
                                               robust=True)
                        np.testing.assert_almost_equal(ll, ll2, decimal=4)

                    logging.info("ll: %f" % (ll))
                    self.ll[kfold_idx, k_idx] = ll

            logging.info("time taken for fold: %s" % str(time.time() - t0))

        best_k, best_mix, best_delta = self.select_best_k()

        logging.info("best_k: %i, best_mix: %f, best_delta: %f" %
                     (best_k, best_mix, best_delta))

        # final scan
        if self.order_by_lmm:
            logging.info("final scan using LMM")
            gwas = FastGwas(G_bg,
                            G0,
                            y,
                            delta=None,
                            train_pcs=None,
                            mixing=0.0,
                            cov=cov)
            gwas.run_gwas()
            _pval = gwas.p_values
            feat_idx = np.argsort(_pval)[0:best_k]
        else:
            logging.info("final scan using LR")
            _F, _pval = lin_reg.f_regression_block(
                lin_reg.f_regression_cov_alt, G0, y, C=cov, blocksize=10000)

        logging.info("number of snps selected: %i" % (best_k))

        return best_k, feat_idx, best_mix, best_delta
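
findH2_2K returns the two-kernel fit in the (h2, h2_1) parameterization; run_select converts it back to the mixing weight a2, the total heritability, and the delta that lmm.py expects. A short sketch of that conversion and a sanity check that it inverts the forward mapping (the component values are hypothetical):

import numpy as np

# hypothetical two-kernel variance components as returned by findH2_2K
res = {"h2": 0.25, "h2_1": 0.15}

a2 = res["h2_1"] / float(res["h2"] + res["h2_1"])   # weight on the selected-SNP kernel
h2 = res["h2"] + res["h2_1"]                        # total genetic variance fraction
delta = (1 - h2) / h2                               # noise-to-genetic ratio for lmm.py

# forward mapping: splitting total h2 by a2 recovers the original components
np.testing.assert_allclose([h2 * (1 - a2), h2 * a2], [res["h2"], res["h2_1"]])
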
Example #10
    def run_select(self, G0, G_bg, y, cov=None):
        """set up two kernel feature selection
    
        Parameters
        ----------
        G0 : numpy array of shape (num_ind, num_snps)
            Data matrix from which foreground snps will be selected

        G_bg : numpy array of shape (num_ind, num_snps)
            Data matrix containing background snps on which the selection will be conditioned

        y : numpy vector of shape (num_ind, )
            Vector of phenotypes

        cov : numpy array of shape (num_ind, num_covariates) or None
            Covariates to be used as fixed effects

        Returns
        -------
        best_k, feat_idx, best_mix, best_delta: tuple(int, np.array(int), float, float)
            best_k is the best number of SNPs selected,
            feat_idx is a np.array of integers denoting the indices of these snps,
            best_mix is the best mixing coefficient between foreground and background kernel,
            best_delta is the best regularization coefficient
        """

        num_ind = len(y)

        if cov is None:
            cov = np.ones((num_ind,1))
        else:
            logging.info("normalizing covariates")
            cov = cov.copy()
            cov = 1./np.sqrt((cov**2).sum() / float(cov.shape[0])) * cov
        cov.flags.writeable = False
        
        # normalize to diag(K) = N
        norm_factor = 1./np.sqrt((G_bg**2).sum() / float(G_bg.shape[0]))

        # we copy in case G and G_bg are pointing to the same object
        G_bg = norm_factor * G_bg
       
        K_bg_full = G_bg.dot(G_bg.T)
        K_bg_full.flags.writeable = False
        
        # some asserts
        np.testing.assert_almost_equal(sum(np.diag(K_bg_full)), G_bg.shape[0])
        if self.debug:
            norm_factor_check = 1./np.sqrt(G_bg.shape[1])
            np.testing.assert_array_almost_equal(norm_factor, norm_factor_check, decimal=1)
            

        for kfold_idx, (train_idx, test_idx) in enumerate(KFold(num_ind, n_folds=self.n_folds, random_state=self.random_state, shuffle=True)):

            t0 = time.time()
            logging.info("running fold: %i" % kfold_idx)

            y_train = y.take(train_idx, axis=0)
            y_test = y.take(test_idx, axis=0)
            G0_train = G0.take(train_idx, axis=0)
            G0_test = G0.take(test_idx, axis=0)

            G_bg_train = G_bg.take(train_idx, axis=0)
            G_bg_test = G_bg.take(test_idx, axis=0)

            cov_train = cov.take(train_idx, axis=0)
            cov_test = cov.take(test_idx, axis=0)

            # write protect data
            y_train.flags.writeable = False
            y_test.flags.writeable = False
            G0_train.flags.writeable = False
            G0_test.flags.writeable = False
            G_bg_train.flags.writeable = False
            G_bg_test.flags.writeable = False
            cov_train.flags.writeable = False
            cov_test.flags.writeable = False

            # precompute background kernel
            K_bg_train = K_bg_full.take(train_idx, axis=0).take(train_idx, axis=1) 
            K_bg_train.flags.writeable = False

            if self.measure != "mse":
                K_bg_test = K_bg_full.take(test_idx, axis=0).take(test_idx, axis=1)
                K_bg_test.flags.writeable = False

            # rank features
            if self.order_by_lmm:
                logging.info("using linear mixed model to rank features")
                t0 = time.time()
                gwas = FastGwas(G_bg_train, G0_train, y_train, delta=None, train_pcs=None, mixing=0.0, cov=cov_train)
                gwas.run_gwas()
                _pval = gwas.p_values
                logging.info("time taken: %s" % (str(time.time()-t0)))
            else:
                logging.info("using linear regression to rank features")
                _F,_pval = lin_reg.f_regression_block(lin_reg.f_regression_cov_alt, G0_train, y_train, blocksize=10000, C=cov_train)

            feat_idx = np.argsort(_pval)
            
            for k_idx, max_k in enumerate(self.grid_k):

                feat_idx_subset = feat_idx[0:max_k]
                G_fs_train = G0_train.take(feat_idx_subset, axis=1)
                G_fs_test = G0_test.take(feat_idx_subset, axis=1)

                # normalize to sum(diag)=N
                norm_factor = 1./np.sqrt((G_fs_train**2).sum() / float(G_fs_train.shape[0]))

                G_fs_train *= norm_factor
                G_fs_test *= norm_factor
                                
                G_fs_train.flags.writeable = False
                G_fs_test.flags.writeable = False

                # asserts
                if self.debug:
                    norm_factor_check = 1.0 / np.sqrt(max_k)
                    np.testing.assert_array_almost_equal(norm_factor, norm_factor_check, decimal=1)
                    np.testing.assert_almost_equal(sum(np.diag(G_fs_train.dot(G_fs_train.T))), G_fs_train.shape[0])

                logging.info("k: %i" % (max_k))

                # use LMM
                from fastlmm.inference.lmm_cov import LMM as fastLMM

                if G_bg_train.shape[1] <= G_bg_train.shape[0]:
                    lmm = fastLMM(X=cov_train, Y=y_train[:,np.newaxis], G=G_bg_train)
                else:
                    lmm = fastLMM(X=cov_train, Y=y_train[:,np.newaxis], K=K_bg_train)

                W = G_fs_train.copy()
                UGup,UUGup = lmm.rotate(W)
                
                i_up = np.zeros((G_fs_train.shape[1]), dtype=bool)
                i_G1 = np.ones((G_fs_train.shape[1]), dtype=bool)
                t0 = time.time()
                res = lmm.findH2_2K(nGridH2=10, minH2=0.0, maxH2=0.99999, i_up=i_up, i_G1=i_G1, UW=UGup, UUW=UUGup)
                logging.info("time taken for k=%i: %s" % (max_k, str(time.time()-t0)))
                
                # recover a2 from alternate parameterization
                a2 = res["h2_1"] / float(res["h2"] + res["h2_1"])
                h2 = res["h2"] + res["h2_1"]
                delta = (1-h2) / h2
                #res_cov = res


                # do final prediction using lmm.py
                from fastlmm.inference import LMM
                lmm = LMM(forcefullrank=False)
                lmm.setG(G0=G_bg_train, G1=G_fs_train, a2=a2)
                lmm.setX(cov_train)
                lmm.sety(y_train)

                # we take an additional step to estimate betas on covariates (not given from new model)
                res = lmm.nLLeval(delta=delta, REML=True)
                
                # predict on test set
                lmm.setTestData(Xstar=cov_test, G0star=G_bg_test, G1star=G_fs_test)
                out = lmm.predictMean(beta=res["beta"], delta=delta)

                mse = mean_squared_error(y_test, out)
                logging.info("mse: %f" % (mse))

                self.mse[kfold_idx, k_idx] = mse

                self.mixes[kfold_idx, k_idx] = a2
                self.deltas[kfold_idx, k_idx] = delta

                if self.measure != "mse":
                    K_test_test = a2 * G_fs_test.dot(G_fs_test.T) + (1.0-a2) * K_bg_test 
                    ll = lmm.nLLeval_test(y_test, res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=K_test_test, robust=True)

                    if self.debug:
                        ll2 = lmm.nLLeval_test(y_test, res["beta"], sigma2=res["sigma2"], delta=delta, Kstar_star=None, robust=True)
                        np.testing.assert_almost_equal(ll, ll2, decimal=4)

                    logging.info("ll: %f" % (ll))
                    self.ll[kfold_idx, k_idx]  = ll
                    

            logging.info("time taken for fold: %s" % str(time.time()-t0))
        

        best_k, best_mix, best_delta = self.select_best_k()

        logging.info("best_k: %i, best_mix: %f, best_delta: %f" % (best_k, best_mix, best_delta))

        # final scan 
        if self.order_by_lmm:
            logging.info("final scan using LMM")
            gwas = FastGwas(G_bg, G0, y, delta=None, train_pcs=None, mixing=0.0, cov=cov)
            gwas.run_gwas()
            _pval = gwas.p_values
            feat_idx = np.argsort(_pval)[0:best_k]
        else:
            logging.info("final scan using LR")
            _F,_pval = lin_reg.f_regression_block(lin_reg.f_regression_cov_alt, G0, y, C=cov, blocksize=10000)
        
        logging.info("number of snps selected: %i" % (best_k))

        return best_k, feat_idx, best_mix, best_delta
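
Both run_select variants carve each cross-validation fold out with .take and then freeze every array with flags.writeable = False, so an accidental in-place modification inside a fold raises immediately instead of silently corrupting later folds. A minimal sketch of that pattern; note it uses sklearn's modern split-based KFold API, not the older index-iterable constructor called in the examples above:

import numpy as np
from sklearn.model_selection import KFold   # modern API; the code above uses the older one

rng = np.random.RandomState(3)
G0 = rng.randn(12, 6)    # stand-in genotype matrix
y = rng.randn(12)

for fold_idx, (train_idx, test_idx) in enumerate(
        KFold(n_splits=3, shuffle=True, random_state=0).split(G0)):
    G0_train = G0.take(train_idx, axis=0)    # .take copies, so freezing is safe
    y_train = y.take(train_idx, axis=0)
    G0_train.flags.writeable = False
    y_train.flags.writeable = False
    try:
        G0_train[0, 0] = 1.0                 # any in-place write now fails loudly
    except ValueError as e:
        print("fold %i: %s" % (fold_idx, e))
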
Example #11
    def _getScalesPairwise(self,verbose=False, initDiagonal=False):
        """
        Internal function for parameter initialization
        Uses a single trait model for initializing variances and
        a pairwise model to initialize correlations
        """
        var = sp.zeros((self.P,2))

        if initDiagonal:
            #1. fit single trait model
            if verbose:
                print('.. fit single-trait model for initialization')
            vc = VarianceDecomposition(self.Y[:,0:1])
            for term_i in range(self.n_randEffs):
                if term_i==self.noisPos:
                    vc.addRandomEffect(is_noise=True)
                else:
                    K = self.vd.getTerm(term_i).getK()
                    vc.addRandomEffect(K=K)
            scales0 = sp.sqrt(0.5)*sp.ones(2)

            for p in range(self.P):
                if verbose: print(('   .. trait %d' % p))
                vc.setY(self.Y[:,p:p+1])
                conv = vc.optimize(scales0=scales0)
                if not conv:
                    print('warning initialization not converged')
                var[p,:] = vc.getVarianceComps()[0,:]

        elif fastlmm_present:
            if verbose:
                print('.. fit single-trait model for initialization (using fastlmm)')
            for p in range(self.P):
                if verbose: print(('   .. trait %d' % p))
                covariates = None
                for term_i in range(self.n_randEffs):
                    if term_i==self.noisPos:
                        pass
                    else:
                        K = self.vd.getTerm(term_i).getK()
                varY = sp.var(self.Y[:,p:p+1])
                lmm = fastLMM(X=covariates, Y=self.Y[:,p:p+1], G=None, K=K)
                opt = lmm.findH2(nGridH2=100)
                h2 = opt['h2']
                var[p,:] = h2 * varY
                var[p,self.noisPos] = (1.0-h2) * varY
                #import ipdb; ipdb.set_trace()
        else:
            if verbose:
                print('.. random initialization of diagonal')
            var = sp.random.randn(var.shape[0],var.shape[1])
            var = var*var + 0.001
        #2. fit pairwise model
        if verbose:
            print('.. fit pairwise model for initialization')
        vc = VarianceDecomposition(self.Y[:,0:2])
        for term_i in range(self.n_randEffs):
            if term_i==self.noisPos:
                vc.addRandomEffect(is_noise=True,trait_covar_type='freeform')
            else:
                K = self.vd.getTerm(term_i).getK()
                vc.addRandomEffect(K=K,trait_covar_type='freeform')
        rho_g = sp.ones((self.P,self.P))
        rho_n = sp.ones((self.P,self.P))
        for p1 in range(self.P):
            for p2 in range(p1):
                if verbose:
                    print(('   .. fit pair (%d,%d)'%(p1,p2)))
                vc.setY(self.Y[:,[p1,p2]])
                scales0 = sp.sqrt(sp.array([var[p1,0],1e-4,var[p2,0],1e-4,var[p1,1],1e-4,var[p2,1],1e-4]))
                conv = vc.optimize(scales0=scales0)
                if not conv:
                    print('warning initialization not converged')
                Cg = vc.getTraitCovar(0)
                Cn = vc.getTraitCovar(1)
                rho_g[p1,p2] = Cg[0,1]/sp.sqrt(Cg.diagonal().prod())
                rho_n[p1,p2] = Cn[0,1]/sp.sqrt(Cn.diagonal().prod())
                rho_g[p2,p1] = rho_g[p1,p2]
                rho_n[p2,p1] = rho_n[p1,p2]
        #3. init
        Cg0 = rho_g*sp.dot(sp.sqrt(var[:,0:1]),sp.sqrt(var[:,0:1].T))
        Cn0 = rho_n*sp.dot(sp.sqrt(var[:,1:2]),sp.sqrt(var[:,1:2].T))
        offset_g = abs(sp.minimum(sp.linalg.eigh(Cg0)[0].min(),0))+1e-4
        offset_n = abs(sp.minimum(sp.linalg.eigh(Cn0)[0].min(),0))+1e-4
        Cg0+=offset_g*sp.eye(self.P)
        Cn0+=offset_n*sp.eye(self.P)
        Lg = sp.linalg.cholesky(Cg0)
        Ln = sp.linalg.cholesky(Cn0)
        Cg_params0 = sp.concatenate([Lg[:,p][:p+1] for p in range(self.P)])
        Cn_params0 = sp.concatenate([Ln[:,p][:p+1] for p in range(self.P)])
        scales0 = sp.concatenate([Cg_params0,1e-2*sp.ones(1),Cn_params0,1e-2*sp.ones(1)])

        return scales0
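
The last step of _getScalesPairwise flattens each Cholesky factor into the 'freeform' parameter vector by stacking, for column p, its first p + 1 entries. A short sketch of that packing and the corresponding unpacking back into a triangular factor, using scipy's default upper-triangular Cholesky as in the code above (the covariance matrix is invented for illustration):

import numpy as np
import scipy.linalg as la

P = 3
A = np.array([[2.0, 0.5, 0.2],
              [0.5, 1.5, 0.3],
              [0.2, 0.3, 1.0]])   # hypothetical positive-definite trait covariance

U = la.cholesky(A)                # upper-triangular factor (scipy default), A = U.T @ U
params = np.concatenate([U[:, p][:p + 1] for p in range(P)])   # freeform packing, as above
assert params.size == P * (P + 1) // 2

# unpack column by column and rebuild the covariance
U_back = np.zeros((P, P))
start = 0
for p in range(P):
    U_back[:p + 1, p] = params[start:start + p + 1]
    start += p + 1
np.testing.assert_allclose(U_back.T @ U_back, A)
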
Example #12
def _internal_single(
        G0_standardized,
        test_snps,
        pheno,
        covar,
        G1_standardized,
        mixing,  #!!test mixing and G1
        h2,
        log_delta,
        cache_file,
        interact_with_snp=None):

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
    if log_delta is not None:
        h2 = 1.0 / (np.exp(log_delta) + 1)

    covar = np.hstack((covar['vals'], np.ones(
        (test_snps.iid_count, 1))))  #We always add 1's to the end.
    y = pheno['vals']

    from pysnptools.standardizer import DiagKtoN

    assert mixing is None or 0.0 <= mixing <= 1.0

    if cache_file is not None and os.path.exists(cache_file):
        lmm = fastLMM(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data:  #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
    else:
        # combine two kernels (normalize kernels to diag(K)=N)
        G0_standardized_val = DiagKtoN(
            G0_standardized.val.shape[0]).standardize(G0_standardized.val)
        G1_standardized_val = DiagKtoN(
            G1_standardized.val.shape[0]).standardize(G1_standardized.val)

        if mixing == 0.0 or G1_standardized.sid_count == 0:
            G = G0_standardized.val
        elif mixing == 1.0 or G0_standardized.sid_count == 0:
            G = G1_standardized.val
        else:
            G = np.empty(
                (G0_standardized.iid_count,
                 G0_standardized.sid_count + G1_standardized.sid_count))
            if mixing is None:
                mixing, h2 = _find_mixing(G, covar, G0_standardized_val,
                                          G1_standardized_val, h2, y)
            _mix(G, G0_standardized_val, G1_standardized_val, mixing)

        #TODO: make sure low-rank case is handled correctly
        lmm = fastLMM(X=covar, Y=y, G=G, K=None, inplace=True)

    if h2 is None:
        result = lmm.findH2()
        h2 = result['h2']
    logging.info("h2={0}".format(h2))

    snps_read = test_snps.read().standardize()

    if interact_with_snp is not None:
        print "interaction with %i" % interact_with_snp
        interact = covar[:, interact_with_snp]
        interact -= interact.mean()
        interact /= interact.std()
        variables_to_test = snps_read.val * interact[:, np.newaxis]
    else:
        variables_to_test = snps_read.val
    res = lmm.nLLeval(h2=h2,
                      dof=None,
                      scale=1.0,
                      penalty=0.0,
                      snps=variables_to_test)

    if cache_file is not None and not os.path.exists(cache_file):
        pstutil.create_directory_if_necessary(cache_file)
        np.savez(
            cache_file, lmm.U, lmm.S
        )  #using np.savez instead of pickle because it seems to be faster to read and write

    beta = res['beta']

    chi2stats = beta * beta / res['variance_beta']
    #p_values = stats.chi2.sf(chi2stats,1)[:,0]
    if G0_standardized is not None:
        assert G.shape[0] == lmm.U.shape[0]
    p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0] - 3)[:, 0]  # lmm.U.shape[0] is the number of individuals; 3 is the number of fixed effects (covariates + SNP)

    items = [('SNP', snps_read.sid), ('Chr', snps_read.pos[:, 0]),
             ('GenDist', snps_read.pos[:, 1]), ('ChrPos', snps_read.pos[:, 2]),
             ('PValue', p_values), ('SnpWeight', beta[:, 0]),
             ('SnpWeightSE', np.sqrt(res['variance_beta'][:, 0])),
             ('SnpFractVarExpl',
              np.sqrt(res['fraction_variance_explained_beta'][:, 0])),
             ('Nullh2', np.zeros((snps_read.sid_count)) + h2)]
    frame = pd.DataFrame.from_items(items)

    return frame
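
Example #12 adds an interaction scan: one covariate column is standardized and multiplied elementwise into every test SNP, so the tested variable is the SNP-by-covariate interaction rather than the SNP itself. A small numpy sketch of building those interaction variables (all arrays and the column index are stand-ins):

import numpy as np

rng = np.random.RandomState(4)
covar = rng.randn(10, 3)           # stand-in covariate matrix
snps = rng.randn(10, 5)            # stand-in standardized test SNPs
interact_with_snp = 1              # hypothetical index of the interacting covariate

interact = covar[:, interact_with_snp].copy()
interact -= interact.mean()        # center
interact /= interact.std()         # scale to unit variance
variables_to_test = snps * interact[:, np.newaxis]   # SNP-by-covariate interaction terms
print(variables_to_test.shape)     # same shape as the SNP matrix
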