Example #1
def TESTBEFOREUSING_pca_covariates(bedfile, npc, outfile=None):
    '''
    Read in a bed file and compute PC covariates (a la Eigenstrat).
    bedfile should include the .bed extension.
    Returns pccov, U, S.
    '''
    import os.path
    import time
    import numpy as np
    import scipy.linalg as la
    from pysnptools.snpreader import Bed  # Bed was never imported in the original; pysnptools is assumed here

    root, ext = os.path.splitext(bedfile)
    print("reading in bed file...")

    t0 = time.time()
    SNPs = Bed(root).read().standardize()
    snps = SNPs.val  # N x M matrix of standardized genotypes
    t1 = time.time()
    print("Elapsed time %.2f seconds" % (t1 - t0))

    N, M = snps.shape
    print("found nind=%d, nsnp=%d" % (N, M))
    if M < N:
        print("only doing full rank, should use low rank here")

    # not needed, and in practice makes no difference:
    # mean-center the individuals
    # snps = snps - snps.mean(axis=0)

    print("computing kernel...")
    t0 = time.time()
    K = np.dot(snps, snps.T)  # N x N genetic similarity kernel
    t1 = time.time()
    print("Elapsed time %.2f seconds" % (t1 - t0))
    snps = None  # free the genotype matrix

    t0 = time.time()
    print("computing svd...")
    U, S, V = la.svd(K, full_matrices=False)
    t1 = time.time()
    print("Elapsed time %.2f seconds" % (t1 - t0))

    S = np.sqrt(S)  # singular values of snps = sqrt of eigenvalues of K
    # UShalf = np.dot(U, np.diag(S)) would be expensive; broadcasting scales
    # each column of U by the corresponding entry of S instead.
    UShalf = U * S
    pccov = UShalf[:, 0:npc]  # top-npc PC covariates
    print("done.")
    if outfile is not None:
        import fastlmm.util.util as ut
        ut.write_plink_covariates(SNPs.iid, pccov, outfile)
    return pccov, U, S
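
For context, here is a minimal, self-contained sketch of the same PCA-covariate computation on a random matrix, without fastlmm or pysnptools; the random data, shapes, and npc value are assumptions made only for illustration. It also checks that the scores obtained from the kernel SVD match those from an SVD of the data matrix itself (up to column sign):

# Minimal sketch (illustration only): PC covariates via the N x N kernel,
# checked against an SVD of the data matrix. Data here is random, not real genotypes.
import numpy as np
import scipy.linalg as la

rng = np.random.default_rng(0)
N, M, npc = 100, 500, 10
snps = rng.standard_normal((N, M))       # stand-in for standardized genotypes

K = snps @ snps.T                        # N x N kernel, as in the function above
U, S, _ = la.svd(K, full_matrices=False)
pccov = U[:, :npc] * np.sqrt(S[:npc])    # eigenvalues of K are squared singular values of snps

# Equivalent scores straight from the data matrix (columns may differ only in sign):
Ud, Sd, _ = la.svd(snps, full_matrices=False)
assert np.allclose(np.abs(pccov), np.abs(Ud[:, :npc] * Sd[:npc]))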
Example #2
    def linreg_entire_dataset(self, output_prefix):
        self.run_once()

        # We do a little extra work here: write the PCs out now, because they are
        # known at this point and this method is only run once, even on big cluster jobs.
        if output_prefix is not None and self.num_pcs > 0:
            filename = self.pcs_filename(output_prefix)
            import fastlmm.util.util as ut
            ut.write_plink_covariates(self.ind_iid, self.pcs, filename)

        logging.info("performing final scan through entire data set")
        tt0 = time.time()

        # single-SNP linear regression over the whole data set
        _F, _pval = self.G.lin_reg(self.y, self.X)

        # sort SNPs by ascending p-value and pair the sorted p-values with their ids
        feat_idx = np.argsort(_pval)
        linreg_results = (_pval[feat_idx], self.sid[feat_idx])

        logging.info("fin_scan time %.2f s" % (time.time() - tt0))

        return linreg_results
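
The bookkeeping at the end of linreg_entire_dataset simply pairs the p-values, sorted ascending, with the corresponding SNP ids. A tiny standalone sketch of just that step, with made-up p-values and SNP ids:

# Illustration only: made-up p-values and SNP ids
import numpy as np

pval = np.array([0.20, 0.001, 0.05])
sid = np.array(["rs1", "rs2", "rs3"])

feat_idx = np.argsort(pval)                 # indices ordered by ascending p-value
results = (pval[feat_idx], sid[feat_idx])   # ([0.001, 0.05, 0.2], ['rs2', 'rs3', 'rs1'])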