Esempio n. 1
0
def run_lmm(use_kinship, peer_cov, Xc, Y, cov, K):
    """Run ``QTL.test_lmm`` of phenotype ``Y`` on genotype ``Xc``.

    Two string flags select the model:

    * ``use_kinship == 'y'`` uses ``K`` as the kinship matrix; any other
      value substitutes an identity matrix of size ``Xc.shape[0]``.
    * ``peer_cov == 'n'`` passes ``cov`` as covariates; any other value
      leaves them out (they are then taken to have been used by PEER).

    The call about to be made is written to stderr first. The object
    returned by ``QTL.test_lmm`` is returned unchanged.
    """
    with_kinship = use_kinship == 'y'
    with_covs = peer_cov == 'n'

    # The messages are kept verbatim, including the missing space in "lmm=".
    if with_kinship and with_covs:
        banner = '\nrunning lmm = QTL.test_lmm(Xc,Y,covs=cov,K=K)...'
    elif with_kinship:
        banner = '\nrunning lmm = QTL.test_lmm(Xc,Y,K=K)...'
    elif with_covs:
        banner = '\nrunning lmm= QTL.test_lmm(Xc,Y,covs=cov,K=SP.eye(Xc.shape[0]))...'
    else:
        banner = '\nrunning lmm = QTL.test_lmm(Xc,Y,K=SP.eye(Xc.shape[0]))...'
    sys.stderr.write(banner)

    kinship = K if with_kinship else SP.eye(Xc.shape[0])
    if with_covs:
        return QTL.test_lmm(Xc, Y, covs=cov, K=kinship)
    return QTL.test_lmm(Xc, Y, K=kinship)
Esempio n. 2
0
def run_lmm(ts, RNA, COV, i, transK, out_csv):
    """Scan one phenotype column with and without a kinship term; save p-values.

    Column ``i`` of ``RNA`` is tested against ``ts`` twice through
    ``qtl.test_lmm``, both times with ``COV`` as covariates: first with
    ``K=transK`` (output column ``lmm``), then without a ``K`` argument
    (output column ``lm``). The two p-value columns, one row per column of
    ``ts``, are written to ``out_csv`` as tab-separated ``%.6e`` values.
    Progress is reported at DEBUG level on the root logger. Returns None.
    """
    log = logging.getLogger()
    pheno_col = RNA[:, i]

    def pvalue_column(fit, label):
        # Single-column frame of p-values, indexed 0..ts.shape[1]-1.
        return pd.DataFrame(data=fit.getPv().T,
                            index=range(0, ts.shape[1]),
                            columns=[label])

    # Mixed model: kinship passed as K.
    log.debug('Start lmm')
    mixed_fit = qtl.test_lmm(snps=ts, pheno=pheno_col, K=transK, covs=COV,
                             verbose=True)
    log.debug('Done lmm')
    mixed_pv = pvalue_column(mixed_fit, 'lmm')

    # Linear regression model: the same call without K.
    log.debug('Start lm')
    plain_fit = qtl.test_lmm(snps=ts, pheno=pheno_col, covs=COV, verbose=True)
    log.debug('Done lm')
    plain_pv = pvalue_column(plain_fit, 'lm')

    log.debug('export')
    combined = pd.concat([mixed_pv, plain_pv], axis=1)
    np.savetxt(out_csv, combined, delimiter='\t', fmt='%.6e')
def run_lmm(ts,RNA,COV,i,transK,out_csv):
    """Write kinship-aware and kinship-free p-values for phenotype column ``i``.

    ``RNA[:, i]`` is tested against ``ts`` with ``qtl.test_lmm`` twice, each
    time with ``COV`` as covariates: once with ``K=transK`` (column ``lmm``)
    and once without ``K`` (column ``lm``). The resulting table, one row per
    column of ``ts``, is saved to ``out_csv`` tab-separated as ``%.6e``.
    """
    pheno_col = RNA[:, i]

    # (output column, extra keyword arguments) for the two scans, in run order.
    scans = (('lmm', {'K': transK}), ('lm', {}))

    columns = []
    for label, extra in scans:
        fit = qtl.test_lmm(snps=ts, pheno=pheno_col, covs=COV, **extra)
        columns.append(pd.DataFrame(data=fit.getPv().T,
                                    index=range(0, ts.shape[1]),
                                    columns=[label]))

    np.savetxt(out_csv, pd.concat(columns, axis=1), delimiter='\t', fmt='%.6e')
Esempio n. 4
0
def test_lmm_lr(G, y, Z, Kbg, Covs=None):
    """
    low-rank lmm

    Fits a variance decomposition of y with two random effects (Kbg and the
    linear kernel computed from Z) plus a noise term, and Covs as fixed effect
    when given. The SNPs in G are then tested with qtl.test_lmm, using the
    variance-weighted sum of the two kernels as K.

    input:
    G    :   genotypes
    y    :   phenotype
    Z    :   features of low-rank matrix
    Kbg  :   background covariance matrix
    Covs :   fixed effect covariates (optional)

    output (tuple):
    pv        :   first row of the scan's p-values
    beta      :   first row of the scan's SNP effect sizes
    var_snps  :   beta**2 times the per-column variance of G
    var_covs  :   variance of dot(Covs, fitted weights), repeated once per
                  SNP (all zeros when Covs is None)
    var_genes :   the variance component paired with the low-rank kernel,
                  repeated once per SNP
    """
    use_covs = Covs is not None

    decomp = varianceDecomposition.VarianceDecomposition(y)
    if use_covs:
        decomp.addFixedEffect(Covs)
    decomp.addRandomEffect(Kbg)
    K_lowrank = utils.computeLinearKernel(Z)
    decomp.addRandomEffect(K_lowrank)
    decomp.addRandomEffect(is_noise=True)
    decomp.optimize()

    components = decomp.getVarianceComps()[0]
    w_background, w_lowrank = components[0], components[1]

    scan = qtl.test_lmm(G, y, covs=Covs,
                        K=w_background * Kbg + w_lowrank * K_lowrank)
    pv = scan.getPv()[0]
    beta = scan.getBetaSNP()[0]

    n_snps = len(beta)
    var_snps = beta**2 * np.var(G, axis=0)
    var_genes = np.zeros(n_snps) + w_lowrank
    var_covs = np.zeros(n_snps)
    if use_covs:
        var_covs += np.dot(Covs, decomp.getWeights()).var()

    return pv, beta, var_snps, var_covs, var_genes
Esempio n. 5
0
def test_lmm_lr(G, y, Z, Kbg, Covs=None):
    """
    low-rank lmm

    Step 1: variance decomposition of y with random effects Kbg and the
            linear kernel of Z, a noise term, and Covs as fixed effect if
            supplied.
    Step 2: association scan of G against y with qtl.test_lmm, where K is the
            two kernels weighted by their estimated variance components.

    input:
    G    :   genotypes
    y    :   phenotype
    Z    :   features of low-rank matrix
    Kbg  :   background covariance matrix
    Covs :   fixed effect covariates

    returns pv, beta, var_snps, var_covs, var_genes (one entry per tested SNP)
    """
    # Step 1: estimate the variance components.
    vc_model = varianceDecomposition.VarianceDecomposition(y)
    if Covs is not None:
        vc_model.addFixedEffect(Covs)
    vc_model.addRandomEffect(Kbg)
    kernel_lr = utils.computeLinearKernel(Z)
    vc_model.addRandomEffect(kernel_lr)
    vc_model.addRandomEffect(is_noise=True)
    vc_model.optimize()
    estimated = vc_model.getVarianceComps()[0]

    # Step 2: scan with the combined covariance.
    combined_K = estimated[0] * Kbg + estimated[1] * kernel_lr
    assoc = qtl.test_lmm(G, y, covs=Covs, K=combined_K)
    pv, beta = assoc.getPv()[0], assoc.getBetaSNP()[0]

    # Per-SNP variance summaries.
    per_snp_zeros = np.zeros(len(beta))
    var_snps = beta**2 * np.var(G, axis=0)
    var_genes = per_snp_zeros + estimated[1]
    var_covs = np.zeros(len(beta))
    if Covs is not None:
        var_covs += np.dot(Covs, vc_model.getWeights()).var()

    return pv, beta, var_snps, var_covs, var_genes
Esempio n. 6
0
    def initial_scan(self,
                     startSnpIdx=0,
                     nSnps=np.inf,
                     memory_efficient=False):
        """
        First-pass association scan with a linear mixed model.

        Loads a block of SNPs from the genotype reader and tests it against
        the phenotypes with qtl.test_lmm, using self.K and self.Covs. The
        p-value matrix is also stored as self.assoc0_reader.

        input:
        startSnpIdx      : index of the first SNP of the block (default: 0)
        nSnps            : number of SNPs in the block; a non-finite value
                           (the default) means all of them. The block is
                           clipped at the last genotype row.
        memory_efficient : if True, phenotypes are tested one at a time
                           instead of in a single call (default: False)

        output:
        beta, pv         : matrices with one row per SNP and one column per
                           phenotype
        """
        n_features = self.genoreader.get_nrows()
        n_traits = self.phenoreader.get_nrows()
        # Call retained from the original code; its result is not used below.
        self.genoreader.get_ncols()

        wanted = nSnps if np.isfinite(nSnps) else n_features
        nSnps = min(wanted, n_features - startSnpIdx)
        snp_block = self.genoreader.loadSnpBlock(startSnpIdx, nSnps)

        if not memory_efficient:
            # All phenotypes in one call.
            all_traits = self.phenoreader.getMatrix()
            fit = qtl.test_lmm(snps=snp_block.T,
                               pheno=all_traits.T,
                               K=self.K,
                               covs=self.Covs)
            pv = fit.getPv().T
            beta = fit.getBetaSNP().T
        else:
            # One phenotype at a time, filling the result column by column.
            pv = np.zeros((nSnps, n_traits))
            beta = np.zeros((nSnps, n_traits))
            for trait in range(n_traits):
                single = self.phenoreader.getRows([trait]).T
                fit = qtl.test_lmm(snps=snp_block.T,
                                   pheno=single,
                                   K=self.K,
                                   covs=self.Covs)
                pv[:, trait] = fit.getPv()[0]
                beta[:, trait] = fit.getBetaSNP()[0]

        self.assoc0_reader = reader.MatrixReader(pv)

        return beta, pv
Esempio n. 7
0
def run_lmm_chunk(ts,RNA,COV,RNA_start,RNA_end,RNA_columns,transK,out_csv):
    """Run one mixed-model scan over a slice of phenotype columns and save it.

    Columns ``RNA_start:RNA_end`` of ``RNA`` are tested against ``ts`` in a
    single ``qtl.test_lmm`` call with ``K=transK`` and ``covs=COV``. The
    p-values are written to ``out_csv`` as a tab-separated table with a header
    row taken from ``RNA_columns`` and no index column; rows follow the
    columns of ``ts``. Progress is logged at DEBUG level on the root logger.
    """
    log = logging.getLogger()
    pheno_slice = RNA[:, RNA_start:RNA_end]

    # Mixed model
    log.debug('Start lmm')
    fit = qtl.test_lmm(snps=ts, pheno=pheno_slice, K=transK, covs=COV,
                       verbose=True)
    log.debug('Done lmm')

    table = pd.DataFrame(data=fit.getPv().T,
                         index=range(0, ts.shape[1]),
                         columns=RNA_columns)
    log.debug('Export')
    table.to_csv(out_csv, sep='\t', header=True, index=False)
Esempio n. 8
0
    def initial_scan(self, startSnpIdx=0, nSnps=np.inf, memory_efficient=False):
        """
        Initial LMM scan of a block of SNPs against all phenotypes.

        input:
        startSnpIdx      : index of first snp (default: 0)
        nSnps            : number of SNPs to use (default: infinite, i.e. all);
                           never more than the rows left after startSnpIdx
        memory_efficient : when True, phenotypes are processed sequentially
                           (one qtl.test_lmm call each) rather than all at once

        Side effect: the p-value matrix is wrapped in a reader.MatrixReader and
        stored on self.assoc0_reader.

        Returns (beta, pv), both shaped (number of SNPs, number of phenotypes).
        """
        rows_geno = self.genoreader.get_nrows()
        rows_pheno = self.phenoreader.get_nrows()
        # Kept from the original; the returned value is unused here.
        self.genoreader.get_ncols()

        if np.isfinite(nSnps):
            block_len = nSnps
        else:
            block_len = rows_geno
        block_len = min(block_len, rows_geno - startSnpIdx)

        genotypes = self.genoreader.loadSnpBlock(startSnpIdx, block_len)
        shared = dict(K=self.K, covs=self.Covs)

        if memory_efficient:
            pv = np.zeros((block_len, rows_pheno))
            beta = np.zeros((block_len, rows_pheno))
            for col in range(rows_pheno):
                one_pheno = self.phenoreader.getRows([col]).T
                result = qtl.test_lmm(snps=genotypes.T, pheno=one_pheno, **shared)
                pv[:, col] = result.getPv()[0]
                beta[:, col] = result.getBetaSNP()[0]
        else:
            pheno_matrix = self.phenoreader.getMatrix()
            result = qtl.test_lmm(snps=genotypes.T, pheno=pheno_matrix.T, **shared)
            pv = result.getPv().T
            beta = result.getBetaSNP().T

        self.assoc0_reader = reader.MatrixReader(pv)

        return beta, pv
Esempio n. 9
0
    def fitLMM(self,
               K=None,
               tech_noise=None,
               idx=None,
               i0=None,
               i1=None,
               verbose=False):
        """
        Test every gene for association with each selected gene using an LMM.

        For each gene index in ``idx`` the standardised expression of that
        gene is the phenotype and the full standardised expression matrix is
        passed as the "SNP" matrix to ``QTL.test_lmm``.

        Args:
            K:          random effect, or list of random effects, to be
                        considered in the analysis; if None, no random effect
                        is used. With several effects they are combined per
                        gene using the normalised variance components in
                        ``self.var``; genes whose variance decomposition did
                        not converge are fitted without random effect.
            tech_noise: not used by this method
            idx:        index or indices of the genes to analyse (int,
                        sequence or ndarray); default: genes i0..i1-1
            i0:         gene index from which the analysis starts
            i1:         gene index at which the analysis stops (exclusive)
            verbose:    if True, print progress
        Returns:
            pv:         matrix of p-values, shape (len(idx), self.G)
            beta:       matrix of effect sizes, same shape
            info:       dictionary annotating the rows of pv and beta:
                        gene_idx_row: index of the genes in rows
                        conv:         boolean vector marking genes for which
                                      the variance decomposition converged
                        gene_row:     gene IDs of the rows (empty strings if
                                      self.geneID is None)
        """
        assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition decomposition must be used prior to this method'

        if idx is None:
            if i0 is None or i1 is None:
                i0 = 0
                i1 = self.G
            idx = SP.arange(i0, i1)
        elif not isinstance(idx, SP.ndarray):
            # Accept a bare index as well as a list/tuple of indices. Wrapping
            # with SP.array([idx]) turned a list into a 2-D array.
            idx = SP.array(idx).reshape(-1)

        if K is not None and not isinstance(K, list):
            K = [K]

        lmm_params = {
            'covs': SP.ones([self.N, 1]),
            'NumIntervalsDeltaAlt': 100,
            'NumIntervalsDelta0': 100,
            'searchDelta': True
        }

        # NOTE: columns of self.Y with zero standard deviation become NaN/inf.
        Ystd = self.Y - self.Y.mean(0)
        Ystd /= self.Y.std(0)

        n_selected = idx.shape[0]
        beta = SP.zeros((n_selected, self.G))
        pv = SP.zeros((n_selected, self.G))
        var = self.var / self.var.sum(1)[:, SP.newaxis]
        # NOTE(review): var and var_info['conv'] are indexed by position in
        # idx, not by gene index -- presumably the variance decomposition was
        # run on the same idx; confirm against the caller.
        for count, ids in enumerate(idx):
            if verbose:
                print('.. fitting gene %d' % ids)
            if K is None:
                _K = None
            elif len(K) == 1:
                _K = K[0]
            elif self.var_info['conv'][count]:
                # Weighted sum of the random effects, scaled to unit mean
                # diagonal.
                _K = SP.sum([var[count, i] * K[i] for i in range(len(K))], 0)
                _K /= _K.diagonal().mean()
            else:
                _K = None
            # extract a single gene as phenotype
            lm = QTL.test_lmm(Ystd,
                              Ystd[:, ids:ids + 1],
                              K=_K,
                              verbose=False,
                              **lmm_params)
            pv[count, :] = lm.getPv()[0, :]
            beta[count, :] = lm.getBetaSNP()[0, :]

        if self.geneID is not None:
            # Build the label array in one go so the string dtype is sized to
            # the longest ID. An array pre-allocated with dtype=str holds
            # one-character strings and truncated every ID on assignment.
            geneID = SP.array([self.geneID[ids] for ids in idx], dtype=str)
        else:
            geneID = SP.zeros(n_selected, dtype=str)

        info = {
            'conv': self.var_info['conv'],
            'gene_idx_row': idx,
            'gene_row': geneID
        }

        return pv, beta, info
Esempio n. 10
0
 # Fragment of a per-gene loop body; the enclosing loop and function are not
 # part of this excerpt.
 # Select the SNPs lying strictly between min_pos and max_pos.
 snp_pos = info_df['pos'].values
 snp_idx = np.logical_and((snp_pos > min_pos), (snp_pos < max_pos))
 if not snp_idx.any():
     # no SNP in the window: nothing to test for this gene
     continue
 X_gene = X[:, snp_idx]
 if permute:
     # np.random.permutation shuffles along the first axis (rows) of X_gene
     X_gene = np.random.permutation(X_gene)
 # Copy every annotation column of the selected SNPs into the output dict.
 info_gene = info_df.iloc[snp_idx]
 for item in info_gene.columns:
     out_dict[item] = info_gene[item].values
 # one copy of the gene name per selected SNP
 assoc_gene = gene.repeat(snp_idx.sum())
 out_dict['assoc_gene'] = assoc_gene
 # Run the LMM analysis (with `design` as covariates when fit_design is set)
 print "   .. single trait analysis"
 if fit_design:
     lmm = QTL.test_lmm(X_gene, Y_gene, K=K, covs=design)
 else:
     lmm = QTL.test_lmm(X_gene, Y_gene, K=K)
 pv = lmm.getPv()
 pv[0][np.isnan(pv[0])] = 1.0  # set any NaN p-values to 1
 out_dict['pv'] = pv[0]
 out_dict['qv'] = FDR.qvalues(pv)[0]
 out_dict['beta'] = lmm.getBetaSNP()[0]
 # getLambda's result is repeated so every SNP row carries the same value
 lambda_val = getLambda(pv)
 lambda_val = lambda_val.repeat(len(out_dict['pv']))
 out_dict['lambda_val'] = lambda_val
 # NOTE(review): assumes info_df has a 'gdid' column (copied into out_dict by
 # the loop above) -- confirm.
 out_df = pd.DataFrame(out_dict, index=out_dict['gdid'])
 ## convert full stops in gene name to underscore
 gene = gene.replace(".", "_")
 ## append results for chunk to gene's results df to HDF5 file
 print "    ....appending results..."
Esempio n. 11
0
        # Fragment of a per-gene loop body; the enclosing loop and function
        # are not part of this excerpt.
        print '   .. Importing data'
        try:
            Xc, info = data.getGenotypes(gene, return_info=True)
        except:
            # NOTE(review): the bare except reports *any* failure of
            # getGenotypes as "no SNPs found in cis" and skips the gene.
            print 'Error: no SNPs found in cis'
            continue
        Y = data.getPhenotypes(gene, peer=opt.peer, gauss=True)
        # store the SNP annotation under <gene_group>/snp_info
        o = gene_group.create_group('snp_info')
        smartDumpDictHdf5(info, o)

        if opt.perm:
            # Permute the rows of Xc. The generator is re-seeded on every pass
            # when a seed is given, so the same seed yields the same row order
            # for matrices with the same number of rows.
            if opt.seed is not None:
                sp.random.seed(opt.seed)
            idxs = sp.random.permutation(Xc.shape[0])
            Xc = Xc[idxs, :]

        if 1:  # always true
            print "   .. single trait analysis"
            lmm = QTL.test_lmm(Xc, Y, K=K)
            pv = lmm.getPv()
            # collect the results and store them under <gene_group>/st
            RV = {}
            RV['pv'] = pv
            RV['qv'] = FDR.qvalues(pv)
            RV['beta'] = lmm.getBetaSNP()
            RV['lambda'] = getLambda(pv)
            o = gene_group.create_group('st')
            smartDumpDictHdf5(RV, o)

    # after the loop: close the output file
    fout.close()
def main():
    """Run a genome-wide LMM scan for a slice of phenotypes and plot the results.

    Expects exactly seven positional command-line arguments::

        geno_file pheno_file norm_mode panama_file RNA_start RNA_end out_dir

    ``norm_mode`` is one of ``'None'``, ``'RIN'`` or ``'boxcox'``;
    ``RNA_start``/``RNA_end`` are the integer slice bounds into the dataset's
    phenotype IDs; ``panama_file`` is an HDF5 file whose ``Ktot`` dataset is
    used as the sample covariance.

    Written below ``out_dir``:

    * ``logs/<RNA_start>-<RNA_end>.log``
    * ``results/lmm_pval/<phenotype>.txt`` -- one p-value column per phenotype
    * ``graphics/{phenohist,manhattan,snp_pheno,pval_hist,qqplot}/<phenotype>.png``

    Exits with status 1 when ``norm_mode`` is not recognised. A wrong number
    of arguments raises ValueError from the unpacking below.
    """
    geno_file,pheno_file,norm_mode,panama_file,RNA_start,RNA_end,out_dir = sys.argv[1:]

    RNA_start,RNA_end = int(RNA_start),int(RNA_end)
    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir,'logs'))
    logger = LoggerFactory.get_logger(os.path.join(log_dir,'%s-%s.log'%(RNA_start,RNA_end)),
                                      file_level=logging.DEBUG,console_level=logging.DEBUG)
    LoggerFactory.log_command(logger,sys.argv[1:])
    logger.info('Output directory: %s',out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir,'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir,'results'))

    logger.info('Loading genotype from %s',geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s',pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)
    logger.info('Loading sample relatedness from %s',panama_file)
    # [:] reads the whole dataset into memory, so the file can be closed
    # right away instead of staying open for the rest of the run.
    with h5py.File(panama_file,'r') as panama_f:
        Ktot = panama_f['Ktot'][:]

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader,pheno_reader=pheno_reader)

    # getting genotypes
    snps = dataset.getGenotypes() #SNPS
    position = dataset.getPos()
    position,chromBounds = data_util.estCumPos(position=position,offset=100000)

    logger.info('Subset phenotype to index %d-%d',RNA_start,RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes,sample_idx = dataset.getPhenotypes(phenotype_ID)
    logger.info('Normalization: %s',norm_mode)
    if norm_mode=='None':
        phenotype_vals = phenotypes.values
    elif norm_mode=='RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode=='boxcox':
        phenotype_vals,_ = preprocess.boxcox(phenotypes.values)
    else:
        # Stop here: continuing without phenotype_vals would only fail a few
        # lines further down with an UnboundLocalError.
        logger.error('Normalization mode %s is not recognized.  Exit',norm_mode)
        sys.exit(1)

    N = snps.shape[0] #number of individuals
    S = snps.shape[1] #number of SNPs
    P = phenotype_vals.shape[1]#number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N,S,P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(os.path.join(out_graphics_dir,'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir,'%s.png'%p_ID)
        fig = plt.figure(figsize=[3,3])#create the figure

        plot_normal(phenotype_vals[:,ip],alpha=0.8,figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # Only the mixed model is run; no fixed-effect covariates are passed.
    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),pheno=phenotype_vals,
                       K=Ktot,covs=None,verbose=True)
    #convert P-values to a DataFrame for nice output writing:
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T,index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lmm_pval_dir = make_sure_path_exists(os.path.join(out_results_dir,'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir,'%s.txt'%p_ID),
                                 header=True,index=False)

    # Genome-wide manhattan plots, one per phenotype:
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(os.path.join(out_graphics_dir,'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir,'%s.png'%p_ID)
        fig = plt.figure(figsize=[12,8])
        plot_manhattan(posCum=position['pos_cum'],pv=pvalues_lmm[p_ID].values,chromBounds=chromBounds,thr_plotting=0.05)
        plt.title('%s, LMM'%p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(os.path.join(out_graphics_dir,'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir,'%s.png'%p_ID)
        fig = plt.figure(figsize=[3,3])#create the figure

        # pick the SNP with the smallest LMM p-value for this phenotype
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        imax = lmm.pvalues[ip].argmin()
        lead_geno = snps[s_idx,imax]
        # plot jittered genotype vs. phenotype for that SNP
        plt.plot(lead_geno+0.05*np.random.randn(lead_geno.shape[0]),pheno_vals.values,'.',alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5,2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(os.path.join(out_graphics_dir,'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir,'%s.png'%p_ID)
        fig = plt.figure(figsize=[7,3])

        # NOTE(review): `normed` was replaced by `density` in newer
        # Matplotlib releases -- confirm against the installed version.
        plt.hist(pvalues_lmm[p_ID].values,20,normed=True)
        plt.plot([0,1],[1,1],"r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(os.path.join(out_graphics_dir,'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir,'%s.png'%p_ID)
        fig = plt.figure(figsize=[7,3])

        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)

        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
Esempio n. 13
0
# Standardise every column of ts, then use the cross product of its rows as
# the unscaled kinship matrix.
ts = (ts - ts.mean(axis=0)) / ts.std(axis=0)
transk = np.dot(ts, ts.T)

## Scaling Kinship matrix (from the Bjarni's scale_k())
n_rows = len(transk)
c = sp.sum(
    (sp.eye(n_rows) - (1.0 / n_rows) * sp.ones(transk.shape))
    * sp.array(transk)
)
scalar = (n_rows - 1) / c
transK = scalar * transk

# Keep only the first 500 rows of the phenotypes and genotypes, and the
# matching 500 x 500 corner of the kinship matrix.
RNA = RNA[:500, :]
ts = ts[:500, :]
transK = transK[:500, :500]

# One output file per phenotype column: p-values with kinship ('lmm') next to
# p-values without it ('lm').
for i in range(RNA.shape[1]):
    y = RNA[:, i]

    # Mixed model
    lmm = qtl.test_lmm(snps=ts, pheno=y, K=transK)
    pvalues_lmm = pd.DataFrame(
        data=lmm.getPv().T,
        index=range(0, ts.shape[1]),
        columns=['lmm'],
    )

    # Linear regression model
    lm = qtl.test_lmm(snps=ts, pheno=y)
    pvalues_lm = pd.DataFrame(
        data=lm.getPv().T,
        index=range(0, ts.shape[1]),
        columns=['lm'],
    )

    # Export
    pval = pd.concat([pvalues_lmm, pvalues_lm], axis=1)
    np.savetxt(Out + '/' + str(i) + '.csv', pval, delimiter=',', fmt='%.3e')



Esempio n. 14
0
	def fitLMM(self,expr = None,K=None,tech_noise=None,idx=None,i0=None,i1=None,verbose=False, recalc=True, standardize=True):
		"""
		Fit an LMM for each selected gene against the columns of expr.

		Args:
			K:				list of random effects to be considered in the analysis
							if K is none, it does not consider any random effect
			expr:			correlations are calculated between the gene expression data (self.Y)
							and the measures provided in expr. If None, self.Y is used
			tech_noise:		not used by this method
			idx:			indices of the genes to be considered in the analysis
							(genes with zero standard deviation are dropped)
			i0:				gene index from which the analysis starts
			i1:				gene index at which the analysis stops
			verbose:		if True, print progress
			recalc:			if True, re-do variance decomposition
			standardize:	if True, standardize also expr
		Returns:
			pv:				matrix of pvalues
			beta:			matrix of correlations
			info:			dictionary annotates pv and beta rows and columns, containing
							gene_idx_row:	index of the genes in rows
							conv:		boolean vector marking genes for which variance decomposition has converged
										(only when recalc is set and several random effects are given)
							gene_row:   annotate rows of matrices
		"""
		# None is tested with `is`: comparing an ndarray to None with ==/!= is
		# elementwise on current numpy and cannot be used as an `if` condition.
		if idx is None:
			if i0 is None or i1 is None:
				i0 = 0; i1 = self.G
			idx = SP.arange(i0,i1)
		elif not isinstance(idx, SP.ndarray):
			idx = SP.array(idx)
		idx = SP.intersect1d(idx,SP.where(self.Y.std(0)>0)[0]) #only makes sense if gene is expressed in at least one cell

		if K is not None:
			if not isinstance(K, list):
				K = [K]
			if recalc and (len(K)>1 or self.var is None):
				print('performing variance decomposition first...')
				var_raw,var_info = self.varianceDecomposition(K=K,idx=idx, cache=False)
				var = var_raw/var_raw.sum(1)[:,SP.newaxis]
			elif not recalc and len(K)>1:
				assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition decomposition must be used prior to this method'
				warnings.warn('scLVM:: recalc should only be set to False by advanced users: scLVM then assumes that the random effects are the same as those for which the variance decompostion was performed earlier.')
				var_raw = self.var
				var_info = self.var_info
				var = var_raw/var_raw.sum(1)[:,SP.newaxis]

		lmm_params = {'covs':SP.ones([self.N,1]),'NumIntervalsDeltaAlt':100,'NumIntervalsDelta0':100,'searchDelta':True}

		Yidx = self.Y[:,idx]
		Ystd = Yidx-Yidx.mean(0)
		Ystd/= Yidx.std(0) #delta optimization might be more efficient

		if expr is None:
			expr = Ystd
		elif standardize:
			exprStd = expr-expr.mean(0)
			exprStd/= expr.std(0)
			expr = exprStd

		_G1 = idx.shape[0]
		_G2 = expr.shape[1]

		beta = SP.zeros((_G1,_G2))
		pv = SP.zeros((_G1,_G2))

		# count is the position within the filtered idx, which is also how
		# var and var_info['conv'] are indexed.
		for count in range(_G1):
			if verbose:
				print('.. fitting gene %d'%count)
			if K is None:
				_K = None
			elif len(K)==1:
				_K = K[0]
			elif var_info['conv'][count]:
				# weighted sum of the random effects, scaled to unit mean diagonal
				_K = SP.sum([var[count,i]*K[i] for i in range(len(K))],0)
				_K/= _K.diagonal().mean()
			else:
				_K = None
			# extract a single gene
			lm = QTL.test_lmm(expr,Ystd[:,count:count+1],K=_K,**lmm_params)
			pv[count,:]   = lm.getPv()[0,:]
			beta[count,:] = lm.getBetaSNP()[0,:]

		if self.geneID is not None:
			geneID = SP.array(self.geneID)[idx]
		else:
			geneID = SP.zeros(_G1,dtype=str)
		if recalc and K is not None and len(K)>1:
			info = {'conv':var_info['conv'],'gene_idx_row':idx}
		else:
			info = {'gene_idx_row':idx}
		info['gene_row'] = geneID

		return pv, beta, info
Esempio n. 15
0
# Genotypes (missing values imputed) and the queried phenotypes, restricted to
# the samples present in both.
snps = data_subsample.getGenotypes(impute_missing=True)
phenotypes,sample_idx = data_subsample.getPhenotypes(phenotype_query=phenotype_query,intersection=True)
assert sample_idx.all()

sample_relatedness = data_subsample.getCovariance()
position = data_subsample.getPos()

#set parameters for the analysis
N, P = phenotypes.shape

covs = None                 #covariates
searchDelta = False         #specify if delta should be optimized for each SNP (not passed to test_lmm below)
test="lrt"                  #specify type of statistical test

# Running the analysis
# when cov are not set (None), LIMIX considers an intercept (covs=SP.ones((N,1)))
lmm = QTL.test_lmm(snps=snps,pheno=phenotypes.values,K=sample_relatedness,covs=covs,test=test)

# p-values; transposed below so that rows follow geno_ID and columns follow
# phenotypes.columns
pvalues = lmm.getPv()
#convert P-values to a DataFrame for nice output writing:
pvalues = pd.DataFrame(data=pvalues.T,index=data_subsample.geno_ID,columns=phenotypes.columns)
pvalues = pd.concat([position,pvalues],join="outer",axis=1)

# effect sizes, laid out like the p-values
betas = lmm.getBetaSNP()
#convert betas to a DataFrame for nice output writing:
betas = pd.DataFrame(data=betas.T,index=data_subsample.geno_ID,columns=phenotypes.columns)
# Join the positions with the betas themselves; concatenating `pvalues` here
# would store the p-value table under "betas".
betas = pd.concat([position,betas],join="outer",axis=1)

#create result DataFrame
result["pvalues"] = pvalues
result["betas"] = betas
    
Esempio n. 16
0
File: core.py Progetto: yumyai/scLVM
	def fitLMM(self,K=None,tech_noise=None,idx=None,i0=None,i1=None,verbose=False):
		"""
		Test every gene for association with each selected gene using an LMM.

		Args:
			K:				list of random effects to be considered in the analysis
							if K is none, it does not consider any random effect
			tech_noise:		not used by this method
			idx:			index or indices of the genes to be considered in the analysis
			i0:				gene index from which the analysis starts
			i1:				gene index at which the analysis stops
			verbose:		if True, print progress
		Returns:
			pv:				matrix of pvalues
			beta:			matrix of correlations
			info:			dictionary annotates pv and beta rows and columns, containing
							gene_idx_row:	index of the genes in rows
							conv:		boolean vector marking genes for which variance decomposition has converged
							gene_row:   annotate rows of matrices (empty strings if self.geneID is None)
		"""
		# None is tested with `is`: comparing an ndarray to None with ==/!= is
		# elementwise on current numpy and cannot be used as a condition.
		assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition decomposition must be used prior to this method'

		if idx is None:
			if i0 is None or i1 is None:
				i0 = 0; i1 = self.G
			idx = SP.arange(i0,i1)
		elif not isinstance(idx, SP.ndarray):
			# accept a bare index as well as a list of indices; SP.array([idx])
			# turned a list into a 2-D array
			idx = SP.array(idx).reshape(-1)

		if K is not None and not isinstance(K, list):
			K = [K]

		lmm_params = {'covs':SP.ones([self.N,1]),'NumIntervalsDeltaAlt':100,'NumIntervalsDelta0':100,'searchDelta':True}

		Ystd = self.Y-self.Y.mean(0)
		Ystd/= self.Y.std(0)

		beta = SP.zeros((idx.shape[0],self.G))
		pv = SP.zeros((idx.shape[0],self.G))
		var = self.var/self.var.sum(1)[:,SP.newaxis]
		# NOTE(review): var and var_info['conv'] are indexed by position in idx,
		# not by gene index -- presumably the variance decomposition was run on
		# the same idx; confirm against the caller.
		for count, ids in enumerate(idx):
			if verbose:
				print('.. fitting gene %d'%ids)
			if K is None:
				_K = None
			elif len(K)==1:
				_K = K[0]
			elif self.var_info['conv'][count]:
				# weighted sum of the random effects, scaled to unit mean diagonal
				_K = SP.sum([var[count,i]*K[i] for i in range(len(K))],0)
				_K/= _K.diagonal().mean()
			else:
				_K = None
			# extract a single gene
			lm = QTL.test_lmm(Ystd,Ystd[:,ids:ids+1],K=_K,verbose=False,**lmm_params)
			pv[count,:]   = lm.getPv()[0,:]
			beta[count,:] = lm.getBetaSNP()[0,:]

		if self.geneID is not None:
			# build the labels in one go so the string dtype fits the longest ID;
			# an array pre-allocated with dtype=str holds one-character strings
			# and truncated every ID on assignment
			geneID = SP.array([self.geneID[ids] for ids in idx],dtype=str)
		else:
			geneID = SP.zeros(idx.shape[0],dtype=str)

		info = {'conv':self.var_info['conv'],'gene_idx_row':idx,'gene_row':geneID}

		return pv, beta, info
Esempio n. 17
0
# Sample covariance (passed as K below) and SNP positions of the subsampled
# data set.
sample_relatedness = data_subsample.getCovariance()
position = data_subsample.getPos()

#set parameters for the analysis
N, P = phenotypes.shape

covs = None  #covariates
searchDelta = False  #specify if delta should be optimized for each SNP (not passed to test_lmm below)
test = "lrt"  #specify type of statistical test

# Running the analysis
# when cov are not set (None), LIMIX considers an intercept (covs=SP.ones((N,1)))
lmm = QTL.test_lmm(snps=snps,
                   pheno=phenotypes.values,
                   K=sample_relatedness,
                   covs=covs,
                   test=test)

pvalues = lmm.getPv()  # p-values; transposed below so rows follow geno_ID and columns follow phenotypes.columns
#convert P-values to a DataFrame for nice output writing:
pvalues = pd.DataFrame(data=pvalues.T,
                       index=data_subsample.geno_ID,
                       columns=phenotypes.columns)
# prepend the SNP position columns
pvalues = pd.concat([position, pvalues], join="outer", axis=1)

betas = lmm.getBetaSNP()  # effect sizes, laid out like the p-values
#convert betas to a DataFrame for nice output writing:
betas = pd.DataFrame(data=betas.T,
                     index=data_subsample.geno_ID,
                     columns=phenotypes.columns)
Esempio n. 18
0
			probe_group.create_dataset('start',data=SP.array([start]))
			probe_group.create_dataset('end',data=SP.array([end]))
			# one line per donor
			Xu = np.array(X, dtype='float')
			Yu = np.array(Y, dtype='float')
			#Yu -= Yu.mean(0); Yu /= Yu.std(0)
			if center:
				Xu -= Xu.mean(0); Xu /= Xu.std(0)
			uKcis    = SP.dot(Xu,Xu.T)
			uKtrans  = uKpop-uKcis
			uKcis   /= uKcis.diagonal().mean()
			uKtrans /= uKtrans.diagonal().mean()
			#4.3 perform experiment and store results in out_gene
			out_gene = {}
			#print "cis scan"
			lm=QTL.test_lmm(snps=Xu,pheno=Yu,K=uKtrans,covs=uCov,verbose=True)
			pv=lm.getPv()
			RV = {}
			RV['pv'] = pv
			RV['qv'] = FDR.qvalues(pv)[0]
			RV['lambd']   = getLambda(pv)
			RV['beta'] = lm.getBetaSNP()
			RV['posLead'] = SP.array([getRealPos(info['pos'][pv[0,:].argmin()],start,end,strand)])
			RV['aDirPosLead'] = abs(SP.array([info['pos'][pv[0,:].argmin()]-0.5*(start+end)]))
			out_group = probe_group.create_group('lmm')
			dumpDictHdf5(RV,out_group)
			#print 'ok'
		except:
			continue

	f.close()
Esempio n. 19
0
                            vcperm.addFixedEffect()
                            vcperm.addRandomEffect(K=Kallperm)
                            vcperm.addRandomEffect(is_noise=True)
                            vcperm.optimize()
                            permlm0 = vcnull.getLML() - vcperm.getLML()
                            perm_file.write(
                                "\t".join(map(str, [permlm0, permlm1])) + "\n")
                        ## get trans PCs
                        S_R, U_R = sp.linalg.eigh(Kc)
                        F1 = U_R[:, ::-1][:, :10]
                        # add an intercept term
                        F1 = sp.concatenate([F1, sp.ones((F1.shape[0], 1))], 1)
                        test = "lrt"  #specify type of statistical test
                        lmm0 = qtl.test_lmm(snps=Msnps,
                                            pheno=Y,
                                            K=Kallstd,
                                            covs=F1,
                                            test=test)
                        pvalues = lmm0.getPv(
                        )  # 1xS vector of p-values (S=X.shape[1])
                        betas = lmm0.getBetaSNP(
                        )  # 1xS vector of effect sizes (S=X.shape[1])
                        ses = lmm0.beta_ste  # 1xS vector of effect sizes standard errors (S=X.shape[1]
                        RV = Mpos
                        RV["pvaluesCisPCs"] = pvalues.T
                        RV["betasCisPCs"] = betas.T
                        RV["sesCisPCs"] = ses.T
                        RV["gene"] = gene

                        test = "lrt"  #specify type of statistical test
                        lmm2 = qtl.test_lmm(snps=Msnps,
def main():
    """Run LM and LMM association scans for a slice of phenotypes and plot them.

    Command-line arguments (``sys.argv[1:]``, all six required, in order):
        geno_file:  genotype file opened with ``gr.genotype_reader_tables``.
        pheno_file: phenotype file opened with ``phr.pheno_reader_tables``.
        cov_file:   tab-separated covariate table with a header row; the first
                    column is the index and rows are selected by
                    ``add_xvec(dataset.sample_ID)``.
        RNA_start:  first phenotype index of the slice (integer, inclusive).
        RNA_end:    end phenotype index of the slice (integer, exclusive).
        out_dir:    output directory; ``logs``, ``graphics`` and ``results``
                    sub-directories are created below it.

    For every phenotype in the slice the function writes the LM and LMM
    p-values to ``results/lm_pval`` and ``results/lmm_pval`` and saves
    phenotype histograms, Manhattan plots, lead-SNP scatter plots, p-value
    histograms, Q-Q plots and LM-vs-LMM scatter plots under ``graphics``.

    Raises:
        ValueError: if the number of command-line arguments is not six or the
            phenotype indices are not integers.
    """
    args = sys.argv[1:]
    if len(args) != 6:
        # Fail with an explicit usage message instead of an opaque unpacking error.
        raise ValueError(
            'Usage: %s geno_file pheno_file cov_file RNA_start RNA_end out_dir'
            ' (expected 6 arguments, got %d)' % (sys.argv[0], len(args)))
    geno_file, pheno_file, cov_file, RNA_start, RNA_end, out_dir = args

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    # RNA_start/RNA_end are still the raw argument strings at this point.
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, args)
    logger.info('Output directory: %s', out_dir)

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position, offset=100000)

    logger.info('Calculating sample relatedness')
    # Rescale the raw relatedness matrix so that its diagonal has mean 1.
    sample_relatedness_unnormalized = dataset.getCovariance(normalize=False)
    sample_relatedness = (sample_relatedness_unnormalized /
                          sample_relatedness_unnormalized.diagonal().mean())
    sample_relatedness_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'sample_relatedness'))
    # Draw on an explicit figure and close it so it does not stay open for
    # the rest of the run.
    fig = plt.figure()
    pl.imshow(sample_relatedness, aspect='auto')
    fig.savefig(os.path.join(sample_relatedness_dir, 'sample_relatedness_norm.png'))
    plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotype_vals, sample_idx = dataset.getPhenotypes(phenotype_ID)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals.values[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(os.path.join(phenohist_dir, '%s.png' % p_ID))
        plt.close(fig)

    logger.info('Start loading covariance from %s', cov_file)
    cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
    # Label-based row selection; .loc/.values replace the .ix/.as_matrix()
    # accessors that were removed from pandas.
    cov = cov_df.loc[add_xvec(dataset.sample_ID)].values
    logger.info('Finished')

    # Both scans use the same genotype subset, so build it once.
    snps_tested = snps[sample_idx].astype('int')

    logger.info('Start testing: LM')
    lm = qtl.test_lm(snps=snps_tested, pheno=phenotype_vals.values,
                     covs=cov, verbose=True)
    # convert P-values to a DataFrame (SNPs x phenotypes) for output writing
    pvalues_lm = pd.DataFrame(data=lm.pvalues.T, index=dataset.geno_ID,
                              columns=phenotype_ID)
    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps_tested, pheno=phenotype_vals.values,
                       K=sample_relatedness, covs=cov, verbose=True)
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lm_pval_dir = make_sure_path_exists(os.path.join(out_results_dir, 'lm_pval'))
    lmm_pval_dir = make_sure_path_exists(os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lm[p_ID].to_csv(os.path.join(lm_pval_dir, '%s.txt' % p_ID),
                                header=True, index=False)
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plots, LM on top and LMM below
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        fig = plt.figure(figsize=[12, 8])
        plt.subplot(2, 1, 1)
        plot_manhattan(posCum=position['pos_cum'], pv=pvalues_lm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LM' % p_ID)
        plt.subplot(2, 1, 2)
        plot_manhattan(posCum=position['pos_cum'], pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(os.path.join(manh_dir, '%s.png' % p_ID))
        plt.close(fig)

    # SNP vs. phenotype for the lead SNP of each phenotype
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        fig = plt.figure(figsize=[3, 3])

        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        # lead SNP = the SNP with the smallest LM p-value for this phenotype
        imax = lm.pvalues[ip].argmin()
        lead_genotype = snps[s_idx, imax]
        # horizontal jitter keeps samples with the same genotype from overlapping
        jitter = 0.05 * np.random.randn(lead_genotype.shape[0])
        plt.plot(lead_genotype + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(os.path.join(snp_pheno_dir, '%s.png' % p_ID))
        plt.close(fig)

    # P-value histograms; the red line marks the uniform density
    # NOTE(review): newer matplotlib releases replace `normed` with `density`.
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        fig = plt.figure(figsize=[7, 3])

        plt.subplot(1, 2, 1)
        plt.hist(pvalues_lm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")

        plt.subplot(1, 2, 2)
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(os.path.join(pval_hist_dir, '%s.png' % p_ID))
        plt.close(fig)

    # Quantile-Quantile plots
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        fig = plt.figure(figsize=[7, 3])

        plt.subplot(1, 2, 1)
        qqplot(pvalues_lm[p_ID].values)
        plt.title("%s, LM" % p_ID)
        plt.subplot(1, 2, 2)
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)

        fig.savefig(os.path.join(qqplot_dir, '%s.png' % p_ID))
        plt.close(fig)

    # -log10 P-value scatter plot, LM against LMM, with the identity line
    logger.info('Plotting LM vs LMM P-values')
    pval_lmvslmm_dir = make_sure_path_exists(os.path.join(out_graphics_dir, 'pval_lmvslmm'))
    for p_ID in phenotype_ID:
        fig = plt.figure(figsize=[3, 3])
        plt.plot(-sp.log10(pvalues_lm[p_ID]), -sp.log10(pvalues_lmm[p_ID]), '.')
        ymax = max(plt.xlim()[1], plt.ylim()[1])
        plt.plot([0, ymax], [0, ymax], 'k--')
        plt.xlabel('LM')
        plt.ylabel('LMM')
        plt.title(p_ID)
        fig.savefig(os.path.join(pval_lmvslmm_dir, '%s.png' % p_ID))
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
def main():
    """Run an LMM association scan for a slice of phenotypes and plot the results.

    Command-line arguments (``sys.argv[1:]``, all eight required, in order):
        geno_file:  genotype file opened with ``gr.genotype_reader_tables``.
        pheno_file: phenotype file opened with ``phr.pheno_reader_tables``.
        norm_mode:  phenotype normalization: ``None`` (no change), ``RIN``
                    (``preprocess.rankStandardizeNormal``) or ``boxcox``
                    (``preprocess.boxcox``); any other value is logged and
                    treated like ``None``.
        K_file:     tab-separated, header-less sample relatedness matrix whose
                    first column is the index, or the literal string ``None``
                    to run without a relatedness matrix.
        cov_file:   tab-separated covariate table with a header row and the
                    index in the first column, or the literal string ``None``.
        RNA_start:  first phenotype index of the slice (integer, inclusive).
        RNA_end:    end phenotype index of the slice (integer, exclusive).
        out_dir:    output directory; ``logs``, ``graphics`` and ``results``
                    sub-directories are created below it.

    For every phenotype in the slice the function writes the LMM p-values to
    ``results/lmm_pval`` and saves a phenotype histogram, Manhattan plot,
    lead-SNP scatter plot, p-value histogram and Q-Q plot under ``graphics``.
    The plain linear-model scan (``qtl.test_lm``) is not run by this function.

    Raises:
        ValueError: if the number of command-line arguments is not eight or
            the phenotype indices are not integers.
    """
    args = sys.argv[1:]
    if len(args) != 8:
        # Fail with an explicit usage message instead of an opaque unpacking error.
        raise ValueError(
            'Usage: %s geno_file pheno_file norm_mode K_file cov_file '
            'RNA_start RNA_end out_dir (expected 8 arguments, got %d)'
            % (sys.argv[0], len(args)))
    (geno_file, pheno_file, norm_mode, K_file, cov_file,
     RNA_start, RNA_end, out_dir) = args

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    # RNA_start/RNA_end are still the raw argument strings at this point.
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG,
        console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, args)
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Sample relatedness %s', K_file)
    logger.info('Loading sample relatedness from %s', K_file)
    if K_file == 'None':
        sample_relatedness = None
        # Nothing to draw: handing None to imshow would abort the whole run.
        logger.info('No sample relatedness matrix given; skipping its plot')
    else:
        logger.info('Start loading covariance from %s', K_file)
        K_df = pd.read_csv(K_file, sep='\t', header=None,
                           index_col=0)  # accessions x accessions
        # Use the (stringified) row labels for the columns as well so the
        # matrix can be sub-set by sample ID on both axes.
        K_df.index = ['%d' % i for i in K_df.index]
        K_df.columns = K_df.index
        # .loc/.values replace the .as_matrix() accessor removed from pandas.
        sample_relatedness = K_df.loc[dataset.sample_ID,
                                      dataset.sample_ID].values
        sample_relatedness_dir = make_sure_path_exists(
            os.path.join(out_graphics_dir, 'sample_relatedness'))
        # Draw on an explicit figure and close it so it does not stay open.
        fig = plt.figure()
        pl.imshow(sample_relatedness, aspect='auto')
        fig.savefig(os.path.join(sample_relatedness_dir,
                                 'sample_relatedness.png'))
        plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = getPhenotypes(
        dataset.pheno_reader, phenotype_IDs=phenotype_ID,
        sample_idx=dataset.sample_idx['pheno'])

    logger.info('Phenotype normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        # the second return value of boxcox is not needed here
        phenotype_vals, _ = preprocess.boxcox(phenotypes.values)
    else:
        logger.info('Normalization mode %s is not recognized.  Use None',
                    norm_mode)
        phenotype_vals = phenotypes.values

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(os.path.join(phenohist_dir, '%s.png' % p_ID))
        plt.close(fig)

    logger.info('Sample covariance %s', cov_file)
    if cov_file == 'None':
        cov = None
    else:
        logger.info('Start loading covariance from %s', cov_file)
        cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
        # Label-based row selection; .loc/.values replace the
        # .ix/.as_matrix() accessors that were removed from pandas.
        cov = cov_df.loc[add_xvec(dataset.sample_ID)].values

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=sample_relatedness,
                       covs=cov,
                       verbose=True)
    # convert P-values to a DataFrame (SNPs x phenotypes) for output writing
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T,
                               index=dataset.geno_ID,
                               columns=phenotype_ID)

    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    logger.info('Saving P-values to text file in %s', lmm_pval_dir)
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True,
                                 index=False)

    # Genome-wide Manhattan plots, one per phenotype
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    logger.info('Plotting Manhattan plots in %s', manh_dir)
    for p_ID in phenotype_ID:
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(os.path.join(manh_dir, '%s.png' % p_ID))
        plt.close(fig)

    # SNP vs. phenotype for the lead SNP of each phenotype
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    logger.info('Plotting phenotype vs. SNP to %s', snp_pheno_dir)
    for ip, p_ID in enumerate(phenotype_ID):
        fig = plt.figure(figsize=[3, 3])

        pheno_vals, s_idx = getPhenotypes(
            dataset.pheno_reader, phenotype_IDs=[p_ID],
            sample_idx=dataset.sample_idx['pheno'])
        # lead SNP = the SNP with the smallest LMM p-value for this phenotype
        imax = lmm.pvalues[ip].argmin()
        lead_genotype = snps[s_idx, imax]
        # horizontal jitter keeps samples with the same genotype from overlapping
        jitter = 0.05 * np.random.randn(lead_genotype.shape[0])
        plt.plot(lead_genotype + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(os.path.join(snp_pheno_dir, '%s.png' % p_ID))
        plt.close(fig)

    # P-value histograms; the red line marks the uniform density
    # NOTE(review): newer matplotlib releases replace `normed` with `density`.
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    logger.info('Plotting P-value histograms to %s', pval_hist_dir)
    for p_ID in phenotype_ID:
        fig = plt.figure(figsize=[7, 3])
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(os.path.join(pval_hist_dir, '%s.png' % p_ID))
        plt.close(fig)

    # Quantile-Quantile plots
    qqplot_dir = make_sure_path_exists(os.path.join(out_graphics_dir,
                                                    'qqplot'))
    logger.info('Plotting Q-Q plots to %s', qqplot_dir)
    for p_ID in phenotype_ID:
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(os.path.join(qqplot_dir, '%s.png' % p_ID))
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
Esempio n. 22
0
File: core.py Progetto: xuxaxy/scLVM
	def fitLMM(self,expr = None,K=None,tech_noise=None,idx=None,i0=None,i1=None,verbose=False, recalc=True, standardize=True):
		"""
		Test every selected gene against a set of expression measures with a
		linear mixed model (QTL.test_lmm), one model fit per gene.

		Args:
			expr:			measures to correlate the selected genes with (columns are
							tested one by one). If None, the standardized expression
							of the selected genes themselves is used.
			K:				random effect covariance, or list of them. If None, no
							random effect is considered. With more than one entry
							the per-gene covariance is the sum of the entries
							weighted by the gene's normalized variance components.
			tech_noise:		accepted for interface compatibility; not used here.
			idx:			indices of the genes to be considered in the analysis
			i0:				gene index from which the analysis starts (used only if
							idx is None)
			i1:				gene index at which the analysis stops (used only if
							idx is None)
			verbose:		if True, print progress
			recalc:			if True, re-do the variance decomposition when more than
							one random effect is given or none has been stored yet;
							if False, reuse self.var / self.var_info
			standardize:	if True, a user-supplied expr is centred and scaled
							to unit variance column-wise
		Returns:
			pv:				matrix of pvalues (selected genes x columns of expr)
			beta:			matrix of effect sizes with the same layout (described
							as correlations by the original documentation)
			info:			dictionary annotating the rows of pv and beta:
							gene_idx_row:	index of the genes in rows
							conv:			convergence flags of the variance decomposition
											(only when recalc is True and several
											random effects were given)
							gene_row:		gene IDs of the rows (empty strings when
											self.geneID is None)
		"""
		# NOTE: None checks use `is` throughout; `==None` on a numpy array is an
		# elementwise comparison on current NumPy and cannot be used in an `if`.
		if idx is None:
			if i0 is None or i1 is None:
				i0 = 0; i1 = self.G
			idx = SP.arange(i0,i1)
		elif not isinstance(idx,SP.ndarray):
			idx = SP.array(idx)
		# only makes sense if gene is expressed in at least one cell:
		# drop genes whose expression has zero standard deviation
		idx = SP.intersect1d(idx,SP.where(self.Y.std(0)>0)[0])

		if K is not None:
			if not isinstance(K,list):
				K = [K]
			if recalc==True and (len(K)>1 or self.var is None):
				# parenthesised single-argument print works on Python 2 and 3
				print('performing variance decomposition first...')
				var_raw,var_info = self.varianceDecomposition(K=K,idx=idx, cache=False)
				# normalize so that the components of each gene sum to one
				var = var_raw/var_raw.sum(1)[:,SP.newaxis]
			elif recalc==False and len(K)>1:
				assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition decomposition must be used prior to this method'
				warnings.warn('scLVM:: recalc should only be set to False by advanced users: scLVM then assumes that the random effects are the same as those for which the variance decompostion was performed earlier.')
				var_raw = self.var
				var_info = self.var_info
				var = var_raw/var_raw.sum(1)[:,SP.newaxis]

		lmm_params = {'covs':SP.ones([self.N,1]),'NumIntervalsDeltaAlt':100,'NumIntervalsDelta0':100,'searchDelta':True}

		# standardize the selected genes (zero mean, unit variance per gene)
		Yidx = self.Y[:,idx]
		Ystd = Yidx-Yidx.mean(0)
		Ystd/= Yidx.std(0) #delta optimization might be more efficient

		if expr is None:
			expr = Ystd
		elif standardize==True:
			exprStd = expr-expr.mean(0)
			exprStd/= expr.std(0)
			expr = exprStd

		_G1 = idx.shape[0]	# number of genes tested (rows of the result)
		_G2 = expr.shape[1]	# number of measures tested against (columns)

		geneID = SP.zeros(_G1,dtype=str)
		beta = SP.zeros((_G1,_G2))
		pv = SP.zeros((_G1,_G2))

		for ids in range(_G1):
			if verbose:
				print('.. fitting gene %d'%ids)
			# pick the random effect covariance for this gene
			if K is None:
				_K = None
			elif len(K)>1:
				if var_info['conv'][ids]==True:
					# combine the random effects weighted by this gene's
					# variance components and rescale to a unit mean diagonal
					_K = SP.sum([var[ids,i]*Ki for i,Ki in enumerate(K)],0)
					_K/= _K.diagonal().mean()
				else:
					# decomposition did not converge: fit without random effect
					_K = None
			else:
				_K = K[0]
			# test all columns of expr against this single gene
			lm = QTL.test_lmm(expr,Ystd[:,ids:ids+1],K=_K,verbose=False,**lmm_params)
			pv[ids,:] = lm.getPv()[0,:]
			beta[ids,:] = lm.getBetaSNP()[0,:]

		if self.geneID is not None:
			geneID = SP.array(self.geneID)[idx]
		if recalc==True and K is not None and len(K)>1:
			info = {'conv':var_info['conv'],'gene_idx_row':idx}
		else:
			info = {'gene_idx_row':idx}
		# geneID is always an array at this point, so the entry is always set
		info['gene_row'] = geneID

		return pv, beta, info