Example #1
0
def main():
    """Plot PC variance explained and train PANAMA at ranks 10, 15 and 20.

    Reads the hard-coded genotype and phenotype HDF5 files, builds a
    ``data.QTLData`` set, and:

    1. plots the leading (at most 50) values returned by
       ``panama.PC_varExplained`` as a bar chart, to help choose the rank
       (presumably cumulative variance explained per PC, judging by the
       variable name -- confirm against the panama module);
    2. for each rank, trains a fresh ``panama.PANAMA`` model on the
       phenotypes with the genotype covariance as ``Kpop`` and passes it to
       ``draw_and_save_panama``.

    Takes no arguments and returns nothing; all paths are fixed below.
    """
    geno_file = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/1001genomes_snp-short-indel_only_ACGTN_1001tx_filter1.hdf5'
    pheno_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-01-03/ath1001_tx_norm_2016-01-03-filtered01_1001g_vst_cv0p05T.hdf5'
    out_dir = '/gale/netapp/home/shhuang/projects/1001_genomes/calc_k_panama_2016-02-03'
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    # NOTE: both prefixes end in '-' and the per-rank suffix starts with '_',
    # so outputs are named '...-_K10...'. Downstream scripts may rely on this
    # exact naming, so it is kept as is.
    graphics_prefix = os.path.join(out_graphics_dir,
                                   'calc_k_panama_2016-02-03-')
    results_prefix = os.path.join(out_dir, 'calc_k_panama_2016-02-03-')
    logger = LoggerFactory.get_logger(
        os.path.join(out_dir, 'calc_k_panama_2016-02-03.log'),
        file_level=logging.DEBUG,
        console_level=logging.DEBUG)

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    # sample_idx is returned alongside the phenotypes but is not needed here
    phenotypes, _ = dataset.getPhenotypes(intersection=True)
    sample_relatedness = dataset.getCovariance()

    # determine the number of ranks to consider in the PANAMA matrix
    # by looking at the variance explained by PCs
    cum_var = panama.PC_varExplained(phenotypes.values)
    # Derive the bar positions from the slice actually plotted so the x and
    # height arrays always agree, even when fewer than 50 PCs are available.
    top_var = cum_var[:50]
    out_file = graphics_prefix + 'cum_var.png'
    fig = plt.figure(figsize=[5, 4])
    subplt = pl.subplot(1, 1, 1)
    pl.bar(sp.arange(len(top_var)) + 0.5, top_var, width=1)
    pl.xlim(0, 50)
    ticks = sp.linspace(0, 50, 11)
    ticks[0] = 1  # label the first tick as PC 1 rather than 0
    subplt.set_xticks(ticks)
    fig.savefig(out_file)
    plt.close(fig)

    for r in [10, 15, 20]:
        # A new model per rank, so no state is carried over between ranks.
        p = panama.PANAMA(Y=phenotypes.values, Kpop=sample_relatedness)
        logger.info('Training r=%d', r)
        p.train(rank=r)
        draw_and_save_panama(p, graphics_prefix + '_K%d' % r,
                             results_prefix + '_K%d' % r)
def main():
    """Write cumulative SNP positions and chromosome bounds to text files.

    Command line (positional): ``geno_file pheno_file out_dir``.

    Builds a ``data.QTLData`` set from the two HDF5 inputs, converts the SNP
    positions with ``data_util.estCumPos`` (offset 100000) and writes, inside
    ``out_dir``:

    * ``position.txt``    -- tab-separated, with header, no index column;
    * ``chromBounds.txt`` -- comma-delimited, via ``np.savetxt``.

    A log file ``get_geno_pos.log`` is also created in ``out_dir``.
    """
    cli_args = sys.argv[1:]
    geno_path, pheno_path, dest_dir = cli_args

    log_path = os.path.join(dest_dir, 'get_geno_pos.log')
    logger = LoggerFactory.get_logger(log_path)
    LoggerFactory.log_command(logger, sys.argv[1:])

    logger.info('Loading genotype from %s', geno_path)
    genotype_source = gr.genotype_reader_tables(geno_path)

    logger.info('Loading phenotype from %s', pheno_path)
    phenotype_source = phr.pheno_reader_tables(pheno_path)
    # Sample IDs are passed through strip_xvec before the two readers are
    # combined into one dataset.
    phenotype_source.sample_ID = strip_xvec(phenotype_source.sample_ID)

    logger.info('Creating QTL dataset')
    qtl_set = data.QTLData(pheno_reader=phenotype_source,
                           geno_reader=genotype_source)

    raw_positions = qtl_set.getPos()
    cum_positions, chrom_bounds = data_util.estCumPos(offset=100000,
                                                      position=raw_positions)

    logger.info('Writing output to directory %s', dest_dir)
    # Cast both results before touching the disk, then write them out.
    positions_as_int = cum_positions.astype(int)
    bounds_as_int = chrom_bounds.astype(int)

    position_path = os.path.join(dest_dir, 'position.txt')
    positions_as_int.to_csv(position_path, sep='\t', header=True, index=False)

    bounds_path = os.path.join(dest_dir, 'chromBounds.txt')
    np.savetxt(bounds_path, bounds_as_int, delimiter=",")
def main():
    """Run a single-trait LMM scan on a slice of phenotypes and plot results.

    Command line (all positional)::

        geno_file pheno_file norm_mode panama_file RNA_start RNA_end out_dir

    * ``norm_mode`` -- ``None`` (raw values), ``RIN``
      (``preprocess.rankStandardizeNormal``) or ``boxcox``
      (``preprocess.boxcox``).
    * ``panama_file`` -- HDF5 file whose ``Ktot`` dataset is used as the
      sample covariance ``K`` of the LMM.
    * ``RNA_start`` / ``RNA_end`` -- integer bounds of the slice taken from
      ``dataset.phenotype_ID``.

    Outputs written under ``out_dir``:

    * ``logs/<RNA_start>-<RNA_end>.log``
    * ``results/lmm_pval/<phenotype>.txt`` -- one p-value column per file
    * ``graphics/{phenohist,manhattan,snp_pheno,pval_hist,qqplot}/<phenotype>.png``

    Only the LMM is run in this script; no plain-LM comparison is produced.

    Exits with status 1 when ``norm_mode`` is not one of the supported values.
    """
    (geno_file, pheno_file, norm_mode, panama_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)
    logger.info('Loading sample relatedness from %s', panama_file)
    # Read the matrix fully into memory and close the file straight away.
    with h5py.File(panama_file, 'r') as panama_f:
        Ktot = panama_f['Ktot'][:]

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)

    # getting genotypes
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    # This slice is reused by every per-phenotype loop below and is also the
    # column index of the p-value table, so the two always agree.
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = dataset.getPhenotypes(phenotype_ID)
    logger.info('Normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        # the second return value (named maxlog upstream) is not used here
        phenotype_vals, _ = preprocess.boxcox(phenotypes.values)
    else:
        # Previously this only logged and then crashed later with a NameError
        # on phenotype_vals; stop here with a clear message instead.
        logger.error('Normalization mode %s is not recognized.  Exit',
                     norm_mode)
        sys.exit(1)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=Ktot, covs=None, verbose=True)
    # convert P-values to a DataFrame (SNPs x phenotypes) for output writing
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plot, one per phenotype
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])

        # pick the SNP with the smallest LMM p-value for this phenotype
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # small horizontal jitter so points sharing a genotype do not overlap
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): `normed` was deprecated in Matplotlib 2.1 and removed
        # in 3.1; switch to `density=True` when running on a newer Matplotlib.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        # reference line at density 1 (flat histogram)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
Example #4
0
#--------------------#
#### Prepare data ####
#--------------------#

# Genotype reader (SNP HDF5 file)
geno_reader = gr.genotype_reader_tables('/home/hugot/projects/20150501_accessions/genotypes/snp250k/pygwas_genotypes_limix.hdf5')

# Phenotype reader (phenotype HDF5 file)
pheno_reader = phr.pheno_reader_tables('/home/hugot/projects/20150501_accessions/phenotypes/limix/accession_phenotypes_silique_early.hdf5')

# Bundle both readers into the QTLData object that the queries below go through
dataset = data.QTLData(pheno_reader=pheno_reader, geno_reader=geno_reader)

# Genotype matrix
snps = dataset.getGenotypes()

# Phenotype table: first element of what getPhenotypes returns
phenotypes = dataset.getPhenotypes(intersection=True)[0]

# SNP positions, converted by estCumPos with no offset
pos, chromBounds = data_util.estCumPos(offset=0, position=dataset.getPos())

# Keep just the two `totalbr_mean_*` columns for the multi-trait LMM
phenotypes_tss = phenotypes.loc[:, ['totalbr_mean_ln', 'totalbr_mean_hn']]

# Relatedness matrix from the genotypes, centered and normalized
sample_relatedness = dataset.getCovariance(center=True, normalize=True)
def main():
    """Run a single-trait LMM scan with optional K and covariates, then plot.

    Command line (all positional)::

        geno_file pheno_file norm_mode K_file cov_file RNA_start RNA_end out_dir

    * ``norm_mode`` -- ``None``, ``RIN`` or ``boxcox``; any other value is
      logged and treated like ``None`` (raw values).
    * ``K_file`` -- tab-separated square matrix without header whose first
      column holds the sample IDs, or the literal string ``None`` to run
      without a relatedness matrix (``K=None`` is passed to the test).
    * ``cov_file`` -- tab-separated table with a header row and the sample IDs
      in the first column, or the literal string ``None`` for no covariates.
    * ``RNA_start`` / ``RNA_end`` -- integer bounds of the slice taken from
      ``dataset.phenotype_ID``.

    Outputs written under ``out_dir``:

    * ``logs/<RNA_start>-<RNA_end>.log``
    * ``results/lmm_pval/<phenotype>.txt``
    * ``graphics/sample_relatedness/sample_relatedness.png`` (only when a
      ``K_file`` is given)
    * ``graphics/{phenohist,manhattan,snp_pheno,pval_hist,qqplot}/<phenotype>.png``
    """
    (geno_file, pheno_file, norm_mode, K_file, cov_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    # The log name uses the bounds exactly as typed on the command line.
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG,
        console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    # getting genotypes
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Sample relatedness %s', K_file)
    if K_file == 'None':
        sample_relatedness = None
    else:
        logger.info('Loading sample relatedness from %s', K_file)
        K_df = pd.read_csv(K_file, sep='\t', header=None,
                           index_col=0)  # accessions x accessions
        # IDs are parsed as integers; turn them into strings and reuse them
        # as column labels so rows and columns can both be selected by ID.
        K_df.index = ['%d' % i for i in K_df.index]
        K_df.columns = K_df.index
        # `.values` replaces DataFrame.as_matrix(), which newer pandas dropped.
        sample_relatedness = K_df.loc[dataset.sample_ID,
                                      dataset.sample_ID].values

        # Draw the heatmap only when a matrix was loaded (imshow cannot take
        # None), on its own figure that is closed once saved.
        sample_relatedness_dir = make_sure_path_exists(
            os.path.join(out_graphics_dir, 'sample_relatedness'))
        fig = plt.figure()
        plt.imshow(sample_relatedness, aspect='auto')
        fig.savefig(
            os.path.join(sample_relatedness_dir, 'sample_relatedness.png'))
        plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    # This slice is reused by every per-phenotype loop below and is also the
    # column index of the p-value table, so the two always agree.
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = getPhenotypes(
        dataset.pheno_reader,
        phenotype_IDs=phenotype_ID,
        sample_idx=dataset.sample_idx['pheno'])

    logger.info('Phenotype normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        # the second return value (named maxlog upstream) is not used here
        phenotype_vals, _ = preprocess.boxcox(phenotypes.values)
    else:
        logger.info('Normalization mode %s is not recognized.  Use None',
                    norm_mode)
        phenotype_vals = phenotypes.values

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Sample covariance %s', cov_file)
    if cov_file == 'None':
        cov = None
    else:
        logger.info('Start loading covariance from %s', cov_file)
        cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
        # Select rows by label; `.loc`/`.values` replace `.ix`/`.as_matrix()`,
        # which newer pandas dropped.
        cov = cov_df.loc[add_xvec(dataset.sample_ID)].values

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=sample_relatedness,
                       covs=cov,
                       verbose=True)
    # convert P-values to a DataFrame (SNPs x phenotypes) for output writing
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T,
                               index=dataset.geno_ID,
                               columns=phenotype_ID)

    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    logger.info('Saving P-values to text file in %s', lmm_pval_dir)
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True,
                                 index=False)

    # Genome-wide Manhattan plot, one per phenotype
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    logger.info('Plotting Manhattan plots in %s', manh_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    logger.info('Plotting phenotype vs. SNP to %s', snp_pheno_dir)
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])

        # pick the SNP with the smallest LMM p-value for this phenotype
        pheno_vals, s_idx = getPhenotypes(
            dataset.pheno_reader,
            phenotype_IDs=[p_ID],
            sample_idx=dataset.sample_idx['pheno'])
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # small horizontal jitter so points sharing a genotype do not overlap
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    logger.info('Plotting P-value histograms to %s', pval_hist_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): `normed` was deprecated in Matplotlib 2.1 and removed
        # in 3.1; switch to `density=True` when running on a newer Matplotlib.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        # reference line at density 1 (flat histogram)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    logger.info('Plotting Q-Q plots to %s', qqplot_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
def main():
    """Run LM and LMM scans on a slice of phenotypes and plot both side by side.

    Command line (all positional)::

        geno_file pheno_file cov_file RNA_start RNA_end out_dir

    * ``cov_file`` -- tab-separated covariate table with a header row and the
      sample IDs in the first column.
    * ``RNA_start`` / ``RNA_end`` -- integer bounds of the slice taken from
      ``dataset.phenotype_ID``.

    The sample relatedness matrix is computed from the genotypes
    (``dataset.getCovariance(normalize=False)``) and divided by the mean of
    its diagonal before being used as ``K`` in the LMM.

    Outputs written under ``out_dir``:

    * ``logs/<RNA_start>-<RNA_end>.log``
    * ``results/lm_pval/<phenotype>.txt`` and ``results/lmm_pval/<phenotype>.txt``
    * ``graphics/sample_relatedness/sample_relatedness_norm.png``
    * ``graphics/{phenohist,manhattan,snp_pheno,pval_hist,qqplot,pval_lmvslmm}/<phenotype>.png``
    """
    geno_file, pheno_file, cov_file, RNA_start, RNA_end, out_dir = sys.argv[1:]
    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    # The log name uses the bounds exactly as typed on the command line.
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    # getting genotypes
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Calculating sample relatedness')
    # Scale the raw relatedness matrix so that its diagonal averages 1.
    sample_relatedness_unnormalized = dataset.getCovariance(normalize=False)
    sample_relatedness = (sample_relatedness_unnormalized /
                          sample_relatedness_unnormalized.diagonal().mean())
    sample_relatedness_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'sample_relatedness'))
    # Explicit figure so it can be closed once saved.
    fig = plt.figure()
    plt.imshow(sample_relatedness, aspect='auto')
    fig.savefig(
        os.path.join(sample_relatedness_dir, 'sample_relatedness_norm.png'))
    plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    # This slice is reused by every per-phenotype loop below and is also the
    # column index of both p-value tables, so they always agree.
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotype_vals, sample_idx = dataset.getPhenotypes(phenotype_ID)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals.values[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start loading covariance from %s', cov_file)
    cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
    # Select rows by label; `.loc`/`.values` replace `.ix`/`.as_matrix()`,
    # which newer pandas dropped.
    cov = cov_df.loc[add_xvec(dataset.sample_ID)].values
    logger.info('Finished')

    logger.info('Start testing: LM')
    lm = qtl.test_lm(snps=snps[sample_idx].astype('int'),
                     pheno=phenotype_vals.values,
                     covs=cov, verbose=True)
    # convert P-values to a DataFrame (SNPs x phenotypes) for output writing
    pvalues_lm = pd.DataFrame(data=lm.pvalues.T, index=dataset.geno_ID,
                              columns=phenotype_ID)
    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals.values,
                       K=sample_relatedness, covs=cov, verbose=True)
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lm_pval'))
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lm[p_ID].to_csv(os.path.join(lm_pval_dir, '%s.txt' % p_ID),
                                header=True, index=False)
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plots: LM on top, LMM below
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plt.subplot(2, 1, 1)
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LM' % p_ID)
        plt.subplot(2, 1, 2)
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])

        # pick the SNP with the smallest LM p-value for this phenotype
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        imax = lm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # small horizontal jitter so points sharing a genotype do not overlap
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms: LM left, LMM right
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])

        # NOTE(review): `normed` was deprecated in Matplotlib 2.1 and removed
        # in 3.1; switch to `density=True` when running on a newer Matplotlib.
        plt.subplot(1, 2, 1)
        plt.hist(pvalues_lm[p_ID].values, 20, normed=True)
        # reference line at density 1 (flat histogram)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")

        plt.subplot(1, 2, 2)
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots: LM left, LMM right
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])

        plt.subplot(1, 2, 1)
        qqplot(pvalues_lm[p_ID].values)
        plt.title("%s, LM" % p_ID)
        plt.subplot(1, 2, 2)
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)

        fig.savefig(out_file)
        plt.close(fig)

    # P value scatter plot: -log10(p) of LM against LMM
    logger.info('Plotting LM vs LMM P-values')
    pval_lmvslmm_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_lmvslmm'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_lmvslmm_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        # np.log10 instead of the deprecated top-level scipy alias sp.log10
        plt.plot(-np.log10(pvalues_lm[p_ID]), -np.log10(pvalues_lmm[p_ID]),
                 '.')
        # dashed identity line spanning the larger of the two axis limits
        ymax = max(plt.xlim()[1], plt.ylim()[1])
        plt.plot([0, ymax], [0, ymax], 'k--')
        plt.xlabel('LM')
        plt.ylabel('LMM')
        plt.title(p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
Example #7
0
def main():
    """Draw per-SNP scatter and box plots of seed size and AT4G26530 expression.

    Loads the genotype, seed-size phenotype and expression HDF5 files from the
    hard-coded paths below. For every genotype index in the range returned by
    ``geno_reader.getGenoIndex`` for the queried chromosome-4 window, two PNG
    files are written under ``<out_dir>/graphics``:

    * ``...point_chr<chrom>_<pos>.png``: jittered genotype value against seed
      size (left panel) and against AT4G26530 expression (right panel, axis
      label "FBA5 expression").
    * ``...bxp_chr<chrom>_<pos>.png``: box plots of the same two quantities for
      samples whose genotype value is 0 versus 2.

    Takes no arguments and returns nothing; its effects are the log output and
    the image files.
    """
    geno_file = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/1001genomes_snp-short-indel_only_ACGTN_1kgen_filter3.hdf5'
    pheno_file = '/gale/netapp/home/shhuang/data/1001_genomes/seed_size/accx_size.hdf5'
    expr_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-02-06/ath1001_tx_norm_2016-02-06-UQ_gNorm_normCounts_k4_vst2T.hdf5'
    out_dir = '/gale/netapp/home/shhuang/projects/1001_genomes/draw_seedsize_plots_2016-11-30'
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    graphics_prefix = os.path.join(out_graphics_dir,
                                   'draw_seedsize_plots_2016-11-30-')
    results_prefix = os.path.join(out_dir, 'draw_seedsize_plots_2016-11-30-')
    logger = LoggerFactory.get_logger(os.path.join(
        out_dir, 'draw_seedsize_plots_2016-11-30.log'),
                                      file_level=logging.DEBUG,
                                      console_level=logging.DEBUG)

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)
    logger.info('Loading expression from %s', expr_file)
    expr_reader = phr.pheno_reader_tables(expr_file)
    expr_reader.sample_ID = strip_xvec(expr_reader.sample_ID)

    # The data objects pair the shared genotype reader with each phenotype
    # source; they are used below only for their genotype/phenotype sample
    # index arrays.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    exprset = data.QTLData(geno_reader=geno_reader, pheno_reader=expr_reader)

    # Request every sample from both readers (all-True boolean masks).
    pheno_sample_select = np.ones(pheno_reader.sample_ID.shape[0], dtype=bool)
    phenotypes, pheno_sample_idx = pheno_reader.getPhenotypes(
        sample_idx=pheno_sample_select)
    expr_sample_select = np.ones(expr_reader.sample_ID.shape[0], dtype=bool)
    expr, expr_sample_idx = expr_reader.getPhenotypes(
        sample_idx=expr_sample_select)

    snps = geno_reader.getGenotypes()
    position = geno_reader.getPos()
    position, chromBounds = data_util.estCumPos(position=position, offset=0)
    gid_start, gid_end = geno_reader.getGenoIndex(chrom=4,
                                                  pos_start=(4, 13393142),
                                                  pos_end=(4, 13393144))
    # NOTE(review): the "+ 1" treats gid_end as inclusive -- confirm against
    # getGenoIndex.
    gid_range = np.arange(gid_start, gid_end + 1)

    # None of the following depends on the SNP being plotted, so compute it
    # once instead of on every loop iteration.
    gs_idx = dataset.sample_idx["geno"].values
    ps_idx = dataset.sample_idx["pheno"].values
    egs_idx = exprset.sample_idx["geno"].values
    eps_idx = exprset.sample_idx["pheno"].values
    phenotypes_sub = phenotypes.values[ps_idx]
    fba5_sub = expr['AT4G26530'].values[eps_idx]

    for snp_index in gid_range:
        # np.ix_ expects sequences, so keep the column index as a length-1
        # array and drop the resulting singleton axis with [:, 0].
        g_ID = np.array([snp_index])
        logger.debug('Genotype index: %s', g_ID)

        snps_sub = snps[np.ix_(gs_idx, g_ID)][:, 0]
        esnps_sub = snps[np.ix_(egs_idx, g_ID)][:, 0]
        position_sub = position.iloc[[g_ID[0]]]
        logger.debug('Position:\n%s', position_sub)

        # position_sub is a one-row frame; pull the scalars out explicitly
        # rather than relying on the deprecated implicit int(Series)
        # conversion performed by '%d' formatting.
        chrom = int(position_sub['chrom'].iloc[0])
        pos = int(position_sub['pos'].iloc[0])

        # Scatter plots; Gaussian jitter on the x axis spreads out points that
        # share the same genotype value.
        point_file = graphics_prefix + 'point_chr%d_%d.png' % (chrom, pos)
        fig = plt.figure(figsize=[5, 2.5])
        plt.subplot(1, 2, 1)
        plt.plot(snps_sub + 0.05 * np.random.randn(snps_sub.shape[0]),
                 phenotypes_sub, '.')
        plt.xlabel("SNP")
        plt.ylabel("Seed size")
        plt.subplot(1, 2, 2)
        plt.plot(esnps_sub + 0.05 * np.random.randn(esnps_sub.shape[0]),
                 fba5_sub, '.')
        plt.xlabel("SNP")
        plt.ylabel("FBA5 expression")
        plt.tight_layout()
        fig.savefig(point_file)
        plt.close(fig)

        # Box plots comparing samples with genotype value 0 against those with
        # value 2; samples with any other value are left out.
        bxp_file = graphics_prefix + 'bxp_chr%d_%d.png' % (chrom, pos)
        fig = plt.figure(figsize=[5, 2.5])
        plt.subplot(1, 2, 1)
        phenotypes_box = [
            phenotypes_sub[snps_sub == 0], phenotypes_sub[snps_sub == 2]
        ]
        plt.boxplot(phenotypes_box)
        plt.xlabel("SNP")
        plt.ylabel("Seed size")
        plt.subplot(1, 2, 2)
        fba5_box = [fba5_sub[esnps_sub == 0], fba5_sub[esnps_sub == 2]]
        plt.boxplot(fba5_box)
        plt.xlabel("SNP")
        plt.ylabel("FBA5 expression")
        plt.tight_layout()
        fig.savefig(bxp_file)
        plt.close(fig)