Exemple #1
0
def main():
    """Compute the sample covariance of a genotype file and save it.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_in:      genotype file passed to ``gr.genotype_reader_tables``.
        norm_cov:     the string ``'1'`` requests a normalized covariance;
                      any other value leaves it unnormalized.
        cov_out_hdf5: HDF5 output path (with ``.log`` appended it is also
                      the log file name).
        cov_out_csv:  text output path.
    """
    geno_in, norm_cov, cov_out_hdf5, cov_out_csv = sys.argv[1:]

    logger = LoggerFactory.get_logger(cov_out_hdf5 + '.log')
    LoggerFactory.log_command(logger, sys.argv[1:])

    ## Import genotype data
    logger.info('Loading genotype from %s', geno_in)
    geno_reader = gr.genotype_reader_tables(geno_in)

    norm = (norm_cov == '1')
    logger.info('Normalizing' if norm else 'NOT normalizing')

    sample_relatedness = geno_reader.getCovariance(normalize=norm)

    logger.info('Saving covariance to HDF5 file %s', cov_out_hdf5)
    out_dict = {'Cov': sample_relatedness}
    o = h5py.File(cov_out_hdf5, 'w')
    try:
        util_functions.smartDumpDictHdf5(out_dict, o)
    finally:
        # Always release the HDF5 handle, even if the dump fails.
        o.close()

    logger.info('Saving covariance to CSV file %s', cov_out_csv)
    save_cov_in_text_format(cov_out_csv, sample_relatedness,
                            geno_reader.sample_ID)

    logger.info('Done!')
Exemple #2
0
def SingleTraitLM(inF1, inF2, ouF):
    """Scan every phenotype with a single-trait linear model.

    For each phenotype the values are rank-standardized and tested against
    the genotypes with ``qtl.test_lm``.  Every variant whose p-value is at
    most ``SIG`` is written to ``ouF`` as a tab-separated line
    ``chrom, pos, p-value, phenotype ID``.  Afterwards a Manhattan plot and
    a Q-Q plot (saved as ``pvalues-qqplot.pdf``) are drawn for the
    phenotype named ``'NMD'``.

    Args:
        inF1: genotype file passed to ``gr.genotype_reader_tables``.
        inF2: phenotype file passed to ``phr.pheno_reader_tables``; it is
            also forwarded unchanged as third argument of ``manhattonPlot``.
        ouF: path of the text file receiving the significant hits.

    Raises:
        KeyError: if the dataset has no phenotype named ``'NMD'``.
    """
    geno_reader = gr.genotype_reader_tables(inF1)
    pheno_reader = phr.pheno_reader_tables(inF2)
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    geno = dataset.getGenotypes()
    position = dataset.getPos()
    pos, chromBounds = data_util.estCumPos(position=position, offset=0)

    P_max = len(dataset.phenotype_ID)
    phenotype_ID = dataset.phenotype_ID[0:P_max]

    # P-values of the 'NMD' phenotype, needed for the plots below.  The
    # plots used to read whatever DataFrame the loop produced last, which
    # only worked when 'NMD' happened to be the final phenotype.
    nmd_pvalues = None

    # The context manager guarantees the output file is closed (and
    # flushed) even if a test fails half-way through.
    with open(ouF, 'w') as ouFile:
        for p_ID in phenotype_ID:
            phenotype_vals, sample_idx = dataset.getPhenotypes([p_ID])
            phenotype_vals_ranks = preprocess.rankStandardizeNormal(
                phenotype_vals.values)
            lm_ranks = qtl.test_lm(snps=geno[sample_idx],
                                   pheno=phenotype_vals_ranks)
            pvt = lm_ranks.pvalues.T
            if p_ID == 'NMD':
                nmd_pvalues = pd.DataFrame(data=pvt,
                                           index=dataset.geno_ID,
                                           columns=[p_ID])
            # NOTE(review): assumes pvt is a 2-D ndarray with one column
            # per tested phenotype -- confirm against qtl.test_lm.
            for i, p in enumerate(pvt[:, 0]):
                if p <= SIG:
                    ouFile.write('\t'.join([position['chrom'][i],
                                            str(position['pos'][i]),
                                            str(p),
                                            p_ID]) + '\n')

    if nmd_pvalues is None:
        raise KeyError("phenotype 'NMD' not found in %s" % inF2)

    manhattonPlot(['NMD'], nmd_pvalues, inF2, pos, chromBounds)

    pl.figure(figsize=[12, 4])
    qqplot(nmd_pvalues['NMD'].values)
    pl.savefig('pvalues-qqplot.pdf')
Exemple #3
0
def main():
    """Train PANAMA models of several ranks on a fixed genotype/phenotype pair.

    Input and output locations are hard-coded.  The function plots the
    result of ``panama.PC_varExplained`` for the phenotype matrix (first 50
    entries, saved as ``...cum_var.png``) and then trains a PANAMA model
    for each rank in 10, 15 and 20, handing every trained model to
    ``draw_and_save_panama``.
    """
    geno_file = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/1001genomes_snp-short-indel_only_ACGTN_1001tx_filter1.hdf5'
    pheno_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-01-03/ath1001_tx_norm_2016-01-03-filtered01_1001g_vst_cv0p05T.hdf5'
    out_dir = '/gale/netapp/home/shhuang/projects/1001_genomes/calc_k_panama_2016-02-03'

    # Output locations (the graphics directory is created first).
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    graphics_prefix = os.path.join(out_graphics_dir,
                                   'calc_k_panama_2016-02-03-')
    results_prefix = os.path.join(out_dir, 'calc_k_panama_2016-02-03-')

    log_path = os.path.join(out_dir, 'calc_k_panama_2016-02-03.log')
    logger = LoggerFactory.get_logger(log_path,
                                      file_level=logging.DEBUG,
                                      console_level=logging.DEBUG)

    # Readers for both data sources; phenotype sample IDs go through
    # strip_xvec before the two are combined.
    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    phenotypes, _unused_sample_idx = dataset.getPhenotypes(intersection=True)
    sample_relatedness = dataset.getCovariance()

    # Bar plot of the variance explained by the leading PCs, used to pick
    # the PANAMA rank by eye.
    n_shown = 50
    cum_var = panama.PC_varExplained(phenotypes.values)
    out_file = graphics_prefix + 'cum_var.png'
    fig = plt.figure(figsize=[5, 4])
    axes = pl.subplot(1, 1, 1)
    pl.bar(sp.arange(n_shown) + 0.5, cum_var[:n_shown], width=1)
    pl.xlim(0, n_shown)
    tick_positions = sp.linspace(0, n_shown, 11)
    tick_positions[0] = 1
    axes.set_xticks(tick_positions)
    fig.savefig(out_file)
    plt.close(fig)

    # One freshly constructed model per candidate rank.
    for rank in (10, 15, 20):
        model = panama.PANAMA(Y=phenotypes.values, Kpop=sample_relatedness)
        logger.info('Training r=%d', rank)
        model.train(rank=rank)
        suffix = '_K%d' % rank
        draw_and_save_panama(model, graphics_prefix + suffix,
                             results_prefix + suffix)
Exemple #4
0
def SingleTraitLM_ManhattonPlot(inF1, inF2, SigPhe):
    """Draw a Manhattan plot of linear-model p-values for one phenotype.

    Args:
        inF1: genotype file passed to ``gr.genotype_reader_tables``.
        inF2: phenotype file passed to ``phr.pheno_reader_tables``.
        SigPhe: ID of the phenotype to test; phenotypes with any other ID
            are skipped.
    """
    dataset = data.QTLData(geno_reader=gr.genotype_reader_tables(inF1),
                           pheno_reader=phr.pheno_reader_tables(inF2))
    genotypes = dataset.getGenotypes()
    cum_pos, chrom_bounds = data_util.estCumPosSorted(
        position=dataset.getPos(), offset=0)

    n_pheno = len(dataset.phenotype_ID)
    for pheno_name in dataset.phenotype_ID[0:n_pheno]:
        # Guard clause: only the requested phenotype is analysed.
        if pheno_name != SigPhe:
            continue

        values, sample_idx = dataset.getPhenotypes([pheno_name])
        ranked = preprocess.rankStandardizeNormal(values.values)
        lm_result = qtl.test_lm(snps=genotypes[sample_idx], pheno=ranked)
        pvalue_table = pd.DataFrame(data=lm_result.pvalues.T,
                                    index=dataset.geno_ID,
                                    columns=[pheno_name])
        manhattonPlot([pheno_name], pvalue_table, 'NMD-QTL-ManhattonPlot',
                      cum_pos, chrom_bounds)
def main():
    """Write cumulative genotype positions and chromosome bounds to text files.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file:  genotype file passed to ``gr.genotype_reader_tables``.
        pheno_file: phenotype file passed to ``phr.pheno_reader_tables``.
        out_dir:    directory receiving ``get_geno_pos.log``,
                    ``position.txt`` (tab-separated, with header) and
                    ``chromBounds.txt`` (comma-delimited).
    """
    geno_file, pheno_file, out_dir = sys.argv[1:]

    log_path = os.path.join(out_dir, 'get_geno_pos.log')
    logger = LoggerFactory.get_logger(log_path)
    LoggerFactory.log_command(logger, sys.argv[1:])

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # Combine both readers so positions refer to the joint dataset.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    raw_position = dataset.getPos()
    cum_position, bounds = data_util.estCumPos(position=raw_position,
                                               offset=100000)

    logger.info('Writing output to directory %s', out_dir)
    # Both outputs are cast to integers before being written.
    cum_position = cum_position.astype(int)
    bounds = bounds.astype(int)

    position_path = os.path.join(out_dir, 'position.txt')
    cum_position.to_csv(position_path, header=True, index=False, sep='\t')

    bounds_path = os.path.join(out_dir, 'chromBounds.txt')
    np.savetxt(bounds_path, bounds, delimiter=",")
import limix.io.genotype_reader as gr
import limix.io.phenotype_reader as phr 
import limix.io.data as data
import scipy as sp
import pylab as pl
import pandas as pd
import limix.io.data_util as data_util
from limix.utils.plot import *
import limix.utils.preprocess as preprocess
import limix.modules.qtl as qtl 
import limix.stats.fdr as fdr 
# Module-level thresholds; neither is used in the statements below.
# NOTE(review): presumably a p-value cut-off and a false-discovery rate --
# confirm against the code that consumes them.
SIG = 0.0000001
FDR = 0.05


# Load the yeast genotype and phenotype tables from the working directory
# and combine them into one QTL dataset.
geno_reader = gr.genotype_reader_tables('Yeast-Genotype.hdf5')
pheno_reader = phr.pheno_reader_tables('Yeast-Phenotype.hdf5')
dataset = data.QTLData(geno_reader=geno_reader,pheno_reader=pheno_reader)
geno = dataset.getGenotypes()
position = dataset.getPos()
# Cumulative positions and chromosome boundaries, with no offset.
pos,chromBounds = data_util.estCumPos(position=position,offset=0)
# The slice [0:P_max] selects every phenotype ID.
P_max = len(dataset.phenotype_ID)
phenotype_ID = dataset.phenotype_ID[0:P_max]
phenotype_vals, sample_idx = dataset.getPhenotypes(phenotype_ID)
# Dimensions; presumably N = individuals, S = SNPs, P = phenotypes -- TODO confirm.
N = geno.shape[0]
S = geno.shape[1]
P = phenotype_vals.shape[1]
# Rescale the raw covariance so that its diagonal averages to 1.
sample_relatedness_unnormalized = dataset.getCovariance(normalize=False)
sample_relatedness  = sample_relatedness_unnormalized/ \
    sample_relatedness_unnormalized.diagonal().mean()
def main():
    """Run an LMM association scan on a slice of phenotypes and plot results.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file:   genotype file passed to ``gr.genotype_reader_tables``.
        pheno_file:  phenotype file passed to ``phr.pheno_reader_tables``.
        norm_mode:   phenotype normalization, one of ``'None'``, ``'RIN'``
                     or ``'boxcox'``; any other value aborts the run.
        panama_file: HDF5 file whose ``'Ktot'`` dataset is used as the
                     covariance matrix ``K`` of the LMM.
        RNA_start, RNA_end: integer slice bounds into
                     ``dataset.phenotype_ID``.
        out_dir:     output directory; ``logs``, ``graphics`` and
                     ``results`` sub-directories are created inside it.

    Outputs, one file per phenotype: p-value text files under
    ``results/lmm_pval`` and PNG plots under ``graphics/`` (phenotype
    histogram, Manhattan plot, phenotype vs. top SNP, p-value histogram
    and Q-Q plot).

    Raises:
        SystemExit: if ``norm_mode`` is not recognized.
    """
    (geno_file, pheno_file, norm_mode, panama_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    logger.info('Loading sample relatedness from %s', panama_file)
    panama_f = h5py.File(panama_file, 'r')
    try:
        # [:] copies the dataset into memory, so the file can be closed.
        Ktot = panama_f['Ktot'][:]
    finally:
        panama_f.close()

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)

    # getting genotypes
    snps = dataset.getGenotypes()  # SNPS
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = dataset.getPhenotypes(phenotype_ID)
    logger.info('Normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, _maxlog = preprocess.boxcox(phenotypes.values)
    else:
        # Actually stop here: continuing would only fail later with a
        # NameError because phenotype_vals was never assigned.
        logger.error('Normalization mode %s is not recognized.  Exit',
                     norm_mode)
        sys.exit(1)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=Ktot, covs=None, verbose=True)
    # convert P-values to a DataFrame for nice output writing
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plots, one per phenotype
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])

        # pick the SNP with the smallest p-value for this phenotype
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # horizontal jitter keeps points of one genotype class from
        # collapsing onto a single vertical line
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): `normed` is the legacy spelling; recent Matplotlib
        # releases only accept `density=True` -- switch once the minimum
        # supported Matplotlib version is confirmed.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        # reference line: density of uniformly distributed p-values
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
import limix.io.genotype_reader as gr
import limix.io.phenotype_reader as phr
import limix.io.data as data
import scipy as sp
import pylab as pl
import pandas as pd
import limix.io.data_util as data_util
from limix.utils.plot import *
import limix.utils.preprocess as preprocess
import limix.modules.qtl as qtl
import limix.stats.fdr as fdr
from PHENO import *
import gzip
import os

# Load the yeast genotype and phenotype tables from the parent directory
# and combine them into one QTL dataset.
geno_reader = gr.genotype_reader_tables('../Yeast-Genotype-noMissing-SA.hdf5')
pheno_reader = phr.pheno_reader_tables('../Yeast-Phenotype-noOutlierSD.hdf5')
dataset = data.QTLData(geno_reader=geno_reader,pheno_reader=pheno_reader)
geno = dataset.getGenotypes()
position = dataset.getPos()
# NOTE(review): the result of this estCumPos call is overwritten two
# statements below by estCumPosSorted; the call looks redundant unless
# estCumPos has side effects -- confirm before removing.
pos,chromBounds = data_util.estCumPos(position=position,offset=0)
#pos,chromBounds = data_util.estCumPosSorted(position=position,offset=0,chroms=['II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI'])
pos,chromBounds = data_util.estCumPosSorted(position=position,offset=0)
P_max = len(dataset.phenotype_ID)
#P_max = 6
# Step of 2: only every second phenotype ID is selected.
phenotype_ID = dataset.phenotype_ID[0:P_max:2]
phenotype_vals, sample_idx = dataset.getPhenotypes(phenotype_ID)
# Dimensions; presumably N = individuals, S = SNPs, P = phenotypes -- TODO confirm.
N = geno.shape[0]
S = geno.shape[1]
P = phenotype_vals.shape[1]
Exemple #9
0
from limix.utils.plot import *
# genotype summary stats
from limix.deprecated.stats.geno_summary import *
import os
import cPickle
import sys
import numpy as np
import pandas as pd

#--------------------#
#### Prepare data ####
#--------------------#

# NOTE(review): `gr`, `phr` and `data` are not imported in the visible part
# of this script; presumably they are the limix.io modules -- confirm.

# Reader instance for genotypes
geno_reader = gr.genotype_reader_tables(
    '/home/hugot/projects/20150501_accessions/genotypes/snp250k/pygwas_genotypes_limix.hdf5'
)

# Reader instance for phenotypes
pheno_reader = phr.pheno_reader_tables(
    '/home/hugot/projects/20150501_accessions/phenotypes/limix/accession_phenotypes_silique_early.hdf5'
)

# Combine genotypes and phenotypes into limix-specific object
dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)

# Get SNPs and phenotypes in respective variables
snps = dataset.getGenotypes()

# getPhenotypes is unpacked into two values elsewhere in this file; only
# the first (the phenotype values) is kept here.
phenotypes = dataset.getPhenotypes(intersection=True)[0]
def main():
    """Run an LMM association scan with optional relatedness and covariates.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file:  genotype file passed to ``gr.genotype_reader_tables``.
        pheno_file: phenotype file passed to ``phr.pheno_reader_tables``.
        norm_mode:  phenotype normalization, ``'None'``, ``'RIN'`` or
                    ``'boxcox'``; any other value is logged and treated
                    like ``'None'``.
        K_file:     tab-separated, header-less sample relatedness matrix
                    whose first column holds the sample IDs, or the string
                    ``'None'`` to run without one.
        cov_file:   tab-separated covariate table (header row, first
                    column as index), or the string ``'None'``.
        RNA_start, RNA_end: integer slice bounds into
                    ``dataset.phenotype_ID``.
        out_dir:    output directory; ``logs``, ``graphics`` and
                    ``results`` sub-directories are created inside it.

    Outputs, one file per phenotype: p-value text files under
    ``results/lmm_pval`` and PNG plots under ``graphics/`` (phenotype
    histogram, Manhattan plot, phenotype vs. top SNP, p-value histogram
    and Q-Q plot), plus a heat-map of the relatedness matrix when one was
    supplied.
    """
    (geno_file, pheno_file, norm_mode, K_file, cov_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG,
        console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    # getting genotypes
    snps = dataset.getGenotypes()  # SNPS
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Sample relatedness %s', K_file)
    logger.info('Loading sample relatedness from %s', K_file)
    if K_file == 'None':
        # No relatedness matrix: there is nothing to plot either
        # (imshow(None) would raise).
        sample_relatedness = None
    else:
        logger.info('Start loading covariance from %s', K_file)
        K_df = pd.read_csv(K_file, sep='\t', header=None,
                           index_col=0)  # accessions x accessions
        # Row labels are formatted as decimal strings and reused as
        # column labels so both axes can be indexed by sample ID.
        K_df.index = ['%d' % i for i in K_df.index]
        K_df.columns = K_df.index
        sample_relatedness = K_df.loc[dataset.sample_ID,
                                      dataset.sample_ID].values

        sample_relatedness_dir = make_sure_path_exists(
            os.path.join(out_graphics_dir, 'sample_relatedness'))
        # Dedicated figure, closed afterwards, so the heat-map does not
        # linger as the "current" figure for the rest of the run.
        fig = plt.figure()
        pl.imshow(sample_relatedness, aspect='auto')
        fig.savefig(os.path.join(sample_relatedness_dir,
                                 'sample_relatedness.png'))
        plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = getPhenotypes(
        dataset.pheno_reader,
        phenotype_IDs=phenotype_ID,
        sample_idx=dataset.sample_idx['pheno'])

    logger.info('Phenotype normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, _maxlog = preprocess.boxcox(phenotypes.values)
    else:
        logger.info('Normalization mode %s is not recognized.  Use None',
                    norm_mode)
        phenotype_vals = phenotypes.values

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Sample covariance %s', cov_file)
    if cov_file == 'None':
        cov = None
    else:
        logger.info('Start loading covariance from %s', cov_file)
        cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
        # Rows are selected by label after add_xvec is applied to the
        # sample IDs (.loc replaces the deprecated .ix indexer).
        cov = cov_df.loc[add_xvec(dataset.sample_ID)].values

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=sample_relatedness,
                       covs=cov,
                       verbose=True)
    # convert P-values to a DataFrame for nice output writing
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T,
                               index=dataset.geno_ID,
                               columns=phenotype_ID)

    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    logger.info('Saving P-values to text file in %s', lmm_pval_dir)
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True,
                                 index=False)

    # Genome-wide Manhattan plots, one per phenotype
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    logger.info('Plotting Manhattan plots in %s', manh_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    logger.info('Plotting phenotype vs. SNP to %s', snp_pheno_dir)
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])

        # pick the SNP with the smallest p-value for this phenotype
        pheno_vals, s_idx = getPhenotypes(
            dataset.pheno_reader,
            phenotype_IDs=[p_ID],
            sample_idx=dataset.sample_idx['pheno'])
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # horizontal jitter keeps points of one genotype class from
        # collapsing onto a single vertical line
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    logger.info('Plotting P-value histograms to %s', pval_hist_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): `normed` is the legacy spelling; recent Matplotlib
        # releases only accept `density=True` -- switch once the minimum
        # supported Matplotlib version is confirmed.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        # reference line: density of uniformly distributed p-values
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    qqplot_dir = make_sure_path_exists(os.path.join(out_graphics_dir,
                                                    'qqplot'))
    logger.info('Plotting Q-Q plots to %s', qqplot_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
Exemple #11
0
    geno_dir = args.geno_dir
    pheno_file = args.pheno_file
    out_file = args.out_file
    cis_window_kb = args.cis_window_kb
    mt_file = args.mt_file

    return geno_dir, pheno_file, out_file, cis_window_kb, mt_file


# Script entry point: read the command-line options, build a sample-by-sample
# matrix from the genotypes in mt_file, then open the first chromosome file.
if __name__ == '__main__':

    geno_dir, pheno_file, out_file, cis_window_kb, mt_file = get_args()

    # Window size converted from kilobases to bases.
    cis_window = 1000 * cis_window_kb

    MT = gr.genotype_reader_tables(mt_file)
    Mpos = MT.getPos()
    Msnps = MT.getGenotypes()
    # Standardize along axis 0 in place (zero mean, unit standard deviation).
    # NOTE(review): presumably rows are samples and columns are variants; a
    # column with zero standard deviation would produce NaN/inf here -- confirm
    # such columns are filtered upstream.
    Msnps -= Msnps.mean(0)
    Msnps /= Msnps.std(0)
    # Cross-product of the standardized genotypes (rows x rows).
    M = sp.dot(Msnps, Msnps.T)
    Mids = MT.sample_ID

    # MTrare = gr.genotype_reader_tables(mt_rare_file)
    # MTrarepos = MTrare.getPos()
    # MTraresnps = MTrare.getGenotypes()
    # MTrareids=MTrare.sample_ID

    # NOTE(review): `chr` shadows the Python builtin of the same name.
    chr = 1
    chrgeno = gr.genotype_reader_tables(geno_dir + "/chrom" + str(chr) + ".h5")
    chrsnps = chrgeno.getGenotypes()
def main():
    """Run LM and LMM association tests for a slice of phenotypes and plot them.

    Positional command-line arguments (``sys.argv[1:]``):
        geno_file  -- genotype file read with ``gr.genotype_reader_tables``
        pheno_file -- phenotype file read with ``phr.pheno_reader_tables``
        cov_file   -- tab-separated covariate table (header row, first
                      column used as index)
        RNA_start  -- first phenotype index of the slice (inclusive)
        RNA_end    -- end phenotype index of the slice (exclusive)
        out_dir    -- output directory

    Files written below ``out_dir``:
        logs/<RNA_start>-<RNA_end>.log
        graphics/sample_relatedness/sample_relatedness_norm.png
        graphics/{phenohist,manhattan,snp_pheno,pval_hist,qqplot,
                  pval_lmvslmm}/<phenotype>.png
        results/{lm_pval,lmm_pval}/<phenotype>.txt

    Raises:
        ValueError: if the number of command-line arguments is not six or
            RNA_start/RNA_end are not integers.
    """
    geno_file, pheno_file, cov_file, RNA_start, RNA_end, out_dir = sys.argv[1:]
    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    # The log file name uses the slice bounds exactly as typed on the
    # command line, so it is built before they are converted to int.
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # The data object allows querying specific genotype or phenotype data.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Calculating sample relatedness')
    # Unnormalized relatedness matrix, rescaled so that its mean diagonal
    # entry equals one.
    sample_relatedness_unnormalized = dataset.getCovariance(normalize=False)
    sample_relatedness = (sample_relatedness_unnormalized /
                          sample_relatedness_unnormalized.diagonal().mean())
    sample_relatedness_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'sample_relatedness'))
    pl.imshow(sample_relatedness, aspect='auto')
    plt.savefig(os.path.join(sample_relatedness_dir,
                             'sample_relatedness_norm.png'))
    # Release the figure that was just saved; every plot below creates
    # (and closes) its own figure.
    plt.close()

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotype_vals, sample_idx = dataset.getPhenotypes(phenotype_ID)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals.values[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start loading covariance from %s', cov_file)
    cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
    # Label-based row selection in dataset sample order. `.loc` / `.values`
    # replace `.ix` / `.as_matrix()`, which newer pandas releases removed.
    cov = cov_df.loc[add_xvec(dataset.sample_ID)].values
    logger.info('Finished')

    logger.info('Start testing: LM')
    lm = qtl.test_lm(snps=snps[sample_idx].astype('int'),
                     pheno=phenotype_vals.values,
                     covs=cov, verbose=True)
    # Convert P-values to a DataFrame (SNPs x phenotypes) for output writing.
    pvalues_lm = pd.DataFrame(data=lm.pvalues.T, index=dataset.geno_ID,
                              columns=phenotype_ID)
    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals.values,
                       K=sample_relatedness, covs=cov, verbose=True)
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lm_pval'))
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lm[p_ID].to_csv(os.path.join(lm_pval_dir, '%s.txt' % p_ID),
                                header=True, index=False)
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plots, LM on top and LMM below.
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plt.subplot(2, 1, 1)
        plot_manhattan(posCum=position['pos_cum'], pv=pvalues_lm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LM' % p_ID)
        plt.subplot(2, 1, 2)
        plot_manhattan(posCum=position['pos_cum'], pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype for the top LM hit of each phenotype.
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])

        # Pick the SNP with the smallest LM P-value for this phenotype.
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        imax = lm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # Horizontal jitter keeps points with equal genotype from overlapping.
        plt.plot(top_snp + 0.05 * np.random.randn(top_snp.shape[0]),
                 pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms; the red line marks the uniform density.
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])

        plt.subplot(1, 2, 1)
        plt.hist(pvalues_lm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")

        plt.subplot(1, 2, 2)
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-quantile plots, LM on the left and LMM on the right.
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])

        plt.subplot(1, 2, 1)
        qqplot(pvalues_lm[p_ID].values)
        plt.title("%s, LM" % p_ID)
        plt.subplot(1, 2, 2)
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)

        fig.savefig(out_file)
        plt.close(fig)

    # Scatter plot of -log10 P-values, LM against LMM, with the diagonal.
    logger.info('Plotting LM vs LMM P-values')
    pval_lmvslmm_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_lmvslmm'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_lmvslmm_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plt.plot(-sp.log10(pvalues_lm[p_ID]), -sp.log10(pvalues_lmm[p_ID]), '.')
        # Extend the identity line to the larger of the two axis limits.
        ymax = max(plt.xlim()[1], plt.ylim()[1])
        plt.plot([0, ymax], [0, ymax], 'k--')
        plt.xlabel('LM')
        plt.ylabel('LMM')
        plt.title(p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
Exemple #13
0
def main():
    """Plot seed size and AT4G26530 expression against SNP genotypes.

    All input and output paths are hard-coded below. For every SNP index
    in the range returned by ``geno_reader.getGenoIndex`` for chromosome 4,
    positions 13393142-13393144, two PNG files are written under the
    ``graphics`` directory: a jittered point plot and a box plot comparing
    samples with genotype value 0 against samples with genotype value 2.
    """

    geno_file = '/gale/netapp/home/shhuang/data/1001_genomes/gmi_release_v3.1/1001genomes_snp-short-indel_only_ACGTN_1kgen_filter3.hdf5'
    pheno_file = '/gale/netapp/home/shhuang/data/1001_genomes/seed_size/accx_size.hdf5'
    expr_file = '/gale/netapp/home/shhuang/projects/1001_genomes/ath1001_tx_norm_2016-02-06/ath1001_tx_norm_2016-02-06-UQ_gNorm_normCounts_k4_vst2T.hdf5'
    out_dir = '/gale/netapp/home/shhuang/projects/1001_genomes/draw_seedsize_plots_2016-11-30'
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    # Common file-name prefix for every figure written by this function.
    graphics_prefix = os.path.join(out_graphics_dir,
                                   'draw_seedsize_plots_2016-11-30-')
    # NOTE(review): results_prefix is not referenced in the plotting code
    # below -- confirm whether it is still needed.
    results_prefix = os.path.join(out_dir, 'draw_seedsize_plots_2016-11-30-')
    logger = LoggerFactory.get_logger(os.path.join(
        out_dir, 'draw_seedsize_plots_2016-11-30.log'),
                                      file_level=logging.DEBUG,
                                      console_level=logging.DEBUG)

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)
    logger.info('Loading expression from %s', expr_file)
    # The expression table is read with the same reader class as the phenotypes.
    expr_reader = phr.pheno_reader_tables(expr_file)
    expr_reader.sample_ID = strip_xvec(expr_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    # Two datasets share one genotype reader: one paired with the seed-size
    # phenotypes, one paired with the expression table.
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    exprset = data.QTLData(geno_reader=geno_reader, pheno_reader=expr_reader)
    # import data
    #phenotypes,sample_idx = dataset.getPhenotypes(intersection=False)

    # All-True masks: request every sample directly from each reader.
    pheno_sample_select = np.ones(pheno_reader.sample_ID.shape[0], dtype=bool)
    phenotypes, pheno_sample_idx = pheno_reader.getPhenotypes(
        sample_idx=pheno_sample_select)
    expr_sample_select = np.ones(expr_reader.sample_ID.shape[0], dtype=bool)
    expr, expr_sample_idx = expr_reader.getPhenotypes(
        sample_idx=expr_sample_select)

    snps = geno_reader.getGenotypes()
    position = geno_reader.getPos()
    position, chromBounds = data_util.estCumPos(position=position, offset=0)
    # SNP index bounds for the three-base window on chromosome 4.
    gid_start, gid_end = geno_reader.getGenoIndex(chrom=4,
                                                  pos_start=(4, 13393142),
                                                  pos_end=(4, 13393144))
    # gid_end is included in the range.
    gid_range = np.arange(gid_start, gid_end + 1)

    for ig, g_ID in enumerate(gid_range):

        # Rebind g_ID to a length-1 slice so it can be passed to np.ix_
        # as a sequence.
        g_ID = gid_range[ig:(ig + 1)]
        print(g_ID)
        # Sample index columns of the two datasets; these do not depend on
        # the loop variable.
        # NOTE(review): presumably they align genotype rows with
        # phenotype/expression rows for shared samples -- confirm against
        # data.QTLData.
        gs_idx = dataset.sample_idx["geno"].values
        ps_idx = dataset.sample_idx["pheno"].values
        egs_idx = exprset.sample_idx["geno"].values
        eps_idx = exprset.sample_idx["pheno"].values

        # np.ix_ selects the chosen sample rows for the single SNP column;
        # [:, 0] drops the column axis.
        snps_sub = snps[np.ix_(gs_idx, g_ID)][:, 0]
        phenotypes_sub = phenotypes.values[ps_idx]
        esnps_sub = snps[np.ix_(egs_idx, g_ID)][:, 0]
        # Expression column AT4G26530, labelled "FBA5 expression" in the plots.
        fba5_sub = expr['AT4G26530'].values[eps_idx]
        # Indexing .iloc with a list keeps a one-row selection
        # (assuming position is a DataFrame -- TODO confirm).
        position_sub = position.iloc[[g_ID[0]]]
        print(position_sub)

        # NOTE(review): %d is applied to one-element column selections, not
        # plain ints -- confirm this formats as intended with the pandas
        # version in use.
        point_file = graphics_prefix + 'point_chr%d_%d.png' % (
            position_sub['chrom'], position_sub['pos'])
        fig = plt.figure(figsize=[5, 2.5])  #create the figure
        plt.subplot(1, 2, 1)
        # Gaussian jitter (sd 0.05) on the x axis separates points that share
        # a genotype value.
        plt.plot(snps_sub + 0.05 * np.random.randn(snps_sub.shape[0]),
                 phenotypes_sub, '.')
        plt.xlabel("SNP")
        plt.ylabel("Seed size")
        plt.subplot(1, 2, 2)
        plt.plot(esnps_sub + 0.05 * np.random.randn(esnps_sub.shape[0]),
                 fba5_sub, '.')
        plt.xlabel("SNP")
        plt.ylabel("FBA5 expression")
        plt.tight_layout()
        fig.savefig(point_file)
        plt.close(fig)

        bxp_file = graphics_prefix + 'bxp_chr%d_%d.png' % (
            position_sub['chrom'], position_sub['pos'])
        fig = plt.figure(figsize=[5, 2.5])  #create the figure
        plt.subplot(1, 2, 1)
        # Only samples with genotype value 0 or 2 enter the box plots; any
        # other genotype value is left out.
        phenotypes_box = [
            phenotypes_sub[snps_sub == 0], phenotypes_sub[snps_sub == 2]
        ]
        plt.boxplot(phenotypes_box)
        plt.xlabel("SNP")
        plt.ylabel("Seed size")
        plt.subplot(1, 2, 2)
        fba5_box = [fba5_sub[esnps_sub == 0], fba5_sub[esnps_sub == 2]]
        plt.boxplot(fba5_box)
        plt.xlabel("SNP")
        plt.ylabel("FBA5 expression")
        plt.tight_layout()
        fig.savefig(bxp_file)
        plt.close(fig)