Ejemplo n.º 1
0
def main():
    """Compute the sample covariance of a genotype file and save it.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_in: genotype file handed to ``gr.genotype_reader_tables``.
        norm_cov: the string ``'1'`` requests a normalized covariance; any
            other value leaves it un-normalized.
        cov_out_hdf5: output HDF5 path (the log file is this path + ``.log``).
        cov_out_csv: output path for the text version of the matrix.

    Raises:
        ValueError: if the number of command-line arguments is not four.
    """
    geno_in, norm_cov, cov_out_hdf5, cov_out_csv = sys.argv[1:]

    logger = LoggerFactory.get_logger(cov_out_hdf5 + '.log')
    LoggerFactory.log_command(logger, sys.argv[1:])

    ## Import genotype data
    logger.info('Loading genotype from %s', geno_in)
    geno_reader = gr.genotype_reader_tables(geno_in)
    norm = (norm_cov == '1')
    logger.info('Normalizing' if norm else 'NOT normalizing')

    sample_relatedness = geno_reader.getCovariance(normalize=norm)

    logger.info('Saving covariance to HDF5 file %s', cov_out_hdf5)
    # The context manager closes the file even if the dump raises, so a
    # failed run does not leave a dangling HDF5 handle behind.
    with h5py.File(cov_out_hdf5, 'w') as o:
        util_functions.smartDumpDictHdf5({'Cov': sample_relatedness}, o)

    logger.info('Saving covariance to CSV file %s', cov_out_csv)
    save_cov_in_text_format(cov_out_csv, sample_relatedness,
                            geno_reader.sample_ID)

    logger.info('Done!')
Ejemplo n.º 2
0
def main():
    """Compute the IBS kinship matrix of a genotype file and save it.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_in: HDF5 genotype file for ``genotype.load_hdf5_genotype_data``.
        scale_kinship: the string ``'1'`` passes the matrix through
            ``kinship.scale_k``; any other value leaves it unscaled.
        kinship_out_hdf5: output HDF5 path (the log file is this path +
            ``.log``).
        kinship_out_csv: output path for the text version of the matrix.

    Raises:
        ValueError: if the number of command-line arguments is not four.
    """
    geno_in, scale_kinship, kinship_out_hdf5, kinship_out_csv = sys.argv[1:]

    logger = LoggerFactory.get_logger(kinship_out_hdf5 + '.log')
    LoggerFactory.log_command(logger, sys.argv[1:])

    ## Import genotype data
    geno = genotype.load_hdf5_genotype_data(geno_in)
    logger.info('Finished reading SNP from %s', geno_in)

    logger.info('Start calculating kinship')
    K = geno.get_ibs_kinship_matrix()
    if scale_kinship == '1':
        logger.info('Scaling')
        K = kinship.scale_k(K)
    else:
        logger.info('NOT scaling')

    logger.info('Saving kinship to HDF5 file %s', kinship_out_hdf5)
    kinship.save_kinship_to_file(kinship_out_hdf5, K, geno.accessions,
                                 geno.num_snps)
    logger.info('Saving kinship to CSV file %s', kinship_out_csv)
    save_kinship_in_text_format(kinship_out_csv, K, geno.accessions)

    logger.info('Done!')
def main():
    """Convert DMR and/or methylation-site tsv files into one HDF5 file.

    Options:
        -O/--outfile: output HDF5 file (default ``example_out``); the log
            file is this path + ``.log``.
        -R/--dmr: DMR tsv file to convert with ``convert_dmr_tsv``.
        -S/--dms: methylation-site tsv file to convert with
            ``convert_dms_tsv``.

    Either conversion is skipped when its option is not given.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-O",
                      "--outfile",
                      action="store",
                      dest='outfile',
                      type=str,
                      help='The output hdf5 file wiht the resulting data',
                      default="example_out")
    parser.add_option("-R",
                      "--dmr",
                      action="store",
                      dest='dmr',
                      help="Read DMR tsv file (filename)",
                      default=None)
    parser.add_option("-S",
                      "--dms",
                      action="store",
                      dest='dms',
                      help="Read methylation site tsv file (filename)",
                      default=None)

    options, _ = parser.parse_args()
    logger = LoggerFactory.get_logger(options.outfile + '.log')
    LoggerFactory.log_command(logger, sys.argv)

    # Open read/write-or-create explicitly: h5py 3.0 changed the implicit
    # default mode to read-only, which would make the converters fail.  The
    # context manager also guarantees the file is flushed and closed.
    with h5py.File(options.outfile, 'a') as hdf:
        if options.dmr is not None:
            logger.info('Converting DMR tsv %s and write to %s', options.dmr,
                        options.outfile)
            convert_dmr_tsv(hdf,
                            options.dmr,
                            chrom=None,
                            start=None,
                            end=None,
                            sample_subset=None)
        if options.dms is not None:
            logger.info('Converting DMS tsv %s and write to %s', options.dms,
                        options.outfile)
            convert_dms_tsv(hdf,
                            options.dms,
                            chrom=None,
                            start=None,
                            end=None,
                            sample_subset=None)

    logger.info('Done!')
Ejemplo n.º 4
0
def main():
    """Subset a genotype file to listed accessions and filter SNPs by MAF.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_in: HDF5 genotype file for ``genotype.load_hdf5_genotype_data``.
        acc_in: CSV file listing the accessions to keep; the first column of
            every non-empty row is used.
        maf_lb, maf_ub: MAF bounds (converted to float) passed to
            ``filter_maf_snps``.
        geno_out: output HDF5 path (the log file is this path + ``.log``).

    Raises:
        ValueError: if the argument count is not five or a bound is not a
            valid float.
    """
    geno_in, acc_in, maf_lb, maf_ub, geno_out = sys.argv[1:]

    logger = LoggerFactory.get_logger(geno_out + '.log')
    LoggerFactory.log_command(logger, sys.argv[1:])

    maf_lb, maf_ub = float(maf_lb), float(maf_ub)

    ## Import genotype data
    geno = genotype.load_hdf5_genotype_data(geno_in)
    SNP_acc = geno.accessions
    logger.info('Finished reading SNP from %s', geno_in)

    ## accession subset
    # csv.reader yields one list per row; take the first cell so accession
    # IDs are compared with accession IDs rather than with whole rows.
    # NOTE(review): 'rb' is the Python 2 csv idiom; under Python 3 csv.reader
    # needs a text-mode file -- confirm the target interpreter.
    with open(acc_in, 'rb') as f:
        file_acc = set(row[0] for row in csv.reader(f) if row)
    logger.info('Finished reading accession subset from %s', acc_in)

    ## indices of the common accessions, in genotype order
    # Remember the first position of every genotype accession so each lookup
    # is O(1) instead of a list scan.
    first_ix = {}
    for ix, acc in enumerate(SNP_acc):
        first_ix.setdefault(acc, ix)
    keep_ix = [first_ix[acc] for acc in SNP_acc if acc in file_acc]

    ## filtering
    logger.info(
        'Start subsetting accessions and filtering SNPs by MAF >%f and <=%f',
        maf_lb, maf_ub)
    geno.filter_accessions_ix(keep_ix)
    (num_snps, num_removed) = filter_maf_snps(geno, maf_lb, maf_ub)
    logger.info('Removed %d from %d SNPs', num_removed, num_snps)
    logger.info('Number of SNPs remaining %d', geno.num_snps)

    logger.info('Start writing filtered genotype file to %s', geno_out)
    geno.save_as_hdf5(geno_out)
    logger.info('Finished')

    logger.info('Done!')
Ejemplo n.º 5
0
def main():
    """Write cumulative SNP positions and chromosome bounds as text files.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file: genotype file for ``gr.genotype_reader_tables``.
        pheno_file: phenotype file for ``phr.pheno_reader_tables``.
        out_dir: directory that receives ``get_geno_pos.log``,
            ``position.txt`` and ``chromBounds.txt``.
    """
    geno_file, pheno_file, out_dir = sys.argv[1:]

    log_path = os.path.join(out_dir, 'get_geno_pos.log')
    logger = LoggerFactory.get_logger(log_path)
    LoggerFactory.log_command(logger, sys.argv[1:])

    logger.info('Loading genotype from %s', geno_file)
    genotypes = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    phenotypes = phr.pheno_reader_tables(pheno_file)
    phenotypes.sample_ID = strip_xvec(phenotypes.sample_ID)

    # The QTL dataset ties genotype and phenotype readers together; only the
    # SNP positions are needed from it here.
    logger.info('Creating QTL dataset')
    qtl_data = data.QTLData(geno_reader=genotypes, pheno_reader=phenotypes)
    raw_pos = qtl_data.getPos()
    cum_pos, bounds = data_util.estCumPos(position=raw_pos, offset=100000)

    logger.info('Writing output to directory %s', out_dir)
    cum_pos_int = cum_pos.astype(int)
    bounds_int = bounds.astype(int)

    position_path = os.path.join(out_dir, 'position.txt')
    cum_pos_int.to_csv(position_path, header=True, index=False, sep='\t')

    bounds_path = os.path.join(out_dir, 'chromBounds.txt')
    np.savetxt(bounds_path, bounds_int, delimiter=",")
def main():
    """Run an LMM association scan on a slice of phenotypes and plot results.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file: genotype file for ``gr.genotype_reader_tables``.
        pheno_file: phenotype file for ``phr.pheno_reader_tables``.
        norm_mode: phenotype normalization, one of ``'None'``, ``'RIN'`` or
            ``'boxcox'``; any other value aborts the run.
        panama_file: HDF5 file whose ``'Ktot'`` dataset is passed as ``K`` to
            ``qtl.test_lmm``.
        RNA_start, RNA_end: integer slice bounds into
            ``dataset.phenotype_ID``.
        out_dir: output root; ``logs``, ``graphics`` and ``results``
            sub-directories are created below it.

    For every selected phenotype the P-values are written to
    ``results/lmm_pval/<id>.txt`` and one PNG per plot type is written below
    ``graphics``.

    Raises:
        SystemExit: if ``norm_mode`` is not recognized.
    """
    (geno_file, pheno_file, norm_mode, panama_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)
    logger.info('Loading sample relatedness from %s', panama_file)
    # ``[:]`` copies the dataset into memory, so the file can be closed
    # right away instead of staying open for the whole run.
    with h5py.File(panama_file, 'r') as panama_f:
        Ktot = panama_f['Ktot'][:]

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)

    # getting genotypes
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = dataset.getPhenotypes(phenotype_ID)
    logger.info('Normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, _ = preprocess.boxcox(phenotypes.values)
    else:
        # Actually stop here: continuing would only fail later with a
        # NameError on the never-assigned ``phenotype_vals``.
        logger.error('Normalization mode %s is not recognized.  Exit',
                     norm_mode)
        sys.exit(1)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=Ktot, covs=None, verbose=True)
    # P-values as a DataFrame (SNPs x phenotypes) for convenient output.
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)

    logger.info('Saving P-values to text file')
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plot, one per phenotype
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])

        # pick the SNP with the smallest P-value for this phenotype
        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # horizontal jitter keeps samples with equal genotype distinguishable
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): newer Matplotlib releases replace ``normed`` with
        # ``density``; kept as-is for the Matplotlib this script targets --
        # confirm before upgrading.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
Ejemplo n.º 7
0
def main():
    """Write a copy of a genome FASTA file with methylated cytosines marked.

    Positional arguments:
        genome: input FASTA file.
        allc_h5: HDF5 store read with ``read_hdf``; one ``allc_<chrom>``
            table per chromosome with ``pos``, ``strand`` and ``context``
            columns, filtered on ``mcall==1``.
        genome_mod: output FASTA file (the log file is this path + ``.log``).

    Options:
        --zero: table positions are already 0-based; otherwise they are
            treated as 1-based and shifted by one.

    A methylated ``C``/``c`` on the ``+`` strand is replaced by ``m`` and a
    methylated ``G``/``g`` on the ``-`` strand by ``1``.  Positions outside
    the sequence, or whose base does not match the strand, are logged and
    left untouched.
    """
    parser = argparse.ArgumentParser(
        description='Modified genome FASTA file by 5mC')
    parser.add_argument('genome')
    parser.add_argument('allc_h5')
    parser.add_argument('genome_mod')
    parser.add_argument('--zero',
                        help='whether the allc table is 0-index',
                        action='store_true')
    args = parser.parse_args()
    genome, allc_h5, genome_mod = args.genome, args.allc_h5, args.genome_mod
    offset = 0 if args.zero else 1  # allc table positions are 1-based

    logger = LoggerFactory.get_logger(genome_mod + '.log')
    LoggerFactory.log_command(logger, sys.argv)
    logger.info('Getting unmodified genome FASTA file from %s', genome)

    coord_pat = re.compile(r'^(chr)?(.+)$')
    fa_list = list(SeqIO.parse(genome, "fasta"))
    fa_dict = dict((r.id, r.seq) for r in fa_list)
    # Map the bare chromosome name (without an optional 'chr' prefix) back to
    # the FASTA record id, e.g. '1' -> 'chr1' or '1' -> '1'.
    fa_key_by_chrom = dict()
    for k in fa_dict:
        coord_mat = coord_pat.match(k)
        chrom = coord_mat.group(2)
        fa_key_by_chrom[chrom] = k

    logger.info('Chromosomes: %s', fa_key_by_chrom.keys())
    logger.info('Modifying FASTA by all_c %s', allc_h5)
    fa_dict_out = dict()
    for chrom, fa_key in fa_key_by_chrom.items():
        logger.info('Reading allc table for chromosome %s from %s', chrom,
                    allc_h5)
        hdf = read_hdf(allc_h5,
                       'allc_' + str(chrom),
                       where=['mcall==1'],
                       columns=['pos', 'strand', 'context'])
        mc_pos, mc_strand, mc_type = hdf['pos'], hdf['strand'], hdf['context']
        logger.info('Number of methylated C: %d', len(mc_pos))

        seq = fa_dict[fa_key].tomutable()
        seq_len = len(seq)
        for p, s, t in zip(mc_pos, mc_strand, mc_type):
            p0 = p - offset
            # Valid indices are 0 .. seq_len - 1.  ``p0 == seq_len`` would
            # raise IndexError and a negative ``p0`` would silently edit the
            # sequence from its end, so both are rejected.
            if p0 < 0 or p0 >= seq_len:
                logger.info('Index out of bound for %s: mc_pos %d, len %d',
                            fa_key, p0, seq_len)
                continue
            base = seq[p0]
            if s == '+':
                if base in ('C', 'c'):
                    seq[p0] = 'm'
                else:
                    logger.warning(
                        'Sequence %s: expected to see C/c (%s:%s) at %d, but saw %s',
                        fa_key, t, s, p0, base)
            elif s == '-':
                if base in ('G', 'g'):
                    seq[p0] = '1'
                else:
                    logger.warning(
                        'Sequence %s: expected to see G/g (%s:%s) at %d, but saw %s',
                        fa_key, t, s, p0, base)
            else:
                logger.warning('Sequence %s: unrecognized strand %s at %d',
                               fa_key, s, p0)
        fa_dict_out[fa_key] = seq

    for r in fa_list:
        r.seq = fa_dict_out[r.id]
    with open(genome_mod, 'w') as ofa:
        SeqIO.write(fa_list, ofa, 'fasta')

    logger.info('Done!')
def main():
    """Run an LMM association scan with optional relatedness and covariates.

    Command-line arguments (``sys.argv[1:]``), in order:
        geno_file: genotype file for ``gr.genotype_reader_tables``.
        pheno_file: phenotype file for ``phr.pheno_reader_tables``.
        norm_mode: phenotype normalization, ``'None'``, ``'RIN'`` or
            ``'boxcox'``; any other value falls back to no normalization.
        K_file: tab-separated, header-less sample relatedness matrix whose
            first column holds the sample IDs, or the string ``'None'``.
        cov_file: tab-separated covariate table with a header row and the
            sample IDs in the first column, or the string ``'None'``.
        RNA_start, RNA_end: integer slice bounds into
            ``dataset.phenotype_ID``.
        out_dir: output root; ``logs``, ``graphics`` and ``results``
            sub-directories are created below it.

    For every selected phenotype the P-values are written to
    ``results/lmm_pval/<id>.txt`` and one PNG per plot type is written below
    ``graphics``.
    """
    (geno_file, pheno_file, norm_mode, K_file, cov_file,
     RNA_start, RNA_end, out_dir) = sys.argv[1:]

    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    # The log name uses the slice bounds exactly as typed on the command line.
    logger = LoggerFactory.get_logger(os.path.join(
        log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
                                      file_level=logging.DEBUG,
                                      console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # the data object allows to query specific genotype or phenotype data
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    # getting genotypes
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Sample relatedness %s', K_file)
    logger.info('Loading sample relatedness from %s', K_file)
    if K_file == 'None':
        sample_relatedness = None
    else:
        logger.info('Start loading covariance from %s', K_file)
        K_df = pd.read_csv(K_file, sep='\t', header=None,
                           index_col=0)  # accessions x accessions
        # Row labels are read as integers; turn them into strings and reuse
        # them as column labels so the matrix can be indexed by sample ID.
        K_df.index = ['%d' % i for i in K_df.index]
        K_df.columns = K_df.index
        sample_relatedness = K_df.loc[dataset.sample_ID,
                                      dataset.sample_ID].values

    # Only draw the matrix when one was loaded; imshow cannot render None.
    if sample_relatedness is not None:
        sample_relatedness_dir = make_sure_path_exists(
            os.path.join(out_graphics_dir, 'sample_relatedness'))
        fig = plt.figure()
        pl.imshow(sample_relatedness, aspect='auto')
        fig.savefig(
            os.path.join(sample_relatedness_dir, 'sample_relatedness.png'))
        plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotypes, sample_idx = getPhenotypes(
        dataset.pheno_reader,
        phenotype_IDs=phenotype_ID,
        sample_idx=dataset.sample_idx['pheno'])

    logger.info('Phenotype normalization: %s', norm_mode)
    if norm_mode == 'None':
        phenotype_vals = phenotypes.values
    elif norm_mode == 'RIN':
        phenotype_vals = preprocess.rankStandardizeNormal(phenotypes.values)
    elif norm_mode == 'boxcox':
        phenotype_vals, _ = preprocess.boxcox(phenotypes.values)
    else:
        logger.info('Normalization mode %s is not recognized.  Use None',
                    norm_mode)
        phenotype_vals = phenotypes.values

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info(
        'Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
        N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Sample covariance %s', cov_file)
    if cov_file == 'None':
        cov = None
    else:
        logger.info('Start loading covariance from %s', cov_file)
        cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
        # Select the covariate rows by sample label, in dataset order.
        cov = cov_df.loc[add_xvec(dataset.sample_ID)].values

    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps[sample_idx].astype('int'),
                       pheno=phenotype_vals,
                       K=sample_relatedness,
                       covs=cov,
                       verbose=True)
    # P-values as a DataFrame (SNPs x phenotypes) for convenient output.
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T,
                               index=dataset.geno_ID,
                               columns=phenotype_ID)

    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    logger.info('Saving P-values to text file in %s', lmm_pval_dir)
    for p_ID in phenotype_ID:
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True,
                                 index=False)

    # Genome-wide Manhattan plot, one per phenotype
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    logger.info('Plotting Manhattan plots in %s', manh_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds,
                       thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    logger.info('Plotting phenotype vs. SNP to %s', snp_pheno_dir)
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])

        # pick the SNP with the smallest P-value for this phenotype
        pheno_vals, s_idx = getPhenotypes(
            dataset.pheno_reader,
            phenotype_IDs=[p_ID],
            sample_idx=dataset.sample_idx['pheno'])
        imax = lmm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # horizontal jitter keeps samples with equal genotype distinguishable
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    logger.info('Plotting P-value histograms to %s', pval_hist_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        # NOTE(review): newer Matplotlib releases replace ``normed`` with
        # ``density``; kept as-is for the Matplotlib this script targets --
        # confirm before upgrading.
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    qqplot_dir = make_sure_path_exists(os.path.join(out_graphics_dir,
                                                    'qqplot'))
    logger.info('Plotting Q-Q plots to %s', qqplot_dir)
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
def main():
    """Run LM and LMM association tests for a slice of phenotypes and plot them.

    Command-line arguments (``sys.argv[1:]``, all six required, in order):
        geno_file  -- genotype file handed to ``gr.genotype_reader_tables``
        pheno_file -- phenotype file handed to ``phr.pheno_reader_tables``
        cov_file   -- tab-separated covariate table; rows are selected by
                      ``add_xvec(dataset.sample_ID)``
        RNA_start  -- start index (inclusive) into ``dataset.phenotype_ID``
        RNA_end    -- end index (exclusive) into ``dataset.phenotype_ID``
        out_dir    -- output root; ``logs``, ``graphics`` and ``results``
                      sub-directories are created below it

    Outputs written below ``out_dir``:
        logs/<RNA_start>-<RNA_end>.log
        results/lm_pval/<phenotype>.txt, results/lmm_pval/<phenotype>.txt
        graphics/sample_relatedness/sample_relatedness_norm.png
        graphics/{phenohist,manhattan,snp_pheno,pval_hist,qqplot,
                  pval_lmvslmm}/<phenotype>.png

    Raises:
        ValueError: if the number of arguments is not six, or if RNA_start /
            RNA_end cannot be parsed as integers.
    """
    geno_file, pheno_file, cov_file, RNA_start, RNA_end, out_dir = sys.argv[1:]
    make_sure_path_exists(out_dir)
    log_dir = make_sure_path_exists(os.path.join(out_dir, 'logs'))
    logger = LoggerFactory.get_logger(
        os.path.join(log_dir, '%s-%s.log' % (RNA_start, RNA_end)),
        file_level=logging.DEBUG, console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])
    logger.info('Output directory: %s', out_dir)

    RNA_start, RNA_end = int(RNA_start), int(RNA_end)
    out_graphics_dir = make_sure_path_exists(os.path.join(out_dir, 'graphics'))
    out_results_dir = make_sure_path_exists(os.path.join(out_dir, 'results'))

    logger.info('Loading genotype from %s', geno_file)
    geno_reader = gr.genotype_reader_tables(geno_file)
    logger.info('Loading phenotype from %s', pheno_file)
    pheno_reader = phr.pheno_reader_tables(pheno_file)
    pheno_reader.sample_ID = strip_xvec(pheno_reader.sample_ID)

    # The data object allows querying specific genotype or phenotype data.
    logger.info('Creating QTL dataset')
    dataset = data.QTLData(geno_reader=geno_reader, pheno_reader=pheno_reader)
    snps = dataset.getGenotypes()
    position = dataset.getPos()
    position, chromBounds = data_util.estCumPos(position=position,
                                                offset=100000)

    logger.info('Calculating sample relatedness')
    # Normalize the raw relatedness matrix by its mean diagonal value.
    sample_relatedness_unnormalized = dataset.getCovariance(normalize=False)
    sample_relatedness = (sample_relatedness_unnormalized /
                          sample_relatedness_unnormalized.diagonal().mean())
    sample_relatedness_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'sample_relatedness'))
    # Draw on an explicit figure so it can be closed once saved.
    fig = plt.figure()
    pl.imshow(sample_relatedness, aspect='auto')
    fig.savefig(os.path.join(sample_relatedness_dir,
                             'sample_relatedness_norm.png'))
    plt.close(fig)

    logger.info('Subset phenotype to index %d-%d', RNA_start, RNA_end)
    phenotype_ID = dataset.phenotype_ID[RNA_start:RNA_end]
    phenotype_vals, sample_idx = dataset.getPhenotypes(phenotype_ID)

    N = snps.shape[0]  # number of individuals
    S = snps.shape[1]  # number of SNPs
    P = phenotype_vals.shape[1]  # number of phenotypes
    logger.info('Number of individuals: %d; number of SNPs: %d; number of phenotypes: %d',
                N, S, P)

    logger.info('Plotting phenotype histograms')
    phenohist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'phenohist'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(phenohist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plot_normal(phenotype_vals.values[:, ip], alpha=0.8, figure=fig)
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Start loading covariance from %s', cov_file)
    cov_df = pd.read_csv(cov_file, sep='\t', header=0, index_col=0)
    # .loc/.values replace DataFrame.ix/.as_matrix(), which were removed in
    # pandas 1.0; both replacements also exist in old pandas releases.
    cov = cov_df.loc[add_xvec(dataset.sample_ID)].values
    logger.info('Finished')

    # NOTE(review): SNPs are restricted through sample_idx while cov and
    # sample_relatedness are built for dataset.sample_ID -- confirm that these
    # line up when sample_idx drops samples.
    # Convert the genotype matrix once and share it between both tests.
    snps_tested = snps[sample_idx].astype('int')

    logger.info('Start testing: LM')
    lm = qtl.test_lm(snps=snps_tested, pheno=phenotype_vals.values,
                     covs=cov, verbose=True)
    # Convert P-values to a DataFrame for nice output writing.
    pvalues_lm = pd.DataFrame(data=lm.pvalues.T, index=dataset.geno_ID,
                              columns=phenotype_ID)
    logger.info('Start testing: LMM')
    lmm = qtl.test_lmm(snps=snps_tested, pheno=phenotype_vals.values,
                       K=sample_relatedness, covs=cov, verbose=True)
    pvalues_lmm = pd.DataFrame(data=lmm.pvalues.T, index=dataset.geno_ID,
                               columns=phenotype_ID)
    del snps_tested  # no longer needed; the plots below index `snps` directly

    logger.info('Saving P-values to text file')
    lm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lm_pval'))
    lmm_pval_dir = make_sure_path_exists(
        os.path.join(out_results_dir, 'lmm_pval'))
    for p_ID in phenotype_ID:
        pvalues_lm[p_ID].to_csv(os.path.join(lm_pval_dir, '%s.txt' % p_ID),
                                header=True, index=False)
        pvalues_lmm[p_ID].to_csv(os.path.join(lmm_pval_dir, '%s.txt' % p_ID),
                                 header=True, index=False)

    # Genome-wide Manhattan plots, one figure per phenotype (LM on top,
    # LMM below).
    logger.info('Plotting Manhattan plots')
    manh_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'manhattan'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(manh_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[12, 8])
        plt.subplot(2, 1, 1)
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LM' % p_ID)
        plt.subplot(2, 1, 2)
        plot_manhattan(posCum=position['pos_cum'],
                       pv=pvalues_lmm[p_ID].values,
                       chromBounds=chromBounds, thr_plotting=0.05)
        plt.title('%s, LMM' % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # SNP vs. phenotype
    logger.info('Plotting phenotype vs. SNP')
    snp_pheno_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'snp_pheno'))
    for ip, p_ID in enumerate(phenotype_ID):
        out_file = os.path.join(snp_pheno_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])

        pheno_vals, s_idx = dataset.getPhenotypes([p_ID])
        # Index of the SNP with the smallest LM P-value for this phenotype.
        imax = lm.pvalues[ip].argmin()
        top_snp = snps[s_idx, imax]
        # Jitter the genotype values horizontally so overlapping points
        # remain visible.
        jitter = 0.05 * np.random.randn(top_snp.shape[0])
        plt.plot(top_snp + jitter, pheno_vals.values, '.', alpha=0.5)
        plt.xlabel("SNP")
        plt.ylabel("phenotype")
        plt.xlim([-0.5, 2.5])
        plt.title("%s" % p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    # P-value histograms
    # NOTE(review): `normed` was removed from Axes.hist in Matplotlib 3.1;
    # switch to `density=True` once Matplotlib >= 2.1 is guaranteed.
    logger.info('Plotting P-value histograms')
    pval_hist_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_hist'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_hist_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])

        plt.subplot(1, 2, 1)
        plt.hist(pvalues_lm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")  # reference line at density 1
        plt.title("%s, LM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")

        plt.subplot(1, 2, 2)
        plt.hist(pvalues_lmm[p_ID].values, 20, normed=True)
        plt.plot([0, 1], [1, 1], "r")
        plt.title("%s, LMM" % p_ID)
        plt.xlabel("P-value")
        plt.ylabel("Frequency")
        fig.savefig(out_file)
        plt.close(fig)

    # Quantile-Quantile plots
    logger.info('Plotting Q-Q plots')
    qqplot_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'qqplot'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(qqplot_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[7, 3])

        plt.subplot(1, 2, 1)
        qqplot(pvalues_lm[p_ID].values)
        plt.title("%s, LM" % p_ID)
        plt.subplot(1, 2, 2)
        qqplot(pvalues_lmm[p_ID].values)
        plt.title("%s, LMM" % p_ID)

        fig.savefig(out_file)
        plt.close(fig)

    # P-value scatter plot: -log10(P) of LM against LMM with a diagonal guide.
    logger.info('Plotting LM vs LMM P-values')
    pval_lmvslmm_dir = make_sure_path_exists(
        os.path.join(out_graphics_dir, 'pval_lmvslmm'))
    for p_ID in phenotype_ID:
        out_file = os.path.join(pval_lmvslmm_dir, '%s.png' % p_ID)
        fig = plt.figure(figsize=[3, 3])
        plt.plot(-sp.log10(pvalues_lm[p_ID]), -sp.log10(pvalues_lmm[p_ID]),
                 '.')
        ymax = max(plt.xlim()[1], plt.ylim()[1])
        plt.plot([0, ymax], [0, ymax], 'k--')
        plt.xlabel('LM')
        plt.ylabel('LMM')
        plt.title(p_ID)
        fig.savefig(out_file)
        plt.close(fig)

    logger.info('Done with all plots!')

    logger.info('Done!')
Ejemplo n.º 10
0
def main():
    """Load genotype, expression, kinship and covariates, then run an LMM chunk.

    Command-line arguments (``sys.argv[1:]``, all seven required, in order):
        geno_hdf5    -- genotype file for ``genotype.load_hdf5_genotype_data``
        kinship_hdf5 -- kinship file for ``kinship.load_kinship_from_file``
        RNA_csv      -- tab-separated expression table, genes x accessions
        COV_file     -- tab-separated covariate table; rows are selected by
                        accession label
        RNA_start    -- start index (inclusive) of the genes to test
        RNA_end      -- end index (exclusive) of the genes to test
        out_file     -- output path passed to ``run_lmm_chunk``; the log is
                        written to ``out_file + '.log'``

    Genotype accessions are prefixed with 'X' before being matched against the
    column labels of the expression table; only accessions present in both
    are kept, in genotype order.

    Raises:
        ValueError: if the number of arguments is not seven, or if RNA_start /
            RNA_end cannot be parsed as integers.
    """
    (geno_hdf5, kinship_hdf5, RNA_csv, COV_file,
     RNA_start, RNA_end, out_file) = sys.argv[1:]
    RNA_start, RNA_end = int(RNA_start), int(RNA_end)

    logger = LoggerFactory.get_logger(out_file + '.log',
                                      file_level=logging.DEBUG,
                                      console_level=logging.DEBUG)
    LoggerFactory.log_command(logger, sys.argv[1:])

    def match(a, b):
        """Return, for each item of `a`, its first position in `b` or None."""
        first_pos = {}
        for pos, item in enumerate(b):
            first_pos.setdefault(item, pos)
        return [first_pos.get(item) for item in a]

    ## Import genotype data
    logger.info('Start reading SNP from %s', geno_hdf5)
    geno = genotype.load_hdf5_genotype_data(geno_hdf5)
    SNP_accx = ['X' + acc for acc in geno.accessions]
    logger.info('Finished reading SNP from %s', geno_hdf5)

    ## Import phenotype data
    # The original logged 'Finished reading RNA' here, before reading began.
    logger.info('Start reading RNA from %s', RNA_csv)
    RNA_df = pd.read_csv(RNA_csv, sep='\t', header=0,
                         index_col=0)  # genes x accessions
    RNA_accx = list(RNA_df.columns.values)
    RNA_genes = list(RNA_df.index)
    logger.info('Finished reading RNA from %s', RNA_csv)

    ## Get common accessions in the same order for genotype, phenotype and
    ## phenotype covariates
    logger.info('Consolidate accessions from genotype and RNA file')
    RNA_accx_set = set(RNA_accx)
    accx_common = [accx for accx in SNP_accx if accx in RNA_accx_set]
    # .loc/.values replace DataFrame.ix/.as_matrix(), which were removed in
    # pandas 1.0; both replacements also exist in old pandas releases.
    RNA = RNA_df.loc[:, accx_common].values.T  # accessions x genes
    geno.filter_accessions_ix(match(accx_common, SNP_accx))
    logger.info(
        'Number of accessions: genotype file %d, RNA file %d, common %d',
        len(SNP_accx), len(RNA_accx), len(accx_common))

    logger.info('Start building SNP matrix in memory')
    # Materialize the chunks first: newer NumPy rejects non-sequence iterables
    # such as generators in vstack.
    snps = np.vstack(list(geno.get_snps_iterator(is_chunked=True)))
    snps = snps.T.astype(int)
    logger.info('Finished')

    logger.info('Start loading kinship matrix from %s', kinship_hdf5)
    load_k = kinship.load_kinship_from_file(kinship_hdf5, scaled=False)
    K0 = load_k['k'].astype(float)
    K_accx = ['X' + acc for acc in load_k['accessions']]
    # Reorder/subset the kinship matrix to the common accessions on both axes.
    K_pos = match(accx_common, K_accx)
    K_accx_ix = np.ix_(K_pos, K_pos)
    K = K0[K_accx_ix]
    logger.info('Finished')

    logger.info('Start loading covariance from %s', COV_file)
    COV_df = pd.read_csv(COV_file, sep='\t', header=0, index_col=0)
    COV = COV_df.loc[accx_common].values
    logger.info('Finished')

    logger.info('Start association testing: RNA start %d, RNA end %d',
                RNA_start, RNA_end)

    run_lmm_chunk(snps, RNA, COV, RNA_start, RNA_end,
                  list(RNA_genes[RNA_start:RNA_end]), K, out_file)