def add_phenotype_file(self, ecotype_ids, phen_file_name=None, file_object=None, transformation='raw', transformation_description=None):
    """
    Parse a phenotype file and store every phenotype it contains.

    The file (given by name and/or as a file object) is parsed, restricted to
    `ecotype_ids`, and each phenotype is registered with `_init_phenotype_`
    and its values stored with `_add_phenotype_values_` under the given
    `transformation` / `transformation_description`.

    A phenotype that already exists is NOT overwritten; that is reported as an
    error for that phenotype.  Any exception raised while storing a single
    phenotype is recorded in the result and the remaining phenotypes are
    still processed.

    Returns a dict with the keys 'status' ('OK', or 'ERROR' if at least one
    phenotype failed) and 'statustext' (the concatenated error messages).
    """
    retval = {'status': 'OK', 'statustext': ''}
    phed = pd.parse_phenotype_file(phen_file_name, file_object, with_db_ids=False)
    phed.filter_ecotypes_2(ecotype_ids)

    # None of this metadata is derived from the file; it is passed on as None.
    growth_conditions = None
    phenotype_scoring = None
    method_description = None
    measurement_scale = None
    is_binary = None

    for pid in phed.phen_dict:
        phen_vals = phed.get_values(pid)
        ecotypes = phed.get_ecotypes(pid)
        phen_name = phed.get_name(pid)
        try:
            if self._check_phenotype_exists_(phen_name):
                raise Exception("Phenotype %s already exists, delete it first to upload it again.<br>" % phen_name)
            # The second element of the returned tuple is not needed here.
            bs_herits, _, bs_avg_herits, bs_herit_pvals = phed.get_broad_sense_heritability()
            self._init_phenotype_(phen_name, num_vals=len(phen_vals), std_dev=sp.std(phen_vals),
                                  growth_conditions=growth_conditions,
                                  phenotype_scoring=phenotype_scoring,
                                  method_description=method_description,
                                  measurement_scale=measurement_scale,
                                  bs_herit_pvals=bs_herit_pvals,
                                  bs_avg_herits=bs_avg_herits,
                                  bs_herits=bs_herits,
                                  is_binary=is_binary)
            self._add_phenotype_values_(phen_name, ecotypes, phen_vals,
                                        transformation=transformation,
                                        transformation_description=transformation_description)
        except Exception as err:
            # Best effort: remember the failure and carry on with the next phenotype.
            retval['status'] = "ERROR"
            retval['statustext'] += str(err)

    # Hand the collected status back to the caller.
    return retval
def load_a_thaliana_phenotypes():
    """
    Parse the A. thaliana phenotype file (Atwell et al., 2010).

    Returns the phenotype data object that phenotypeData.parse_phenotype_file
    builds from 'at_data/199_phenotypes.csv' (path relative to the current
    working directory).

    NOTE(review): this data set has been described as holding 107 phenotypes,
    while the file name says 199 -- confirm which is right.
    """
    import phenotypeData

    return phenotypeData.parse_phenotype_file('at_data/199_phenotypes.csv')
def run_gwas(pid, call_method_id, run_id, kinship_method, debug_filter=1):
    """
    Run the local-vs-global mixed model analyses for a single phenotype.

    Steps, in order:
      1. load the SNP data for `call_method_id` (sub-sampled when
         `debug_filter` < 1) and the phenotype file from env.env['phen_dir'];
      2. average the phenotype values, coordinate genotypes with phenotype
         `pid` and apply the 'most_normal' transformation;
      3. compute the global kinship matrix and plot a phenotype histogram
         annotated with the pseudo-heritability;
      4. run the chromosome-vs-rest scan, the window scans, the gene-centred
         scans and finally a step-wise EMMAX on SNPs passing the MAC filter.

    All output files are written below env.env['results_dir'].

    Arguments:
        pid -- phenotype id (integer).
        call_method_id -- id of the SNP call method (integer).
        run_id -- label used in the output file names.
        kinship_method -- 'ibd' or 'ibs'.
        debug_filter -- fraction of SNPs to keep; values < 1 sub-sample.

    Raises:
        ValueError -- if `kinship_method` is neither 'ibd' nor 'ibs'.
    """
    # Fail fast: an unknown method used to surface only after all data had
    # been loaded, as a NameError on the never-assigned kinship matrix.
    if kinship_method not in ('ibd', 'ibs'):
        raise ValueError("Unsupported kinship method %r, expected 'ibd' or 'ibs'." % (kinship_method,))

    # Load data
    sd = dp.load_snps_call_method(call_method_id)
    if debug_filter < 1:
        sd.sample_snps(debug_filter)
    phenotype_file = env.env['phen_dir'] + 'phen_with_swedish_082211.csv'
    phed = pd.parse_phenotype_file(phenotype_file)
    phed.convert_to_averages()
    phen_name = phed.get_name(pid)
    sd.coordinate_w_phenotype_data(phed, pid)
    phed.transform(pid, 'most_normal')
    phen_vals = phed.get_values(pid)

    if kinship_method == 'ibd':
        global_k = sd.get_ibd_kinship_matrix()
    else:
        global_k = sd.get_ibs_kinship_matrix()

    p_her = phed.get_pseudo_heritability(pid, global_k)
    hist_file = env.env['results_dir'] + '%s_%s_%d_%d_%s_hist.png' % \
                (run_id, kinship_method, call_method_id, pid, phen_name)
    phed.plot_histogram(pid, p_her=p_her, png_file=hist_file)

    # Chromosome vs. rest of the genome.
    res_dict = lm.chrom_vs_rest_mm(phen_vals, sd, kinship_method, global_k)
    print(res_dict)
    file_prefix = env.env['results_dir'] + '%s_loc_v_glob_chrom_%s_%d_%d_%s' % \
                  (run_id, kinship_method, call_method_id, pid, phen_name)
    _write_res_dict_to_file_2_(file_prefix + '.csv', res_dict)

    # 'Normal' window sizes; the window advances by half its size.
    for ws in [3000000, 1000000, 500000, 200000, 100000, 50000, 20000]:
        file_prefix = env.env['results_dir'] + '%s_loc_v_glob_%s_%d_%d_%d_%s' % \
                      (run_id, kinship_method, call_method_id, ws, pid, phen_name)
        # Floor division keeps the jump size an integer on every Python version.
        res_dict = lm.local_vs_global_mm_scan(phen_vals, sd, file_prefix, ws, ws // 2,
                                              kinship_method, global_k)
        _write_res_dict_to_file_(file_prefix + '.csv', res_dict)

    # Gene-centred scans.
    for radius in [20000, 10000, 5000]:
        file_prefix = env.env['results_dir'] + '%s_loc_v_glob_gene_%s_%d_%d_%d_%s' % \
                      (run_id, kinship_method, call_method_id, radius, pid, phen_name)
        res_dict = lm.local_vs_global_gene_mm_scan(phen_vals, sd, file_prefix, radius,
                                                   kinship_method, global_k)
        _write_res_dict_to_file_3_(file_prefix + '.csv', res_dict)

    # Step-wise EMMAX on SNPs that pass the minor allele count filter.
    sd.filter_mac_snps(15)
    file_prefix = env.env['results_dir'] + '%s_emmax_stepwise_%s_%d_%d_%s' % \
                  (run_id, kinship_method, call_method_id, pid, phen_name)
    lm.emmax_step_wise(phen_vals, global_k, sd=sd, num_steps=10,
                       file_prefix=file_prefix, save_pvals=True)
def _load_data_(self, delimiter=','):
    """
    Load phenotypes and genotypes from the files named by `args`.

    `args` is taken from the enclosing scope; its `phenfile` and `snpfile`
    attributes give the input paths.  The parsed phenotype data ends up in
    `self.pd` and the SNP data set in `self.sd`.

    delimiter -- field separator handed to the phenotype parser.
    """
    print("Loading phenotype and genotype data...")
    self.pd = pd.parse_phenotype_file(args.phenfile, delim=delimiter)

    # debug_filter=0.01: retain only 1% of the data for now.
    parsed = dp.parse_raw_snps_data(args.snpfile,
                                    target_format='binary',
                                    missing_val='?',
                                    debug_filter=0.01,
                                    return_chromosomes=True)
    snpsds, chromosomes = parsed
    self.sd = dp.SNPsDataSet(snpsds, chromosomes, data_format='binary')
def _run_():
    """
    Set up and run (or dispatch) the GWAS runs described by the parameters.

    The parameters come from parse_parameters().  Phenotypes are read from
    p_dict['phen_file'] when given, otherwise fetched from the DB for
    p_dict['pids'].  Requested ids are restricted to those actually present;
    if no ids remain and a phenotype file was given, every phenotype in the
    file is used.

    With p_dict['parallel'] set, jobs are only dispatched through
    run_parallel() -- one per phenotype for analysis plots, otherwise one per
    (mapping method, transformation, phenotype) -- and the function returns.
    Otherwise the analysis plots or the GWAS itself are run in-process.

    Raises:
        Exception -- if neither a phenotype file nor usable phenotype ids
                     are available.
    """
    p_dict, args = parse_parameters()
    print("GWA runs are being set up with the following parameters:")
    for key, value in p_dict.items():
        print(key + ': ' + str(value))
    print('')

    # Load phenotypes
    if p_dict['phen_file']:
        print('Loading phenotypes from file.')
        phed = phenotypeData.parse_phenotype_file(p_dict['phen_file'],
                                                  with_db_ids=p_dict['with_db_ids'])
    else:
        print('Retrieving the phenotypes from the DB.')
        phed = phenotypeData.get_phenotypes_from_db(p_dict['pids'])

    if p_dict['pids']:
        # Keep only the requested phenotypes that were actually loaded.
        p_dict['pids'] = sorted(set(p_dict['pids']).intersection(set(phed.get_pids())))

    if not p_dict['pids']:
        # Phenotype ids are missing, hence all phenotypes are run/analyzed.
        if not p_dict['phen_file']:
            raise Exception('Phenotype file or phenotype ID is missing.')
        p_dict['pids'] = list(phed.phen_dict.keys())

    # If on the cluster, then only set up the runs.
    if p_dict['parallel']:
        # This must test the option, not the analysis_plots() function, which
        # is always truthy and made the per-method branch below unreachable.
        if p_dict['analysis_plots']:
            for p_i in p_dict['pids']:
                run_parallel(p_i, phed, p_dict)
        else:
            for mapping_method in p_dict['specific_methods']:
                for trans_method in p_dict['specific_transformations']:
                    for p_i in p_dict['pids']:
                        run_parallel(p_i, phed, p_dict, mapping_method, trans_method)
        return  # Jobs are dispatched; nothing more to do here.

    if p_dict['analysis_plots']:
        analysis_plots(phed, p_dict)
        return

    # Not analysis plots, so run the GWAS in-process.
    for p_i in p_dict['pids']:
        if p_i not in phed.phen_dict:
            continue
        print('-' * 120 + ' \n')
        phenotype_name = phed.get_name(p_i)
        print("Performing GWAS for phenotype: %s, phenotype_id: %s" % (phenotype_name, p_i))
        for trans_method in p_dict['specific_transformations']:
            print('Phenotype transformation: %s' % (trans_method,))
            for mapping_method in p_dict['specific_methods']:
                print('Mapping method: %s' % (mapping_method,))
                map_phenotype(p_i, phed, mapping_method, trans_method, p_dict)
def lotus_data_analysis(phenotype_id=1, result_files_prefix='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lmm_results', manhattan_plot_file='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lmm_manhattan.png', qq_plot_file_prefix='/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lmm_qq'):
    """
    Lotus GWAS (data from Stig U Andersen): local-vs-global mixed model scan.

    Parses the genotype and phenotype files from fixed Dropbox paths,
    Box-Cox transforms phenotype 1, coordinates genotypes with
    `phenotype_id` and runs a 100 kb window scan (50 kb jumps, IBD kinship).

    `result_files_prefix`, `manhattan_plot_file` and `qq_plot_file_prefix`
    are accepted but not used by the code below; the EMMAX/MLMM runs that
    would use them are currently disabled.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    import dataParsers as dp
    import phenotypeData as pd

    genotype_path = '/Users/bjarnivilhjalmsson/Dropbox/Lotus_GWAS/20140603_NonRep.run2.vcf.matrix.ordered.csv'
    phenotype_path = '/Users/bjarnivilhjalmsson/Dropbox/Lotus_GWAS/141007_FT_portal_upd.csv'
    scan_prefix = '/Users/bjarnivilhjalmsson/Dropbox/Cloud_folder/tmp/lotus_FT_loc_glob_0.1Mb'

    print('Parsing genotypes')
    sd = dp.parse_snp_data(genotype_path)

    print('Parsing phenotypes')
    phend = pd.parse_phenotype_file(phenotype_path)

    print('Box-cox')
    # NOTE(review): transforms phenotype 1 regardless of `phenotype_id` -- confirm intended.
    phend.box_cox_transform(1)

    # Keep only the accessions present in both data sets, and only the SNPs
    # that remain polymorphic in that subset.
    print('Coordinating data')
    sd.coordinate_w_phenotype_data(phend, phenotype_id)

    # No global kinship matrix is computed here; the scan gets global_k=None.
    print('Performing mixed model GWAS')
    trait_values = phend.get_values(phenotype_id)
    lg_results = lm.local_vs_global_mm_scan(trait_values, sd,
                                            file_prefix=scan_prefix,
                                            window_size=100000,
                                            jump_size=50000,
                                            kinship_method='ibd',
                                            global_k=None)

    print('Processing results')
def run():
    """
    Command-line driver: dispatch cluster jobs or run a single test GWAS.

    Expected arguments: run_id, kinship_method, call_method_id and,
    optionally, a phenotype id.  Without the phenotype id one parallel job
    per phenotype in the phenotype file is set up via run_parallel(); with
    it, run_gwas() is called directly for that phenotype.
    """
    phenotype_file = env.env['phen_dir'] + 'phen_with_swedish_082211.csv'
    cli = sys.argv
    run_id = cli[1]
    kinship_method = cli[2]
    call_method_id = int(cli[3])

    if len(cli) >= 5:
        print('Setting up a test run')
        run_gwas(int(cli[4]), call_method_id, run_id, kinship_method)
        return

    print('Setting up a cluster run')
    phenotypes = pd.parse_phenotype_file(phenotype_file)
    for phen_id in phenotypes.phen_ids:
        run_parallel(phen_id, call_method_id, run_id, kinship_method)
def telomere_example_plots(debug_filter=1.0, pid=1365, call_method_id=78, radius=20000, kinship_method='ibs'):
    """
    Produce the example plots for a fixed set of genes (AT1G21390 .. AT1G21490).

    Loads the SNP data for `call_method_id` (sub-sampled when `debug_filter`
    < 1), averages and 'most_normal'-transforms phenotype `pid`, writes a
    histogram of it, and runs the gene-centred local-vs-global scan with
    gene trees enabled for the genes of interest.  Output goes below
    env.env['results_dir'].
    """
    # AT1G21390, AT1G21400, ..., AT1G21490
    target_genes = ['AT1G%d' % locus for locus in range(21390, 21500, 10)]

    sd = dp.load_snps_call_method(call_method_id)
    if debug_filter < 1:
        sd.sample_snps(debug_filter)

    phed = pd.parse_phenotype_file(env.env['phen_dir'] + 'phen_with_swedish_082211.csv')
    phed.convert_to_averages()
    phen_name = phed.get_name(pid)
    sd.coordinate_w_phenotype_data(phed, pid)
    phed.transform(pid, 'most_normal')

    histogram_path = env.env['results_dir'] + 'histogram_%s_hist.png' % phed.get_name(pid)
    phed.plot_histogram(pid, png_file=histogram_path)

    values = phed.get_values(pid)
    scan_prefix = env.env['results_dir'] + 'loc_v_glob_gene_%d_%d_%d_%s' % \
                  (call_method_id, radius, pid, phen_name)
    # The scan result is not used further here.
    res_dict = lm.local_vs_global_gene_mm_scan(values, sd, scan_prefix, radius, kinship_method,
                                               tair_ids=target_genes,
                                               plot_gene_trees=True,
                                               ets=sd.accessions)
def run_parallel_rna_seq_gwas():
    """
    Command-line driver for the RNA-seq GWAS runs.

    With at most three arguments (chunk size, temperature, call method id)
    the phenotype file for that temperature is loaded, near-constant
    phenotypes are filtered out, and one run_parallel() job is set up per
    chunk of traits.

    With more arguments (first index, second index, temperature, call method
    id, run id) run_gwas() is called directly for that range.

    NOTE(review): the worker branch is entered when len(sys.argv) > 4 but
    reads sys.argv[5], so exactly four arguments raise IndexError.
    """
    cli = sys.argv

    if len(cli) <= 4:
        call_method_id = int(cli[3])
        temperature = cli[2]
        phen_file = '%s_%sC.csv' % (phen_file_prefix, temperature)
        phed = pd.parse_phenotype_file(phen_file, with_db_ids=False)
        phed.filter_near_const_phens(near_const_filter)
        num_traits = phed.num_traits()
        print('Found %d traits' % num_traits)
        chunk_size = int(cli[1])
        start = 0
        while start < num_traits:
            run_parallel(start, start + chunk_size, temperature, call_method_id)
            start += chunk_size
        return

    run_id = cli[5]
    call_method_id = int(cli[4])
    temperature = int(cli[3])
    phen_file = '%s_%dC.csv' % (phen_file_prefix, temperature)
    file_prefix = env['results_dir'] + 'rna_seq_%s_%dC' % (run_id, temperature)
    run_gwas(file_prefix, phen_file, int(cli[1]), int(cli[2]), temperature,
             data_format='binary',
             call_method_id=call_method_id,
             near_const_filter=near_const_filter)
def lotus_mixed_model_gwas(phenotype_id=4, phen_file = '/home/bjarni/LotusGenome/cks/Lotus31012019/20181113_136LjAccessionData.csv', gt_file = '/home/bjarni/LotusGenome/cks/Lotus31012019/all_chromosomes_binary.csv', pvalue_file='mm_results.pvals', manhattan_plot_file='mm_manhattan.png', qq_plot_file_prefix='mm_qq'):
    """
    Run a mixed model (EMMAX) GWAS on the Lotus data and write the results.

    phenotype_id -- id of the phenotype to analyse.
    phen_file / gt_file -- phenotype and genotype input files.
    pvalue_file -- file the p-values are written to.
    manhattan_plot_file -- png file for the Manhattan plot.
    qq_plot_file_prefix -- file prefix handed to the QQ plot routine.
    """
    import linear_models as lm
    import kinship
    import gwaResults as gr
    import dataParsers as dp

    genotypes = dp.parse_snp_data(gt_file)

    import phenotypeData as pd
    phenotypes = pd.parse_phenotype_file(phen_file, with_db_ids=False)

    # Restrict both data sets to the shared accessions; SNPs that are no
    # longer polymorphic in that subset are dropped.
    genotypes.coordinate_w_phenotype_data(phenotypes, phenotype_id)

    # IBS kinship, then EMMAX.
    kinship_matrix = kinship.calc_ibs_kinship(genotypes.get_snps())
    emmax_output = lm.emmax(genotypes.get_snps(),
                            phenotypes.get_values(phenotype_id),
                            kinship_matrix)

    # Wrap the p-values in a results object; save and plot them.
    result = gr.Result(scores=emmax_output['ps'], snps_data=genotypes)
    result.write_to_file(pvalue_file)
    result.plot_manhattan(png_file=manhattan_plot_file,
                          percentile=90,
                          plot_bonferroni=True,
                          neg_log_transform=True)
    result.plot_qq(qq_plot_file_prefix)
p_dict['data_description'] = arg elif opt in ("--transformation_description"): p_dict['transformation_description'] = arg elif opt in ("--biology_category_id"): p_dict['biology_category_id'] = arg else: print "Unkown option:", opt print __doc__ sys.exit(2) return p_dict, args if __name__ == '__main__': p_dict, args = parse_parameters() print p_dict, args phed = pd.parse_phenotype_file(p_dict['phen_filename'], with_db_ids=False) phed.convert_to_averages() pids = p_dict['pids'] if not pids: pids = phed.phen_ids mids = [] if p_dict['method_ids']: for pid, mid in zip(pids, p_dict['method_ids']): data_type = 'binary' if phed.is_binary(pid) else 'quantitative' new_mid = phed.insert_into_db([pid], p_dict['phenotype_scoring'], p_dict['method_description'], p_dict['growth_condition'], p_dict['biology_category_id'], p_dict['citations'], p_dict['data_description'],
def main(argv=None):
    '''
    Command-line entry point: run a GWAS for every phenotype in a csv file.

    Parses the command line, loads the phenotype file and calls
    _perform_gwas_() once per phenotype.  Status updates go to stdout, or to
    a message broker when --queue is given.

    Returns 0 on success or keyboard interrupt, 2 if any other exception
    was raised (the error is written to stderr).
    '''
    if argv is None:
        argv = sys.argv
    else:
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by Ümit Seren on %s.
  Copyright 2012 Gregor Mendel Institute. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        # Set up the argument parser.
        parser = ArgumentParser(description=program_license,
                                formatter_class=RawDescriptionHelpFormatter)
        parser.add_argument("-l", "--list_genotypes", dest="list_genotypes",
                            help="display available genotype dataset",
                            action='store_true')
        parser.add_argument("-t", "--transformation", dest="transformation",
                            help="Apply a transformation to the data. Default[None]",
                            choices=["log", "sqrt", "exp", "sqr", "arcsin_sqrt", "box_cox"])
        parser.add_argument("-a", "--analysis_method", dest="analysis_method",
                            help="analyis method to use", required=True,
                            choices=["lm", "emma", "emmax", "kw", "ft", "emmax_anova",
                                     "lm_anova", "emmax_step", "lm_step", "loc_glob_mm", "amm"])
        parser.add_argument("-g", "--genotype", dest="genotype",
                            help="genotype dataset to be used in the GWAS analysis (run with option -l to display list of available genotype datasets)",
                            required=True, type=int, metavar="INTEGER")
        parser.add_argument("-k", "--kinship", dest="kinship",
                            help="Specify the file containing the kinship matrix. (otherwise default file is used or it's generated.)",
                            metavar="FILE")
        parser.add_argument("-s", "--kinship_type", dest="kinship_type",
                            help="Type of kinship calculated. Possible types are ibs (default) or ibd ",
                            choices=["ibs", "ibd"], default="ibs")
        parser.add_argument("-q", "--queue", dest="queue",
                            help="Send status updates to Message Broker",
                            action='store_true')
        parser.add_argument("-z", "--queue_host", dest="queue_host",
                            help="Host of the Message Broker")
        parser.add_argument("-o", "--output_file", dest="outputfile",
                            help="Name of the output file")
        parser.add_argument('-V', '--version', action='version',
                            version=program_version_message)
        parser.add_argument(dest="file",
                            help="csv file containing phenotype values",
                            metavar="FILE")

        # Process the arguments.
        args = parser.parse_args()

        messenger = StdoutMessenger()
        if args.queue:
            # NOTE(review): presumably broker port and credentials; they are hard-coded here.
            messenger = ProgressMessenger(args.queue_host, 5672, 'admin', 'eastern')
        messenger.update_status(progress=0.0, task_status='Loading phenotype data')

        phenData = phenotypeData.parse_phenotype_file(args.file, False)
        phen_ids = phenData.phen_dict.keys()

        # One GWAS per phenotype in the file.
        for phen_id in phen_ids:
            phenotype_name = phenData.get_name(phen_id)
            messenger.update_status(progress=0.0, task_status='Loading phenotype data')
            print("Performing GWAS for phenotype: %s, phenotype_id: %s" % (phenotype_name, phen_id))
            _perform_gwas_(phen_id, phenData, args.analysis_method, args.transformation,
                           args.genotype, args.kinship_type, args.kinship, messenger,
                           args.outputfile)
        return 0
    except KeyboardInterrupt:
        # A user interrupt is treated as a clean exit.
        return 0
    except Exception as e:
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        sys.stderr.write(indent + " for help use --help")
        return 2
def load_and_plot_info_files(call_method_id=75, temperature=10, mac_threshold=15, debug_filter=1, near_const_filter=20, data_format='binary'): import random phen_file = '%s_%dC.csv' % (phen_file_prefix, temperature) phed = pd.parse_phenotype_file(phen_file, with_db_ids=False) #load phenotype file phed.filter_near_const_phens(near_const_filter) phed.convert_to_averages() num_traits = phed.num_traits() pids = phed.phen_ids sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=0.01) indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False) #All phenotypes are ordered the same way, so we pick the first one. phed.filter_ecotypes(indices_to_keep, pids=pids) print 'Loading the gene annotation dictionary' gene_dict = dp.parse_tair_gff_file() run_id = 'd081511' #run_id = 'rs_%d' % call_method_id file_prefix = '/srv/lab/data/rna_seq_083011/%dC/cm_%d/' % (temperature, call_method_id) num_genes = 0 radii = [500000, 100000, 50000, 25000, 10000, 5000, 1000, 0] tss_dists = [200000, 100000, 50000, 25000, 10000, 5000, 1000] cvt_summary_dict = {'radius':{'avg_cis_trans_var_ratio':[0.0 for r in radii], 'avg_cis_herit':[0.0 for r in radii], 'avg_trans_herit':[0.0 for r in radii], 'counts':[0.0 for td in radii]}, 'radius_herit':{'avg_cis_trans_var_ratio':[0.0 for r in radii], 'avg_cis_herit':[0.0 for r in radii], 'avg_trans_herit':[0.0 for r in radii], 'counts':[0.0 for td in radii]}, 'tss_dist':{'avg_cis_trans_var_ratio':[0.0 for td in tss_dists], 'avg_cis_herit':[0.0 for td in tss_dists], 'avg_trans_herit':[0.0 for td in tss_dists], 'counts':[0.0 for td in tss_dists]}} heritabilities = [] transformations = [] shapiro_wilk_pvals = [] tair_ids = [] pval_infl_dict = {} dist_min_pval_dict = {} distance_bins = [(0, 5000), (0, 10000), (0, 25000), (0, 50000), (0, 100000), (1, -1), (6, -1)] radius_bins = [0, 1000, 5000, 10000, 25000, 50000, 100000] bonf_sign_bin_dict = {} res_dict = {} sign_count = {} for mm in ['EX', 'LM', 
'KW']: pval_infl_dict[mm] = {'kolmogorov_smirnov':[], 'median_pvals':[]} dist_min_pval_dict[mm] = {} for bin in distance_bins: dist_min_pval_dict[mm][bin] = 0 bonf_sign_bin_dict[mm] = {} for bin in radius_bins: bonf_sign_bin_dict[mm][bin] = {'count':0.0, 'total':0.0} sign_count[mm] = 0 cofactor_count_dict = {} for criteria in ['ebics', 'mbonf', 'min_cof_ppa']: cofactor_count_dict[criteria] = {'num_cofactor_list':[], 'bin_counts':sp.zeros(9), 'num_cis_cofactor_list':[], 'num_found':0} pickle_file_dict = {} for mm in ['EX', 'LM', 'KW']: pickle_file_dict[mm] = {} pickle_file_dict[mm]['file_name'] = '%sresults_%s_mac%d.pickled' % (file_prefix, mm, mac_threshold) pickle_file_dict[mm]['res_dict'] = {} pids = phed.get_pids() for i, pid in enumerate(pids): tair_id = phed.get_name(pid) chrom = int(tair_id[2]) curr_file_prefix = '%schr_%d/rna_seq_%s_%dC_mac%d_pid%d_%s' % \ (file_prefix, chrom, run_id, temperature, mac_threshold, pid, tair_id) info_file_name = '%s_info.pickled' % curr_file_prefix for mm in ['EX', 'LM', 'KW']: res_dict[mm] = '%s_%s_.pvals' % (curr_file_prefix, mm) if random.random() > debug_filter: continue if os.path.isfile(info_file_name) and os.path.isfile(res_dict['EX'] + ".pickled") \ and os.path.isfile(res_dict['LM'] + ".pickled") and os.path.isfile(res_dict['KW'] + ".pickled"): print 'Loading info file: %s' % info_file_name num_genes += 1 info_dict = cPickle.load(open(info_file_name)) #Loading the info dict for mm in ['EX', 'LM', 'KW']: res_dict[mm] = gr.Result(res_dict[mm]) #Loading the result #Saving some basic statistics transformations.append(info_dict['transformation_type']) shapiro_wilk_pvals.append(info_dict['transformation_shapiro_pval']) heritabilities.append(info_dict['pseudo_heritability']) #cis vs. 
trans stuff cvt_dict = info_dict['CVT'] for r_i, r in enumerate(radii): if cvt_dict['radius'][r] != None: pvg = cvt_dict['radius'][r]['perc_var1'] pvl = cvt_dict['radius'][r]['perc_var2'] herit = cvt_dict['radius'][r]['pseudo_heritability1'] cvt_summary_dict['radius']['avg_cis_trans_var_ratio'][r_i] += pvl / (pvl + pvg) cvt_summary_dict['radius']['avg_cis_herit'][r_i] += pvl * herit cvt_summary_dict['radius']['avg_trans_herit'][r_i] += pvg * herit cvt_summary_dict['radius']['counts'][r_i] += 1.0 for r_i, r in enumerate(radii): if cvt_dict['radius'][r] != None: herit = cvt_dict['radius'][r]['pseudo_heritability1'] if herit > 0.05: pvg = cvt_dict['radius'][r]['perc_var1'] pvl = cvt_dict['radius'][r]['perc_var2'] cvt_summary_dict['radius_herit']['avg_cis_trans_var_ratio'][r_i] += pvl / (pvl + pvg) cvt_summary_dict['radius_herit']['avg_cis_herit'][r_i] += pvl * herit cvt_summary_dict['radius_herit']['avg_trans_herit'][r_i] += pvg * herit cvt_summary_dict['radius_herit']['counts'][r_i] += 1.0 for td_i, td in enumerate(tss_dists): if cvt_dict['tss_upstream'][td] != None: pvg = cvt_dict['tss_upstream'][td]['perc_var1'] pvl = cvt_dict['tss_upstream'][td]['perc_var2'] herit = cvt_dict['tss_upstream'][td]['pseudo_heritability1'] cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'][td_i] += pvl / (pvl + pvg) cvt_summary_dict['tss_dist']['avg_cis_herit'][td_i] += pvl * herit cvt_summary_dict['tss_dist']['avg_trans_herit'][td_i] += pvg * herit cvt_summary_dict['tss_dist']['counts'][td_i] += 1.0 tair_ids.append(tair_id) for mm in ['EX', 'LM', 'KW']: pval_infl_dict[mm]['kolmogorov_smirnov'].append(info_dict[mm]['kolmogorov_smirnov']['D']) pval_infl_dict[mm]['median_pvals'].append(info_dict[mm]['pval_median']) dist_min_pval = tuple(info_dict[mm]['dist_to_min_pval']) if res_dict[mm].min_score() < 1 / (20.0 * res_dict[mm].num_scores()): sign_count[mm] += 1 for bin in distance_bins: if dist_min_pval <= bin: dist_min_pval_dict[mm][bin] += 1 break for bin in radius_bins: pval = 
info_dict[mm]['bin_dict'][bin]['min_pval'] num_snps = info_dict[mm]['bin_dict'][bin]['num_snps'] if num_snps > 0: bonf_sign_bin_dict[mm][bin]['total'] += 1 if pval < 1.0 / (20 * num_snps): bonf_sign_bin_dict[mm][bin]['count'] += 1 #Stepwise stuff for criteria in ['ebics', 'mbonf', 'min_cof_ppa']: num_cofactors = len(info_dict['SW'][criteria]['cofactors']) cofactor_count_dict[criteria]['num_cofactor_list'].append(num_cofactors) if num_cofactors > 0: cofactor_count_dict[criteria]['num_found'] += 1 cofactor_count_dict[criteria]['bin_counts'] += sp.array(info_dict['SW'][criteria]['bin_counts']) cofactor_count_dict[criteria]['num_cis_cofactor_list'].append(info_dict['SW'][criteria]['bin_counts'][2]) #Pre-process the results.. for mm in ['EX', 'LM', 'KW']: res = res_dict[mm] #Trim results res.neg_log_trans() if mm == 'EX': res.filter_attr('scores', 3) #Filter everything below 10^-2.5 else: res.filter_attr('scores', 4) #Filter everything below 10^-4 if res.num_scores() == 0: print "Skipping file since nothing is below 10^-5" continue gene_d = gene_dict[tair_id] avg_g_pos = (gene_d['start_pos'] + gene_d['end_pos']) / 2.0 chrom = int(gene_d['chromosome']) #Current gene chromosome #Prepare for plotting results.. x,y style, where gene is x, and y is p-values chrom_pos_score_dict = res.get_chrom_score_pos_dict() dist_dict = {} for score_threshold in [5, 6, 7]: #negative log10 thresholds. if len(res.snp_results['scores']) == 0: dist_dict[score_threshold] = -2 #No results else: res.filter_attr('scores', score_threshold) if len(res.snp_results['scores']) == 0: dist_dict[score_threshold] = -2 #No results else: cps_dict = res.get_chrom_score_pos_dict() pos_list = cps_dict[chrom]['positions'] if len(pos_list) > 0: distances = sp.absolute(sp.array(pos_list) - avg_g_pos) d_i = sp.argmin(distances) dist_dict[score_threshold] = distances[d_i] #Min distance. 
else: dist_dict[score_threshold] = -1 #Different chromosome pickle_file_dict[mm]['res_dict'][(chrom, avg_g_pos)] = {'tair_id':tair_id, 'chrom_pos_score':chrom_pos_score_dict, 'dist_dict':dist_dict, 'pid':pid} print dist_dict else: print "Didn't find file: %s or %s" % (info_file_name, res_dict['EX'] + ".pickled") for mm in ['EX', 'LM', 'KW']: cPickle.dump(pickle_file_dict[mm]['res_dict'], open(pickle_file_dict[mm]['file_name'], 'wb'), protocol=2) for r_i, r in enumerate(radii): r_counts = cvt_summary_dict['radius']['counts'][r_i] cvt_summary_dict['radius']['avg_cis_trans_var_ratio'][r_i] = \ cvt_summary_dict['radius']['avg_cis_trans_var_ratio'][r_i] / r_counts cvt_summary_dict['radius']['avg_cis_herit'][r_i] = \ cvt_summary_dict['radius']['avg_cis_herit'][r_i] / r_counts cvt_summary_dict['radius']['avg_trans_herit'][r_i] = \ cvt_summary_dict['radius']['avg_trans_herit'][r_i] / r_counts for r_i, r in enumerate(radii): r_counts = cvt_summary_dict['radius_herit']['counts'][r_i] cvt_summary_dict['radius_herit']['avg_cis_trans_var_ratio'][r_i] = \ cvt_summary_dict['radius_herit']['avg_cis_trans_var_ratio'][r_i] / r_counts cvt_summary_dict['radius_herit']['avg_cis_herit'][r_i] = \ cvt_summary_dict['radius_herit']['avg_cis_herit'][r_i] / r_counts cvt_summary_dict['radius_herit']['avg_trans_herit'][r_i] = \ cvt_summary_dict['radius_herit']['avg_trans_herit'][r_i] / r_counts for td_i, td in enumerate(tss_dists): td_counts = cvt_summary_dict['tss_dist']['counts'][td_i] cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'][td_i] = \ cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio'][td_i] / td_counts cvt_summary_dict['tss_dist']['avg_cis_herit'][td_i] = \ cvt_summary_dict['tss_dist']['avg_cis_herit'][td_i] / td_counts cvt_summary_dict['tss_dist']['avg_trans_herit'][td_i] = \ cvt_summary_dict['tss_dist']['avg_trans_herit'][td_i] / td_counts results_prefix = env['results_dir'] + 'RNAseq_summary_%dC_cm%d' % (temperature, call_method_id) pylab.figure() 
pylab.plot(cvt_summary_dict['radius']['avg_cis_trans_var_ratio']) pylab.ylabel('Avg. perc. of cis genetic var.') pylab.xlabel('Dist. from gene (kb)') pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0]) pylab.savefig(results_prefix + '_avg_perc_cis_gen_var_rad.png') pylab.clf() pylab.figure() pylab.plot(cvt_summary_dict['tss_dist']['avg_cis_trans_var_ratio']) pylab.ylabel('Avg. perc. of cis genetic var.') pylab.xlabel('Dist. upstream from gene TSS (kb)') pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1]) pylab.savefig(results_prefix + '_avg_perc_cis_gen_var_td.png') pylab.clf() # pylab.figure() # pylab.plot(cvt_summary_dict['tss_dist']['avg_cis_herit']) # pylab.ylabel('Avg. cis heritability') # pylab.xlabel('Dist. upstream from gene TSS (kb)') # pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1]) # pylab.savefig(results_prefix + 'avg_cis_herit_td.png') # pylab.clf() # # # pylab.figure() # pylab.plot(cvt_summary_dict['tss_dist']['avg_trans_herit']) # pylab.ylabel('Avg. remaining heritability') # pylab.xlabel('Dist. upstream from gene TSS (kb)') # pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1]) # pylab.savefig(results_prefix + 'avg_trans_herit_td.png') # pylab.clf() # pylab.figure() # pylab.plot(cvt_summary_dict['radius']['avg_trans_herit']) # pylab.ylabel('Avg. remaining heritability') # pylab.xlabel('Dist. from gene (kb)') # pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0]) # pylab.savefig(results_prefix + 'avg_trans_herit_rad.png') # pylab.clf() # # pylab.figure() # pylab.plot(cvt_summary_dict['radius']['avg_cis_herit']) # pylab.ylabel('Avg. cis heritability') # pylab.xlabel('Dist. 
from gene (kb)') # pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0]) # pylab.savefig(results_prefix + 'avg_cis_herit_rad.png') # pylab.clf() tot_herit = sp.array(cvt_summary_dict['radius']['avg_cis_herit']) + \ sp.array(cvt_summary_dict['radius']['avg_trans_herit']) cis_herit = sp.array(cvt_summary_dict['radius']['avg_cis_herit']) pylab.figure(figsize=(10, 6)) pylab.axes([0.06, 0.08, 0.92, 0.90]) pylab.fill_between([0, 7], 0, 1, color='#DD3333', alpha=0.8, label='Error') pylab.fill_between(sp.arange(8), 0, tot_herit, color='#22CC44', alpha=0.8, label='Heritable variance') pylab.fill_between(sp.arange(8), 0, cis_herit, color='#2255AA', \ alpha=0.8, label='Heritable variance (cis)') pylab.ylabel('Average partition of variance') pylab.xlabel('Dist. from gene (kb)') pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0]) pylab.legend(loc=1, ncol=3, shadow=True) pylab.axis([0, 7, 0, 1]) pylab.savefig(results_prefix + 'avg_herit_rad.png') tot_herit = sp.array(cvt_summary_dict['radius_herit']['avg_cis_herit']) + \ sp.array(cvt_summary_dict['radius_herit']['avg_trans_herit']) cis_herit = sp.array(cvt_summary_dict['radius_herit']['avg_cis_herit']) pylab.figure(figsize=(10, 6)) pylab.axes([0.06, 0.08, 0.92, 0.90]) pylab.fill_between([0, 7], 0, 1, color='#DD3333', alpha=0.8, label='Error') pylab.fill_between(sp.arange(8), 0, tot_herit, color='#22CC44', alpha=0.8, label='Heritable variance') pylab.fill_between(sp.arange(8), 0, cis_herit, color='#2255AA', \ alpha=0.8, label='Heritable variance (cis)') pylab.ylabel('Average partition of variance') pylab.xlabel('Dist. 
from gene (kb)') pylab.xticks(range(8), [500, 100, 50, 25, 10, 5, 1, 0]) pylab.legend(loc=1, ncol=3, shadow=True) pylab.axis([0, 7, 0, 1]) pylab.savefig(results_prefix + 'avg_herit_2_rad.png') tot_herit = sp.array(cvt_summary_dict['tss_dist']['avg_cis_herit']) + \ sp.array(cvt_summary_dict['tss_dist']['avg_trans_herit']) cis_herit = sp.array(cvt_summary_dict['tss_dist']['avg_cis_herit']) pylab.figure(figsize=(10, 6)) pylab.axes([0.06, 0.08, 0.92, 0.90]) pylab.fill_between([0, 6], 0, 1, color='#DD3333', alpha=0.8, label='Error') pylab.fill_between(sp.arange(7), 0, tot_herit, color='#22CC44', alpha=0.8, label='Heritable variance') pylab.fill_between(sp.arange(7), 0, cis_herit, color='#2255AA', \ alpha=0.8, label='Heritable variance (cis)') pylab.ylabel('Average partition of variance') pylab.xlabel('Dist. upstream from gene TSS (kb)') pylab.xticks(range(7), [200, 100, 50, 25, 10, 5, 1]) pylab.legend(loc=1, ncol=3, shadow=True) pylab.axis([0, 6, 0, 1]) pylab.savefig(results_prefix + 'avg_herit_td.png') pylab.figure() pylab.hist(heritabilities, bins=20, alpha=0.7) pylab.xlabel('Pseudo-heritability') pylab.xlim((-0.025, 1.025)) pylab.savefig(results_prefix + '_herits_hist.png') pylab.clf() ks_list = [] pm_list = [] for mm in ['EX', 'LM', 'KW']: ks_list.append(pval_infl_dict[mm]['kolmogorov_smirnov']) pm_list.append(pval_infl_dict[mm]['median_pvals']) png_file_name = results_prefix + '_kolmogorov_smirnov_boxplot.png' pylab.figure() pylab.boxplot(ks_list) pylab.axhline(0, color='k', alpha=0.6, ls='-.') pylab.xticks(range(1, 4), ['EX', 'LM', 'KW']) pylab.ylabel('Kolmogorov-Smirnov statistic D.') pylab.savefig(png_file_name) pylab.clf() png_file_name = results_prefix + '_median_pvals_boxplot.png' pylab.figure() pylab.boxplot(pm_list) pylab.axhline(0, color='k', alpha=0.6, ls='-.') pylab.xticks(range(1, 4), ['EX', 'LM', 'KW']) pylab.ylabel('Median p-value bias') pylab.savefig(png_file_name) pylab.clf() x_positions = sp.arange(len(distance_bins), dtype='d64') width = 0.25 
png_file_name = results_prefix + '_dist_min_pval_hist.png' pylab.axes([0.08, 0.2, 0.91, 0.75]) for mm, color in zip(['EX', 'LM', 'KW'], ['b', 'c', 'g']): l = [dist_min_pval_dict[mm][bin] for bin in distance_bins] tot_sum = sum(l) l = map(lambda x: x / float(tot_sum), l) pylab.bar(x_positions, l, width, color=color, alpha=0.7, label=mm) x_positions += width pylab.ylabel('Frequency') pylab.xticks(x_positions - 3 * width / 2.0, (r'$d \leq 5$', r'$5< d \leq 10$', r'$10< d \leq 25$', \ r'$25< d \leq 50$', r'$50< d \leq 100$', r'$d>100$', \ 'Other chrom.'), rotation='45') pylab.xlabel('Distance $d$ (kb) to the smallest p-value from the gene.') pylab.xlim((-0.25, len(distance_bins))) pylab.legend(loc=2) pylab.savefig(png_file_name) pylab.clf() x_positions = sp.arange(len(radius_bins) + 1, dtype='d64') width = 0.25 png_file_name = results_prefix + 'bonf_sign_bin_hist.png' pylab.axes([0.08, 0.22, 0.91, 0.73]) for mm, color in zip(['EX', 'LM', 'KW'], ['b', 'c', 'g']): l = [bonf_sign_bin_dict[mm][bin]['count'] / bonf_sign_bin_dict[mm][bin]['total'] for bin in radius_bins] l.append(sign_count[mm] / float(num_genes)) pylab.bar(x_positions, l, width, color=color, alpha=0.7, label=mm) x_positions += width pylab.ylabel('Fraction of sign. 
results') pylab.xticks(x_positions - 3 * width / 2.0, ('Within gene', r'$d \leq 1$', r'$d \leq 5$', \ r'$d \leq 10$', r'$d \leq 25$', r'$d \leq 50$', \ r'$d \leq 100$', 'Whole genome'), rotation='45') pylab.xlabel(r'Among SNPs with distance $d$ (kb) from gene.') pylab.xlim((-0.25, len(radius_bins) + 1)) pylab.legend(loc=2) pylab.savefig(png_file_name) pylab.clf() png_file_name = results_prefix + 'cofactor_count_hist.png' x_positions = sp.arange(6, dtype='d64') width = 0.25 for criteria, color in zip(['ebics', 'mbonf', 'min_cof_ppa'], ['b', 'c', 'g']): bin_counts = list(sp.bincount(cofactor_count_dict[criteria]['num_cofactor_list'])) while len(bin_counts) < 6: bin_counts.append(0) pylab.bar(x_positions, bin_counts, width, color=color, alpha=0.7, label=criteria) x_positions += width pylab.xlabel('Number of cofactor SNPs') pylab.ylabel('Number of genes') pylab.xticks(x_positions - 3 * width / 2.0, ('0', '1', '2', '3', '4', '5')) pylab.legend(loc=1) pylab.xlim((-0.2, 6)) pylab.savefig(png_file_name) pylab.clf() png_file_name = results_prefix + 'cis_cofactor_count_hist.png' x_positions = sp.arange(6, dtype='d64') for criteria, color in zip(['ebics', 'mbonf', 'min_cof_ppa'], ['b', 'c', 'g']): bin_counts = list(sp.bincount(cofactor_count_dict[criteria]['num_cis_cofactor_list'])) while len(bin_counts) < 6: bin_counts.append(0) pylab.bar(x_positions, bin_counts, width, color=color, alpha=0.7, label=criteria) x_positions += width pylab.xlabel('Number of cis cofactor SNPs') pylab.ylabel('Number of genes') pylab.xticks(x_positions - 3 * width / 2.0, ('0', '1', '2', '3', '4', '5')) pylab.legend(loc=1) pylab.xlim((-0.2, 6)) pylab.savefig(png_file_name) pylab.clf() png_file_name = results_prefix + 'cofactor_bin_count_hist.png' x_positions = sp.arange(9, dtype='d64') width = 0.25 pylab.axes([0.08, 0.2, 0.91, 0.75]) for criteria, color in zip(['ebics', 'mbonf', 'min_cof_ppa'], ['b', 'c', 'g']): cofactor_count_dict[criteria]['bin_counts'] = \ 
cofactor_count_dict[criteria]['bin_counts'] / cofactor_count_dict[criteria]['num_found'] l = list(cofactor_count_dict[criteria]['bin_counts']) l.reverse() pylab.bar(x_positions, l, width, color=color, alpha=0.7, label=criteria) x_positions += width pylab.ylabel('Fraction all genes with cofactors.') pylab.xlabel(r'Distance $d$ (kb) to cofactor from gene.') pylab.xticks(x_positions - 3 * width / 2.0, ('Within gene', r'$1\geq d$', r'$5\geq d$', r'$10\geq d$', \ r'$25\geq d$', r'$50\geq d$', r'$100\geq d$', \ r'$d>100$', 'Other chrom.'), rotation='45') pylab.xlim((-0.2, 9)) pylab.legend(loc=2) pylab.savefig(png_file_name) pylab.clf()
def plot(temperature=10, call_method_id=75, mapping_method='EX', mac_threshold=15, min_score=5,
         near_const_filter=20, data_format='binary', plot_data=True):
    """
    Plot (or dump to text files) gene position vs. associated SNP position scores.

    Loads the pickled results dictionary for the given temperature, call method,
    mapping method and MAC threshold, keeps every score above `min_score`
    (capping it at 25) and bins the hits by (gene chromosome, SNP chromosome).

    If `plot_data` is true, a 5x5 grid of scatter plots (one panel per chromosome
    pair, coloured by score) is saved as a PNG under `result_plots/`.
    Otherwise one sorted CSV-like text file per chromosome pair is written instead.

    Raises ValueError if `plot_data` is true and no score exceeds `min_score`.
    """
    chromosomes = [1, 2, 3, 4, 5]
    max_score = 25  # Scores are capped here so a few extreme hits don't dominate the colour scale.

    #Load in chromosome dict..
    file_prefix = '/srv/lab/data/rna_seq_083011/%dC/cm_%d/' % (temperature, call_method_id)
    results_dict_file = '%sresults_%s_mac%d.pickled' % (file_prefix, mapping_method, mac_threshold)
    # Binary mode + context manager: pickles are binary data and the handle must be closed.
    with open(results_dict_file, 'rb') as results_f:
        res_dict = cPickle.load(results_f)

    # NOTE(review): the phenotype/SNP objects loaded below are not used by the rest of
    # this function; they are kept in case the loaders are relied on for side effects.
    phen_file = '%s_%dC.csv' % (phen_file_prefix, temperature)
    phed = pd.parse_phenotype_file(phen_file, with_db_ids=False)  #load phenotype file
    phed.filter_near_const_phens(near_const_filter)
    phed.convert_to_averages()
    pids = phed.phen_ids
    sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format, debug_filter=0.01)
    #All phenotypes are ordered the same way, so we pick the first one.
    indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False)
    phed.filter_ecotypes(indices_to_keep, pids=pids)

    chrom_dict = {}
    for x_chrom in chromosomes:
        for y_chrom in chromosomes:
            chrom_dict[(x_chrom, y_chrom)] = {'scores':[], 'x_positions':[], 'y_positions':[],
                                              'tair_ids':[], 'r2':[], 'mac':[]}

    scores = []
    for (x_chrom, x_pos), d in res_dict.items():
        tair_id = d['tair_id']
        for y_chrom in chromosomes:
            cps_d = d['chrom_pos_score'][y_chrom]
            for s, y_pos in zip(cps_d['scores'], cps_d['positions']):
                if s > min_score:
                    s = min(s, max_score)
                    scores.append(s)
                    pair_d = chrom_dict[(x_chrom, y_chrom)]
                    pair_d['scores'].append(s)
                    pair_d['tair_ids'].append(tair_id)
                    pair_d['x_positions'].append(x_pos)
                    pair_d['y_positions'].append(y_pos)

    #Write chrom_dict to file..
    if not plot_data:
        for x_chrom in chromosomes:
            for y_chrom in chromosomes:
                file_name = file_prefix + 'result_plots/pvalues_chrom%d_chrom%d_%s_min%d.txt' % \
                            (x_chrom, y_chrom, mapping_method, min_score)
                print('Writing to file: %s' % file_name)
                d = chrom_dict[(x_chrom, y_chrom)]
                with open(file_name, 'w') as f:
                    f.write('x_position, y_position, score, tair_id\n')
                    for t in sorted(zip(d['x_positions'], d['y_positions'], d['scores'], d['tair_ids'])):
                        f.write('%d,%d,%f,%s\n' % t)

    # Panel geometry: each chromosome gets a share of the figure proportional to its length.
    chrom_sizes = [30425061, 19694800, 23456476, 18578714, 26974904]
    cum_chrom_sizes = [sum(chrom_sizes[:i]) for i in range(5)]
    tot_num_bases = float(sum(chrom_sizes))
    rel_chrom_sizes = [0.925 * (x / tot_num_bases) for x in chrom_sizes]
    # The extra 0.02 + 0.01 * i leaves a margin and a small gap between neighbouring panels.
    rel_cum_chrom_sizes = [0.925 * (x / tot_num_bases) + 0.02 + 0.01 * i
                           for i, x in enumerate(cum_chrom_sizes)]
    # Chromosome lengths in Mb, used for the axis limits.
    chromosome_ends = dict((chrom, size / 1000000.0) for chrom, size in zip(chromosomes, chrom_sizes))
    print('%s %s' % (rel_chrom_sizes, rel_cum_chrom_sizes))

    #Now plot data!!
    if plot_data:
        if not scores:
            # Fail early with a clear message instead of the opaque error from max([]).
            raise ValueError('No scores above min_score=%s were found in %s, nothing to plot.'
                             % (min_score, results_dict_file))
        alpha = 0.8
        linewidths = 0
        vmin = min_score
        vmax = max(scores)
        f = pylab.figure(figsize=(40, 35))
        plot_file_name = file_prefix + 'result_plots/pvalues_%s_min%d.png' % (mapping_method, min_score)
        label = '$-log_{10}$(p-value)'

        for yi, chr2 in enumerate(chromosomes):
            for xi, chr1 in enumerate(chromosomes):
                pair_d = chrom_dict[(chr1, chr2)]
                if len(pair_d['scores']) == 0:
                    continue

                ax = f.add_axes([0.96 * (rel_cum_chrom_sizes[xi] + 0.01), rel_cum_chrom_sizes[yi] - 0.02,
                                 0.96 * (rel_chrom_sizes[xi]), rel_chrom_sizes[yi]])
                ax.spines['right'].set_visible(False)
                ax.spines['bottom'].set_visible(False)

                # Only the left-most column shows a y-axis, only the last row an x-axis (on top).
                if xi > 0:
                    ax.spines['left'].set_visible(False)
                    ax.yaxis.set_visible(False)
                else:
                    ax.yaxis.set_ticks_position('left')
                    ax.set_ylabel('Chromosome %d (Mb)' % chr2, fontsize='x-large')
                if yi < 4:
                    ax.spines['top'].set_visible(False)
                    ax.xaxis.set_visible(False)
                else:
                    ax.xaxis.set_ticks_position('top')
                    ax.xaxis.set_label_position('top')
                    ax.set_xlabel('Chromosome %d (Mb)' % chr1, fontsize='x-large')

                # Sort by score so that the highest scoring points are drawn last (on top).
                zs, x_pos_list, y_pos_list = zip(*sorted(zip(pair_d['scores'], pair_d['x_positions'],
                                                             pair_d['y_positions'])))
                zs = list(zs)
                xs = [x / 1000000.0 for x in x_pos_list]
                ys = [y / 1000000.0 for y in y_pos_list]

                scatter_plot = ax.scatter(xs, ys, c=zs, alpha=alpha, linewidths=linewidths,
                                          vmin=vmin, vmax=vmax)
                ax.axis([-0.025 * chromosome_ends[chr1], 1.025 * chromosome_ends[chr1],
                         - 0.025 * chromosome_ends[chr2], 1.025 * chromosome_ends[chr2]])

        # `scores` is non-empty, so at least one panel was drawn and scatter_plot is bound.
        cax = f.add_axes([0.965, 0.7, 0.01, 0.2])
        cb = pylab.colorbar(scatter_plot, cax=cax)
        cb.set_label(label, fontsize='xx-large')
        f.text(0.005, 0.47, 'Associated SNP position', size='xx-large', rotation='vertical')
        f.text(0.47, 0.988, 'Expressed gene position', size='xx-large')
        print('Saving figure: %s' % plot_file_name)
        f.savefig(plot_file_name, format='png')
def run_gwas(file_prefix, phen_file, start_i, stop_i, temperature, mac_threshold=15, filter_threshold=0.02,
             call_method_id=79, data_format='diploid_int', debug_filter=1.0, near_const_filter=20):
    """
    GWAS

    Runs Kruskal-Wallis, a linear model and step-wise EMMAX for every phenotype
    (gene expression trait) in phed.phen_ids[start_i:stop_i], summarizes the
    results around the corresponding gene, compares cis vs. trans kinship for a
    range of window sizes, and writes per-trait pickles and plots with the
    prefix '<file_prefix>_mac<mac_threshold>_pid<pid>_<tair id>'.

    file_prefix       -- prefix for all output files.
    phen_file         -- phenotype file to parse (without DB ids).
    start_i, stop_i   -- slice bounds into the phenotype id list.
    temperature       -- not used by this function.
    mac_threshold     -- passed to sd.filter_mac_snps before mapping.
    filter_threshold  -- passed to Result.filter_percentile before results are written.
    call_method_id, data_format, debug_filter -- passed to dp.load_snps_call_method.
    near_const_filter -- passed to phed.filter_near_const_phens.
    """
    phed = pd.parse_phenotype_file(phen_file, with_db_ids=False)  #load phenotype file
    phed.filter_near_const_phens(near_const_filter)
    phed.convert_to_averages()
    pids = phed.phen_ids[start_i:stop_i]
    sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format=data_format,
                                  debug_filter=debug_filter)
    #All phenotypes are ordered the same way, so we pick the first one.
    indices_to_keep = sd.coordinate_w_phenotype_data(phed, 1, coord_phen=False)
    phed.filter_ecotypes(indices_to_keep, pids=pids)
    print(len(sd.accessions))
    # The kinship is computed before the MAC filter is applied to the SNPs.
    K = sd.get_ibs_kinship_matrix()

    sd.filter_mac_snps(mac_threshold)
    snps = sd.getSnps()
    positions = sd.getPositions()
    chromosomes = sd.get_chr_list()
    r = sd.get_mafs()
    macs = r['mafs']
    mafs = r['marfs']
    print('In total there are %d SNPs to be mapped.' % len(snps))
    gene_dict = dp.parse_tair_gff_file()

    for i, pid in enumerate(pids):
        # NOTE(review): presumably a phenotype can disappear from phed during filtering - confirm.
        if pid not in phed.phen_ids:
            continue
        gene_tair_id = phed.get_name(pid)
        d = gene_dict[gene_tair_id]
        gene_strand = d['strand']
        chrom = int(d['chromosome'])
        gene = gwaResults.Gene(chromosome=chrom, startPos=d['start_pos'],
                               endPos=d['end_pos'], name=gene_tair_id, description=None,
                               dbRef=gene_tair_id, tairID=gene_tair_id)
        print('%s %s %s' % (i, pid, gene))
        curr_file_prefix = '%s_mac%d_pid%d_%s' % (file_prefix, mac_threshold, pid, gene_tair_id)

        trans_type, shapiro_pval = phed.most_normal_transformation(pid)
        print('Most normal transformation was: %s' % trans_type)
        summary_dict = {'transformation_type':trans_type, 'transformation_shapiro_pval':shapiro_pval}

        print('Applying Kruskal-Wallis')
        phen_vals = phed.get_values(pid)
        res = util.kruskal_wallis(snps, phen_vals)
        pvals = res['ps'].tolist()
        kw_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes)
        print('Summarizing KW')
        summary_dict['KW'] = kw_res.get_gene_analysis(gene)
        summary_dict['KW']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps'])
        summary_dict['KW']['pval_median'] = agr.calc_median(res['ps'])

        print('Applying LM')
        res = lm.linear_model(snps, phen_vals)
        pvals = res['ps'].tolist()
        perc_var_expl = res['var_perc'].tolist()
        lm_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes,
                           perc_var_expl=perc_var_expl)
        print('Summarizing LM')
        summary_dict['LM'] = lm_res.get_gene_analysis(gene)
        summary_dict['LM']['kolmogorov_smirnov'] = agr.calc_ks_stats(res['ps'])
        summary_dict['LM']['pval_median'] = agr.calc_median(res['ps'])

        print('Applying EX Stepwise')
        snp_priors = sd.get_cand_genes_snp_priors([gene])
        ex_sw_res = lm.emmax_step_wise(phen_vals, K, macs=macs, mafs=mafs, positions=positions,
                                       chromosomes=chromosomes, snps=snps, num_steps=5,
                                       cand_gene_list=[gene], with_qq_plots=False,
                                       log_qq_max_val=6.0, save_pvals=True, snp_priors=snp_priors)
        print('Summarizing the step-wise mixed model')
        first_res = ex_sw_res['first_emmax_res']
        pvals = first_res['ps'].tolist()
        perc_var_expl = first_res['var_perc'].tolist()
        ex_res = gr.Result(scores=pvals, macs=macs, mafs=mafs, positions=positions, chromosomes=chromosomes,
                           perc_var_expl=perc_var_expl)
        summary_dict['EX'] = ex_res.get_gene_analysis(gene)
        summary_dict['pseudo_heritability'] = ex_sw_res['step_info_list'][0]['pseudo_heritability']
        summary_dict['EX']['kolmogorov_smirnov'] = agr.calc_ks_stats(first_res['ps'])
        summary_dict['EX']['pval_median'] = agr.calc_median(first_res['ps'])

        #Does the linear mixed model fit the data better?
        summary_dict['MM_LRT'] = lm.mm_lrt_test(phen_vals, K)

        #FINISH summarizing the stepwise!!!
        summarize_stepwise(summary_dict, gene, ex_sw_res['step_info_list'], ex_sw_res['opt_dict'])

        cvt_dict = {'radius':{}, 'tss_upstream':{}}
        print('Comparing cis vs. trans kinship')
        # Symmetric windows around the gene, from 500kb down to the gene itself.
        for radius in [500000, 100000, 50000, 25000, 10000, 5000, 1000, 0]:
            print(radius)
            r_start_pos = max(gene.startPos - radius, 0)
            r_end_pos = gene.endPos + radius
            cvt_dict['radius'][radius] = _cis_vs_trans_mm_(sd, phen_vals, K, chrom, r_start_pos, r_end_pos)
            print(cvt_dict['radius'][radius])

        # Windows on one side of the gene only; which side depends on the strand.
        for dist in [200000, 100000, 50000, 25000, 10000, 5000, 1000]:
            print('%s %s' % (dist, gene_strand))
            if gene_strand == '+':
                r_start_pos = max(gene.startPos - dist, 0)
                r_end_pos = gene.startPos
            else:
                r_start_pos = gene.endPos
                r_end_pos = gene.endPos + dist
            cvt_dict['tss_upstream'][dist] = _cis_vs_trans_mm_(sd, phen_vals, K, chrom, r_start_pos, r_end_pos)
            print(cvt_dict['tss_upstream'][dist])

        summary_dict['CVT'] = cvt_dict

        #Write info to file..
        # Protocol 2 is a binary format, so the file is opened in binary mode and closed explicitly.
        with open(curr_file_prefix + '_info.pickled', 'wb') as info_f:
            cPickle.dump(summary_dict, info_f, protocol=2)

        f_prefix = curr_file_prefix + '_hist'
        phed.plot_histogram(pid, title='Gene expressions for %s' % gene_tair_id,
                            png_file=f_prefix + '.png', p_her=summary_dict['pseudo_heritability'],
                            x_label='RNA seq expression levels (%s transformed)' % trans_type)

        #Plot GWAs...
        for res, method_name in [(kw_res, 'KW'), (lm_res, 'LM'), (ex_res, 'EX')]:
            res.filter_percentile(filter_threshold, reversed=True)
            res.write_to_file('%s_%s_.pvals' % (curr_file_prefix, method_name), only_pickled=True)
            # NOTE(review): the Manhattan plot of every method is gated on the EX result
            # (threshold 10e-10, i.e. 1e-9) - confirm this is intended rather than `res`.
            if ex_res.min_score() < 10e-10:
                f_prefix = '%s_%s_manhattan' % (curr_file_prefix, method_name)
                res.plot_manhattan(png_file=f_prefix + '.png', percentile=0, cand_genes=[gene],
                                   plot_bonferroni=True, neg_log_transform=True)


def _cis_vs_trans_mm_(sd, phen_vals, K, chrom, start_pos, end_pos):
    """
    Fit the local-vs-global mixed model for the region (chrom, start_pos, end_pos).

    Returns the result of lm.local_vs_global_mm, or None when
    sd.get_region_split_kinships returns no regional kinship for the region.
    """
    d = sd.get_region_split_kinships([(chrom, start_pos, end_pos)],
                                     kinship_method='ibs', global_kinship=K)
    reg_k = d['regional_k']
    glob_k = d['global_k']
    # Identity test: `!= None` on a numpy array is an element-wise comparison.
    if reg_k is None:
        return None
    return lm.local_vs_global_mm(phen_vals, reg_k, glob_k, K)
def _run_otu_wperm(self, file_prefix, phenotype_file, delimiter=',', covariate_file=None, phenotype_id=1,
                   call_method_id=1307, maf_threshold=5, number_of_permutations=10):
    """
    Run EMMAX permutation tests for one phenotype and write the per-permutation
    minimum p-values and maximum F-statistics to a CSV-like file.

    If `phenotype_file` contains a '/', it is used as a path and the results go to the
    'results/' directory next to the path's 'phenotypes' component; otherwise the file
    is looked up in env['phen_dir'] and results go to env['results_dir'].

    Any positive `maf_threshold` currently triggers a fixed MAC filter of 10.
    `covariate_file` is not used.
    """
    print("Opening snp and phenotype files.")
    sys.stdout.flush()

    if '/' in phenotype_file:
        print("Opening phenotype-file: %s" % phenotype_file)
        phenotype = pd.parse_phenotype_file(phenotype_file, delim=delimiter)  #load phenotype file
        # Everything before 'phenotypes' is the base dir; the results dir must sit at the same level.
        base_dir = phenotype_file.partition("phenotypes")[0]
        results_directory = base_dir + 'results/'
        print("Outputing results to: %s" % results_directory)
    else:
        phenotype = pd.parse_phenotype_file(env['phen_dir'] + phenotype_file, delim=delimiter)
        results_directory = env['results_dir']

    sd = dp.load_snps_call_method(call_method_id=call_method_id, data_format='binary')
    # Truncate to the phenotype of interest.
    coord_info = sd.coordinate_w_phenotype_data(phenotype, phenotype_id)
    indices_to_keep = coord_info.get('pd_indices_to_keep')

    # The MAF value itself is not used: a fixed MAC of 10 is applied whenever it is positive.
    if maf_threshold > 0:
        sd.filter_mac_snps(10)

    phenotype_name = phenotype.get_name(phenotype_id)
    phenotype_values = phenotype.get_values(phenotype_id)
    Z = phenotype.get_incidence_matrix(phenotype_id)

    num_obs = len(indices_to_keep)
    print("There are: %s SNPs." % str(sd.num_snps()))
    print("in: %d accessions" % len(sd.accessions))
    print("and %d observations." % num_obs)
    print("The average number of observations per genotype is %s"
          % str(float(num_obs) / float(len(sd.accessions))))
    sys.stdout.flush()

    K = sp.matrix(sd.get_ibs_kinship_matrix())
    Z = sp.matrix(Z)

    print("Examining phenotype: '%s' (phenotype_id: %s)." % (phenotype_name, str(phenotype_id)))
    print('Applying Permutation tests.')

    snps = sd.get_snps()
    print("Running %d EMMAX-permutations (writes %d dots)" % (number_of_permutations, number_of_permutations))
    start_time = time.time()
    res_perm = self._emmax_permutations(snps, phenotype_values, number_of_permutations, K=K, Z=Z)

    # Sort the (min p-value, max F-stat) pairs and take the pair one twentieth of the way in.
    p_f_list = sorted(zip(res_perm['min_ps'], res_perm['max_f_stats']))
    print(p_f_list[:10])
    threshold = p_f_list[len(p_f_list) // 20]
    res_perm['threshold_05'] = threshold
    print('Threshold should be: %s' % (threshold,))

    elapsed = time.time() - start_time
    if elapsed > 60:
        whole_mins = int(elapsed) // 60
        print('Took %d mins and %f seconds.' % (whole_mins, elapsed - whole_mins * 60))
    else:
        print('Took %f seconds.' % elapsed)

    print("Permutation tests done for phenotype: %s" % phenotype_name)

    results = {'perm_pval': res_perm['min_ps'].tolist(),
               'perm_fstat': res_perm['max_f_stats'].tolist()}
    output_file = '%s/%s_perm.pvals_pid_%d_%s' % (results_directory, file_prefix, phenotype_id, phenotype_name)
    columns = ['perm_pval', 'perm_fstat']
    with open(output_file, "w") as out_f:
        out_f.write(','.join(columns) + "\n")
        for row_i in range(number_of_permutations):
            out_f.write(",".join([str(results[col][row_i]) for col in columns]) + "\n")

    print("Permutation p-values written.")
def main(argv=None):
    '''Command line options.

    Parses the command line, loads the phenotype file and runs _perform_gwas_ for
    every phenotype in it.  If `argv` is given it is appended to sys.argv before
    parsing.

    Returns 0 on success or keyboard interrupt, and 2 if an exception was raised
    (the exception is reported on stderr).
    '''
    if argv is None:
        argv = sys.argv
    else:
        sys.argv.extend(argv)

    program_name = os.path.basename(sys.argv[0])
    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    # The short description is the second line of the script's docstring; fall back
    # gracefully when the docstring is missing or has a single line.
    main_doc = __import__('__main__').__doc__ or ''
    main_doc_lines = main_doc.split("\n")
    program_shortdesc = main_doc_lines[1] if len(main_doc_lines) > 1 else main_doc.strip()
    program_license = '''%s

  Created by Ümit Seren on %s.
  Copyright 2012 Gregor Mendel Institute. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    try:
        # Setup argument parser
        parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
        parser.add_argument("-l", "--list_genotypes", dest="list_genotypes",
                            help="display available genotype dataset", action='store_true')
        parser.add_argument("-t", "--transformation", dest="transformation",
                            help="Apply a transformation to the data. Default[None]",
                            choices=["log", "sqrt", "exp", "sqr", "arcsin_sqrt", "box_cox"])
        parser.add_argument("-a", "--analysis_method", dest="analysis_method",
                            help="analyis method to use", required=True,
                            choices=["lm", "emma", "emmax", "kw", "ft", "emmax_anova", "lm_anova",
                                     "emmax_step", "lm_step", "loc_glob_mm", "amm"])
        parser.add_argument("-g", "--genotype", dest="genotype",
                            help="genotype dataset to be used in the GWAS analysis (run with option -l to display list of available genotype datasets)",
                            required=True, type=int, metavar="INTEGER")
        parser.add_argument("-k", "--kinship", dest="kinship",
                            help="Specify the file containing the kinship matrix. (otherwise default file is used or it's generated.)",
                            metavar="FILE")
        parser.add_argument("-s", "--kinship_type", dest="kinship_type",
                            help="Type of kinship calculated. Possible types are ibs (default) or ibd ",
                            choices=["ibs", "ibd"], default="ibs")
        parser.add_argument("-q", "--queue", dest="queue",
                            help="Send status updates to Message Broker", action='store_true')
        parser.add_argument("-z", "--queue_host", dest="queue_host", help="Host of the Message Broker")
        parser.add_argument("-o", "--output_file", dest="outputfile", help="Name of the output file")
        parser.add_argument('-V', '--version', action='version', version=program_version_message)
        parser.add_argument(dest="file", help="csv file containing phenotype values", metavar="FILE")

        # Process arguments
        args = parser.parse_args()
        messenger = StdoutMessenger()
        if args.queue:
            # NOTE(review): broker port and credentials are hard-coded here; consider
            # moving them to configuration or command line options.
            messenger = ProgressMessenger(args.queue_host, 5672, 'admin', 'eastern')
        messenger.update_status(progress=0.0, task_status='Loading phenotype data')
        phenData = phenotypeData.parse_phenotype_file(args.file, False)  #load phenotype file
        phen_ids = phenData.phen_dict.keys()  # get phenotype ids

        #If not analysis plots... then GWAS
        for phen_id in phen_ids:
            phenotype_name = phenData.get_name(phen_id)
            # Reset the reported progress before each phenotype is analysed.
            messenger.update_status(progress=0.0, task_status='Loading phenotype data')
            print("Performing GWAS for phenotype: %s, phenotype_id: %s" % (phenotype_name, phen_id))
            _perform_gwas_(phen_id, phenData, args.analysis_method, args.transformation, args.genotype,
                           args.kinship_type, args.kinship, messenger, args.outputfile)
        return 0
    except KeyboardInterrupt:
        ### handle keyboard interrupt ###
        return 0
    except Exception as e:
        indent = len(program_name) * " "
        sys.stderr.write(program_name + ": " + repr(e) + "\n")
        # Terminate the hint with a newline so the shell prompt starts on its own line.
        sys.stderr.write(indent + " for help use --help\n")
        return 2