def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--test", dest="test", type="string", help="supply help") parser.add_option("--method", dest="method", type="choice", choices=["cases_explained", "probability_phenotype"], help="Which results to report, either the proportion " "of cases explained or the probability of the " "phenotype given the number of alleles carried") parser.add_option("--map-file", dest="map_file", type="string", help="plink .map file with SNP positions") parser.add_option("--ped-file", dest="ped_file", type="string", help="plink ped file with phenotype and " "genotype data - A2 major allele coded") parser.add_option("--gwas-file", dest="gwas", type="string", help="gwas results file, assumes Plink " "output format. Must contain SNP, BP, " "OR column headers. Assumes results relate " "to the A1 allele") parser.add_option("--flip-alleles", dest="flip", action="store_true", help="force alleles to flip if OR < 1") parser.add_option("--plot-statistic", dest="plot_stat", type="choice", choices=["frequency", "cumulative"], help="plot either cases frequency or cumulative " "frequency of cases") parser.add_option("--plot-path", dest="plot_path", type="string", help="save path for plot") parser.add_option("--flag-explained-recessive", dest="explained", action="store_true", help="flag individuals explained by carriage of " "2 risk alleles - NOT IMPLIMENTED") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) # required files are .ped file, .map file and gwas results file E.info("reading GWAS results file: %s" % options.gwas) snp_df = pd.read_table(options.gwas, sep="\t", header=0, index_col=None) snp_list = snp_df["SNP"].values # parse ped file E.info("Reading ped file: %s" % options.ped_file) ped_df = gwas.parsePed(options.ped_file, compound_geno=True) # parse map file and get SNP indices that correspond to # ped file genotypes E.info("Fetching SNPs from map file: %s" % options.map_file) snp_index = gwas.getSNPs(options.map_file, snp_list) E.info("SNPs found: %i" % len(snp_index)) # extract SNPs and ORs as key, value pairs or_dict = snp_df.loc[:, ["SNP", "OR"]].to_dict(orient='list') snp_or = dict(zip(or_dict["SNP"], or_dict["OR"])) if options.flip: E.info("Flipping major alleles to risk alleles") flipped_genos = gwas.flipRiskAlleles(snp_index=snp_index, snp_results=snp_or, genos=ped_df["GENOS"].tolist()) # merge flipped genotypes with pedigree frame to get phenotypes geno_df = pd.DataFrame(flipped_genos, index=ped_df["FID"]) else: # split genos into a dataframe genos = np.array(ped_df["GENOS"].tolist()) geno_df = pd.DataFrame(genos, index=ped_df["FID"]) merged = pd.merge(geno_df, ped_df, left_index=True, right_on="FID") # need to discount missing genotypes > 1% # frequencies of number of risk alleles by trait frequency E.info("count #risk alleles per individual") risk_results = gwas.countRiskAlleles(ped_frame=merged, snp_index=snp_index.values(), report=options.method, flag=options.explained) risk_freqs = risk_results["freqs"] cumulative = risk_results["cumulative"] # select results upto and including cumulative freq = 1.0 max_indx = [fx for fx, fy in enumerate(cumulative) if fy == 1.0][0] max_freqs = risk_freqs[:max_indx + 1] max_cum = cumulative[:max_indx + 1] bins = [ix for ix, iy in enumerate(cumulative)][:max_indx + 1] # plot! if options.plot_stat == "frequency": E.info("Generating plot of #risk alleles vs. P(Phenotype)") hist_df = gwas.plotRiskFrequency(bins=bins, frequencies=max_freqs, savepath=options.plot_path, ytitle="P(Phenotype)") elif options.plot_stat == "cumulative": E.info("Generating plot of #risk alleles vs. cumulative frequency") hist_df = gwas.plotRiskFrequency(bins=bins, frequencies=max_cum, savepath=options.plot_path, ytitle="Cumulative frequency cases") hist_df["freq"] = risk_results["freqs"][:max_indx + 1] hist_df["cumulative"] = risk_results["cumulative"][:max_indx + 1] hist_df["cases"] = risk_results["cases"][:max_indx + 1] hist_df["controls"] = risk_results["controls"][:max_indx + 1] hist_df["total"] = hist_df["cases"] + hist_df["controls"] hist_df.to_csv(options.stdout, sep="\t", index=None) # write footer and output benchmark information. E.Stop()