def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--task", dest="task", type="choice",
                      choices=["get_hits", "extract_results", "merge_freq"],
                      help="task to perform")

    parser.add_option("--p-threshold", dest="p_threshold", type="float",
                      help="threshold for association p-value, below "
                      "which results will be output")

    parser.add_option("--output-directory", dest="outdir", type="string",
                      help="output file directory")

    parser.add_option("--snp-set", dest="snpset", type="string",
                      help="file containing a list of SNPs, one per row, to "
                      "extract from GWAS results")

    parser.add_option("--frequency-directory", dest="freq_dir", type="string",
                      help="directory containing plink .frq files "
                      "corresponding to all chromosomes")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # if the input is a comma-separated list of files, split them
    infile = argv[-1]
    infiles = infile.split(",")
    if len(infiles) > 1:
        results = gwas.GWASResults(assoc_file=infiles)
    elif len(infiles) == 1:
        results = gwas.GWASResults(assoc_file=infile)
    else:
        raise IOError("no input files detected; please specify association "
                      "results files as the last command line argument")

    if options.task == "get_hits":
        hits = results.getHits(float(options.p_threshold))
        for name, region in hits:
            # sort by whichever test statistic the results contain:
            # CHISQ, then STAT, then T
            try:
                try:
                    top_reg = region.sort_values(by="CHISQ", ascending=False)
                    top_bp = top_reg.iloc[0]["BP"]
                    top_snp = top_reg.iloc[0]["SNP"]
                except KeyError:
                    top_reg = region
                    top_reg.loc[:, "STAT"] = abs(top_reg["STAT"])
                    top_reg = top_reg.sort_values(by="STAT", ascending=False)
                    top_bp = top_reg.iloc[0]["BP"]
                    top_snp = top_reg.iloc[0]["SNP"]
            except KeyError:
                top_reg = region
                top_reg.loc[:, "STAT"] = abs(top_reg["T"])
                top_reg = top_reg.sort_values(by="T", ascending=False)
                top_bp = top_reg.iloc[0]["BP"]
                top_snp = top_reg.iloc[0]["SNP"]

            outname = "_".join(["chr%s" % str(name), str(top_bp),
                                top_snp, "significant"])
            outfile = outname + ".tsv"
            out_file = "/".join([options.outdir, outfile])
            E.info("output association results from Chr%s to %s" %
                   (str(name), out_file))

            # the first column keeps being output as "Unnamed: 0";
            # drop it if present
            if len(region.columns) and region.columns[0] != "A1":
                region.drop([region.columns[0]], inplace=True, axis=1)

            region.to_csv(out_file, sep="\t", index=None)

    elif options.task == "extract_results":
        with iotools.open_file(options.snpset, "r") as sfile:
            snpset = sfile.readlines()
            snpset = [snp.rstrip("\n") for snp in snpset]

        snp_df = results.extractSNPs(snpset)
        snp_df.dropna(axis=0, how='all', inplace=True)
        snp_df.drop_duplicates(subset=["SNP"], inplace=True)
        snp_df.to_csv(options.stdout, sep="\t", index=None)

    elif options.task == "merge_freq":
        # sequentially merge GWAS results with frequency data
        # to make a file for GCTA joint analysis
        regex = re.compile(r"(\S+)\.frq$")
        cojo_df = results.mergeFrequencyResults(options.freq_dir,
                                                file_regex=regex)
        cojo_df.to_csv(options.stdout, sep="\t", index=None)
    else:
        pass

    # write footer and output benchmark information.
    E.stop()
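# As excerpted, main() lacks an entry point; a sketch of the conventional
# guard (assuming the surrounding module imports sys, as main() requires):

if __name__ == "__main__":
    sys.exit(main(sys.argv))

# A hypothetical invocation (script and file names are illustrative only),
# extracting genome-wide significant hits from two chromosome-wise Plink
# association files passed comma-separated as the final argument, as
# argv[-1] above expects:
#
#   python gwas_results2table.py \
#       --task=get_hits \
#       --p-threshold=5e-8 \
#       --output-directory=significant_hits \
#       chr1.assoc,chr2.assoc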
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--task", dest="task", type="choice",
                      choices=["mafs", "penetrance", "detect_duplicates",
                               "allele_diff"],
                      help="task to perform")

    parser.add_option("--ped-file", dest="ped_file", type="string",
                      help="plink format .ped file")

    parser.add_option("--map-file", dest="map_file", type="string",
                      help="plink format .map file")

    parser.add_option("--freq-file", dest="mafs", type="string",
                      help="text file containing population minor "
                      "allele frequencies of variants, one row per "
                      "variant with ID and MAF")

    parser.add_option("--groups-file", dest="group_file", type="string",
                      help="file containing group labels for individuals "
                      "in the provided ped file")

    parser.add_option("--ref-label", dest="ref_label", type="string",
                      help="group label to be used as the reference case")

    parser.add_option("--test-label", dest="test_label", type="string",
                      help="group label to be used as the test case")

    parser.add_option("--subset", dest="subset", type="choice",
                      choices=["cases", "gender"],
                      help="subset the data by either case/control "
                      "or gender")

    parser.add_option("--take-last", dest="take", action="store_true",
                      help="with duplicates, take the last variant; "
                      "default behaviour is to take the first")

    parser.add_option("--outfile-pattern", dest="out_pattern", type="string",
                      help="outfile pattern to use for finding duplicates "
                      "and triallelic variants")

    parser.add_option("--snp-set", dest="snp_subset", type="string",
                      help="list of SNPs to include")

    # defaults must be registered before the command line is parsed
    parser.set_defaults(mafs=None,
                        subset=None,
                        take=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.task == "mafs":
        mafs = gwas.countByVariantAllele(options.ped_file,
                                         options.map_file)
        mafs.to_csv(options.stdout, index=None, sep="\t")

    elif options.task == "penetrance":
        summary, pens = gwas.calcPenetrance(options.ped_file,
                                            options.map_file,
                                            subset=options.subset,
                                            mafs=options.mafs,
                                            snpset=options.snp_subset)

        pens.to_csv(options.stdout, sep="\t", index_label="SNP")
        summary.to_csv("/".join([os.getcwd(), "penetrance_summary.txt"]),
                       sep="\t", index_label="SNP")

    elif options.task == "allele_diff":
        allele_diffs = gwas.calcMaxAlleleFreqDiff(
            ped_file=options.ped_file,
            map_file=options.map_file,
            group_file=options.group_file,
            test=options.test_label,
            ref=options.ref_label)

        allele_diffs.to_csv(options.stdout, sep="\t")

    elif options.task == "detect_duplicates":
        # find variants with duplicated position and shared reference
        # allele, indicative of triallelic variants - also same ID.
        # output to a filter list
        infile = argv[-1]
        dups, tris, oves = gwas.findDuplicateVariants(
            bim_file=infile,
            take_last=options.take)

        # normalise the output pattern to an absolute path
        if os.path.isabs(options.out_pattern):
            outpattern = options.out_pattern
        else:
            outpattern = os.path.abspath(options.out_pattern)

        with open(outpattern + ".triallelic", "w") as otfile:
            for tvar in tris:
                otfile.write("%s\n" % tvar)

        with open(outpattern + ".duplicates", "w") as odfile:
            for dvar in dups:
                odfile.write("%s\n" % dvar)

        with open(outpattern + ".overlapping", "w") as ovfile:
            for ovar in oves:
                ovfile.write("%s\n" % ovar)

    # write footer and output benchmark information.
    E.stop()
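# A hypothetical invocation sketch (script and file names are illustrative
# only): screening a Plink .bim file for duplicated and triallelic variants,
# keeping the last of each duplicate pair:
#
#   python ped2stats.py \
#       --task=detect_duplicates \
#       --take-last \
#       --outfile-pattern=qc/filter_variants \
#       genotypes.bim
#
# This would write qc/filter_variants.triallelic, .duplicates and
# .overlapping, one variant ID per line, for use as filter lists.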
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--method", dest="method", type="choice",
                      choices=["cases_explained", "probability_phenotype"],
                      help="which results to report, either the proportion "
                      "of cases explained or the probability of the "
                      "phenotype given the number of alleles carried")

    parser.add_option("--map-file", dest="map_file", type="string",
                      help="plink .map file with SNP positions")

    parser.add_option("--ped-file", dest="ped_file", type="string",
                      help="plink .ped file with phenotype and "
                      "genotype data - A2 major allele coded")

    parser.add_option("--gwas-file", dest="gwas", type="string",
                      help="GWAS results file, assumes Plink "
                      "output format. Must contain SNP, BP and "
                      "OR column headers. Assumes results relate "
                      "to the A1 allele")

    parser.add_option("--flip-alleles", dest="flip", action="store_true",
                      help="force alleles to flip if OR < 1")

    parser.add_option("--plot-statistic", dest="plot_stat", type="choice",
                      choices=["frequency", "cumulative"],
                      help="plot either case frequency or cumulative "
                      "frequency of cases")

    parser.add_option("--plot-path", dest="plot_path", type="string",
                      help="save path for plot")

    parser.add_option("--flag-explained-recessive", dest="explained",
                      action="store_true",
                      help="flag individuals explained by carriage of "
                      "2 risk alleles - NOT IMPLEMENTED")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # required files are the .ped file, .map file and GWAS results file
    E.info("reading GWAS results file: %s" % options.gwas)
    snp_df = pd.read_table(options.gwas, sep="\t",
                           header=0, index_col=None)
    snp_list = snp_df["SNP"].values

    # parse ped file
    E.info("Reading ped file: %s" % options.ped_file)
    ped_df = gwas.parsePed(options.ped_file,
                           compound_geno=True)

    # parse map file and get SNP indices that correspond to
    # ped file genotypes
    E.info("Fetching SNPs from map file: %s" % options.map_file)
    snp_index = gwas.getSNPs(options.map_file,
                             snp_list)
    E.info("SNPs found: %i" % len(snp_index))

    # extract SNPs and ORs as key, value pairs
    or_dict = snp_df.loc[:, ["SNP", "OR"]].to_dict(orient='list')
    snp_or = dict(zip(or_dict["SNP"], or_dict["OR"]))

    if options.flip:
        E.info("Flipping major alleles to risk alleles")
        flipped_genos = gwas.flipRiskAlleles(
            snp_index=snp_index,
            snp_results=snp_or,
            genos=ped_df["GENOS"].tolist())

        # merge flipped genotypes with pedigree frame to get phenotypes
        geno_df = pd.DataFrame(flipped_genos, index=ped_df["FID"])
    else:
        # split genos into a dataframe
        genos = np.array(ped_df["GENOS"].tolist())
        geno_df = pd.DataFrame(genos, index=ped_df["FID"])

    merged = pd.merge(geno_df, ped_df, left_index=True, right_on="FID")

    # need to discount missing genotypes > 1%

    # frequencies of number of risk alleles by trait frequency
    E.info("count #risk alleles per individual")
    risk_results = gwas.countRiskAlleles(ped_frame=merged,
                                         snp_index=snp_index.values(),
                                         report=options.method,
                                         flag=options.explained)
    risk_freqs = risk_results["freqs"]
    cumulative = risk_results["cumulative"]

    # select results up to and including cumulative frequency = 1.0
    max_indx = [fx for fx, fy in enumerate(cumulative) if fy == 1.0][0]
    max_freqs = risk_freqs[:max_indx + 1]
    max_cum = cumulative[:max_indx + 1]
    bins = [ix for ix, iy in enumerate(cumulative)][:max_indx + 1]

    # plot!
    # TODO: add the number of individuals in each bin as point size
    if options.plot_stat == "frequency":
        E.info("Generating plot of #risk alleles vs. P(Phenotype)")
        hist_df = gwas.plotRiskFrequency(
            bins=bins,
            frequencies=max_freqs,
            counts=risk_results["cases"][:max_indx + 1],
            savepath=options.plot_path,
            ytitle="P(Phenotype)")
    elif options.plot_stat == "cumulative":
        E.info("Generating plot of #risk alleles vs. cumulative frequency")
        hist_df = gwas.plotRiskFrequency(
            bins=bins,
            frequencies=max_cum,
            counts=risk_results["cases"][:max_indx + 1],
            savepath=options.plot_path,
            ytitle="Cumulative frequency cases")

    hist_df["freq"] = risk_results["freqs"][:max_indx + 1]
    hist_df["cumulative"] = risk_results["cumulative"][:max_indx + 1]
    hist_df["cases"] = risk_results["cases"][:max_indx + 1]
    hist_df["controls"] = risk_results["controls"][:max_indx + 1]
    hist_df["total"] = hist_df["cases"] + hist_df["controls"]

    hist_df.to_csv(options.stdout, sep="\t", index=None)

    # write footer and output benchmark information.
    E.stop()
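# A hypothetical usage sketch (script and file names are illustrative only):
# plotting cumulative case frequency against the number of risk alleles
# carried, flipping alleles where OR < 1 so all effects are risk-increasing,
# and capturing the per-bin table from stdout:
#
#   python gwas2caseprob.py \
#       --method=probability_phenotype \
#       --gwas-file=results.assoc \
#       --ped-file=cohort.ped \
#       --map-file=cohort.map \
#       --flip-alleles \
#       --plot-statistic=cumulative \
#       --plot-path=risk_alleles.png \
#       > risk_allele_table.tsv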
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--score-method", dest="method", type="choice",
                      choices=["PICS", "LDscore", "ABF", "R2_rank",
                               "get_eigen", "calc_prior", "credible_set",
                               "summarise"],
                      help="SNP scoring/prioritisation method to apply")

    parser.add_option("--database", dest="database", type="string",
                      help="SQL database containing LD information "
                      "in table format. Expects columns SNP_A, "
                      "SNP_B, R2, BP_A and BP_B (Plink --r2 output)")

    parser.add_option("--ld-directory", dest="ld_dir", type="string",
                      help="directory containing tabix-indexed BGZIP "
                      "LD files. Assumes Plink was used to calculate LD")

    parser.add_option("--table-name", dest="table", type="string",
                      help="name of the SQL table containing the LD "
                      "values")

    parser.add_option("--chromosome", dest="chromosome", type="string",
                      help="chromosome to subset the association results "
                      "file on")

    parser.add_option("--ld-threshold", dest="ld_threshold", type="float",
                      help="the threshold of LD above which variants will "
                      "be taken forward")

    parser.add_option("--rank-threshold", dest="rank_threshold", type="float",
                      help="the threshold in terms of the top n% SNPs to "
                      "output based on the ranking metric, e.g. "
                      "--rank-threshold=0.01 is the top 1% of SNPs")

    parser.add_option("--credible-interval", dest="interval", type="float",
                      help="the credible set interval size used to generate "
                      "the credible set of SNPs")

    parser.add_option("--prior-variance", dest="prior_var", type="float",
                      help="the prior variance used to weight the SNP "
                      "variance")

    parser.add_option("--fine-map-window", dest="map_window", type="int",
                      help="the region size to include around the index "
                      "SNP as the fine-mapping region")

    parser.add_option("--eigen-score-directory", dest="eigen_dir",
                      type="string",
                      help="PATH to directory containing tabix-indexed "
                      "eigen score files")

    parser.add_option("--flat-prior", dest="flat_prior", action="store_true",
                      help="ignore functional annotation information and "
                      "use an uninformative prior on each SNP")

    parser.add_option("--snp-set", dest="snp_set", type="string",
                      help="pre-defined SNP set as a list of SNP IDs. "
                      "If used to calculate priors, contains a column "
                      "of scores")

    parser.add_option("--distribution", dest="dist", type="choice",
                      choices=["normal", "t", "gamma", "lognormal",
                               "exponential"],
                      help="distribution from which to draw prior "
                      "probabilities")

    parser.add_option("--distribution-parameters", dest="dist_params",
                      type="string",
                      help="distribution parameters as a comma-separated "
                      "list")

    parser.add_option("--lead-snp-id", dest="lead_snp", type="int",
                      help="0-based item number in filename")

    parser.add_option("--filename-separator", dest="separator", type="string",
                      help="filename separator to extract information")

    parser.add_option("--snp-column", dest="snp_col", type="int",
                      help="0-based index of SNP ID column number")

    parser.add_option("--probability-column", dest="prob_col", type="int",
                      help="0-based index of posterior probabilities "
                      "column number")

    parser.set_defaults(ld_dir=None,
                        dist="normal",
                        dist_params=None,
                        snp_set=None,
                        prior_var=0.04,
                        interval=0.99,
                        eigen_dir=None,
                        map_window=100000,
                        ld_threshold=0.5,
                        database=None,
                        table=None,
                        flat_prior=False,
                        lead_snp=2,
                        separator="_",
                        snp_col=0,
                        prob_col=1)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    infile = argv[-1]
    # comma-separated input is only consumed by the summarise task,
    # which does not use the cleaning flag
    clean = True
    if len(infile.split(",")) > 1:
        pass
    else:
        peek = pd.read_table(infile, nrows=5, sep=r"\s*", header=0)
        try:
            # results need cleaning if they contain tests other than
            # the additive model ("ADD")
            if (peek["TEST"] != "ADD").any():
                clean = False
            else:
                clean = True
        except KeyError:
            clean = True

    if options.method == "LDscore":
        snpscores = gwas.snpPriorityScore(gwas_results=infile,
                                          database=options.database,
                                          table_name=options.table,
                                          chromosome=options.chromosome,
                                          ld_dir=options.ld_dir,
                                          clean=clean)
        # take the top 1%; using all SNPs doesn't achieve anything useful
        ranks = int(len(snpscores.index) * 0.01)
        snpscores = snpscores.iloc[:ranks]

    elif options.method == "PICS":
        snp_list = {}
        if options.snp_set and not options.flat_prior:
            with iotools.open_file(options.snp_set, "r") as sfile:
                for line in sfile.readlines():
                    snp = line.split("\t")[0]
                    try:
                        score = float(line.split("\t")[-1].rstrip("\n"))
                    except ValueError:
                        score = 0
                    snp_list[snp] = float(score)

            # get the parameter estimates for the distribution
            # if they have not been provided
            if not options.dist_params:
                dist_params = gwas.estimateDistributionParameters(
                    data=snp_list.values(),
                    distribution=options.dist)
            else:
                dist_params = tuple(
                    [float(fx) for fx in options.dist_params.split(",")])

            E.info("Calculating priors on SNPs")
            priors = gwas.calcPriorsOnSnps(snp_list=snp_list,
                                           distribution=options.dist,
                                           params=dist_params)
        elif options.snp_set and options.flat_prior:
            with iotools.open_file(options.snp_set, "r") as sfile:
                for line in sfile.readlines():
                    snp = line.split("\t")[0]
                    snp_list[snp] = 1.0

            priors = snp_list
        else:
            # allow for no priors or scores to be set;
            # use of priors will be ignored,
            # i.e. when prior and likelihood are not from
            # conjugate distributions
            priors = None

        # PICS scoring expects the gwas results file to
        # only contain the region of interest, which
        # represents an independent association signal.
        # if a SNP has not been genotyped,
        # but it is in strong LD, it will cause problems
        # downstream <- only allow SNPs that
        # are present in the analysis
        snpscores = gwas.PICSscore(gwas_results=infile,
                                   database=options.database,
                                   table_name=options.table,
                                   chromosome=options.chromosome,
                                   priors=priors,
                                   clean=clean,
                                   ld_dir=options.ld_dir,
                                   ld_threshold=options.ld_threshold)

        snpscores.columns = ["SNP", "PICS"]
        # accumulate SNPs until 99% of the posterior probability
        # is accounted for
        posterior_sum = 0
        snpscores.sort_values(ascending=False, inplace=True)
        post_snps = []
        for snp in snpscores.index:
            if posterior_sum < 99.0:
                posterior_sum += snpscores.loc[snp]
                post_snps.append(snp)
            else:
                break

        snpscores = snpscores.loc[post_snps]
        snpscores.drop_duplicates(inplace=True)

    elif options.method == "R2_rank":
        # rank SNPs based on their LD with the lead
        # SNP, take the top n% of SNPs
        snpscores = gwas.LdRank(gwas_results=infile,
                                database=options.database,
                                table_name=options.table,
                                ld_dir=options.ld_dir,
                                chromosome=options.chromosome,
                                ld_threshold=options.ld_threshold,
                                top_snps=options.rank_threshold,
                                clean=clean)

    elif options.method == "ABF":
        snpscores = gwas.ABFScore(gwas_results=infile,
                                  region_size=options.map_window,
                                  chromosome=options.chromosome,
                                  prior_variance=options.prior_var,
                                  clean=clean)

    elif options.method == "get_eigen":
        E.info("Fetching Eigen scores")
        snpscores = gwas.getEigenScores(eigen_dir=options.eigen_dir,
                                        bim_file=infile,
                                        snp_file=options.snp_set)
        snpscores = pd.DataFrame(snpscores).T

    elif options.method == "credible_set":
        E.info("Creating credible set")
        snpscores = gwas.makeCredibleSet(probs_file=infile,
                                         credible_set=options.interval,
                                         lead_snp_indx=options.lead_snp,
                                         filename_sep=options.separator,
                                         snp_column=options.snp_col,
                                         probs_column=options.prob_col)

    elif options.method == "summarise":
        E.info("Collating SNP prioritisation results")
        file_list = infile.split(",")
        snpscores = gwas.summariseResults(file_list=file_list)

    snpscores.to_csv(options.stdout, index_label="SNP", sep="\t")

    # write footer and output benchmark information.
    E.stop()
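# A hypothetical invocation sketch (script and file names are illustrative
# only): fine-mapping a single region with PICS, drawing SNP priors from a
# normal distribution fitted to a scored SNP set. The defaults set above
# (prior_var=0.04, interval=0.99, map_window=100000, ld_threshold=0.5)
# apply unless overridden:
#
#   python snp_prioritise.py \
#       --score-method=PICS \
#       --chromosome=chr2 \
#       --ld-directory=ld_tabix/ \
#       --snp-set=scored_snps.tsv \
#       --distribution=normal \
#       region_chr2.assoc \
#       > chr2_pics_scores.tsv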