Example #1
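All four examples below share the same script preamble, which this listing omits. A minimal sketch, assuming the cgat-core framework; the import path of the gwas task module is an assumption and depends on the installation:

import os
import re
import sys

import numpy as np
import pandas as pd

import cgatcore.experiment as E
from cgatcore import iotools
# assumed location of the CGAT GWAS task library; adjust to your install
import cgatpipelines.tasks.gwas as gwas

Each script would then end with the usual entry point, if __name__ == "__main__": sys.exit(main(sys.argv)).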
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--task",
                      dest="task",
                      type="choice",
                      choices=["get_hits", "extract_results", "merge_freq"],
                      help="task to perform")

    parser.add_option("--p-threshold",
                      dest="p_threshold",
                      type="float",
                      help="threshold for association p-value, below "
                      "which results will be output")

    parser.add_option("--output-directory",
                      dest="outdir",
                      type="string",
                      help="output file directory")

    parser.add_option("--snp-set",
                      dest="snpset",
                      type="string",
                      help="file containing list of SNP per row to "
                      "extract from GWAS results")

    parser.add_option(
        "--frequency-directory",
        dest="freq_dir",
        type="string",
        help="Directory containing plink .frq files corresponding"
        " to all chromosomes")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # if the input is a list of files, split them
    infile = argv[-1]
    infiles = infile.split(",")
    if len(infiles) > 1:
        results = gwas.GWASResults(assoc_file=infiles)
    elif len(infiles) == 1:
        results = gwas.GWASResults(assoc_file=infile)
    else:
        raise IOError("no input files detected, please specifiy association "
                      "results files as the last command line argument")

    if options.task == "get_hits":
        hits = results.getHits(float(options.p_threshold))
        for name, region in hits:
            # Plink emits different test statistic columns depending on
            # the model; fall back through CHISQ, then STAT, then T
            try:
                try:
                    top_reg = region.sort_values(by="CHISQ", ascending=False)
                    top_bp = top_reg.iloc[0]["BP"]
                    top_snp = top_reg.iloc[0]["SNP"]
                except KeyError:
                    top_reg = region
                    top_reg.loc[:, "STAT"] = abs(top_reg["STAT"])
                    top_reg = top_reg.sort_values(by="STAT", ascending=False)
                    top_bp = top_reg.iloc[0]["BP"]
                    top_snp = top_reg.iloc[0]["SNP"]
            except KeyError:
                top_reg = region
                top_reg.loc[:, "STAT"] = abs(top_reg["T"])
                top_reg = top_reg.sort_values(by="STAT", ascending=False)
                top_bp = top_reg.iloc[0]["BP"]
                top_snp = top_reg.iloc[0]["SNP"]

            outname = "_".join(
                ["chr%s" % str(name),
                 str(top_bp), top_snp, "significant"])

            outfile = outname + ".tsv"
            out_file = "/".join([options.outdir, outfile])
            E.info("output association results from Chr%s to %s" %
                   (str(name), out_file))
            # the first column keeps being output as "Unnamed: 0";
            # drop it if it is present
            try:
                if region.columns[0] != "A1":
                    region.drop([region.columns[0]], inplace=True, axis=1)
            except IndexError:
                pass

            region.to_csv(out_file, sep="\t", index=False)

    elif options.task == "extract_results":
        with iotools.open_file(options.snpset, "r") as sfile:
            snpset = sfile.readlines()
            snpset = [snp.rstrip("\n") for snp in snpset]

        snp_df = results.extractSNPs(snpset)
        snp_df.dropna(axis=0, how='all', inplace=True)
        snp_df.drop_duplicates(subset=["SNP"], inplace=True)
        snp_df.to_csv(options.stdout, sep="\t", index=False)

    elif options.task == "merge_freq":
        # sequentially merge GWAS result with frequency data
        # to make file for GCTA joint analysis
        regex = re.compile(r"(\S+)\.frq$")
        cojo_df = results.mergeFrequencyResults(options.freq_dir,
                                                file_regex=regex)
        cojo_df.to_csv(options.stdout, sep="\t", index=False)
    else:
        pass

    # write footer and output benchmark information.
    E.stop()
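The nested try/except in the get_hits branch above walks through the test statistic columns Plink may emit to find the strongest association. A flatter, equivalent sketch of that fallback; top_snp is a hypothetical helper, not part of the script:

import pandas as pd

def top_snp(region: pd.DataFrame):
    """Return (BP, SNP) for the strongest association in a region,
    trying the statistic columns Plink may emit in order."""
    for stat in ("CHISQ", "STAT", "T"):
        if stat in region.columns:
            # rank by absolute statistic, strongest first
            order = region[stat].abs().sort_values(ascending=False).index
            top = region.loc[order].iloc[0]
            return top["BP"], top["SNP"]
    raise KeyError("no recognised test statistic column in region")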

Example #2

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option(
        "--task",
        dest="task",
        type="choice",
        choices=["mafs", "penetrance", "detect_duplicates", "allele_diff"],
        help="task to perform")

    parser.add_option("--ped-file",
                      dest="ped_file",
                      type="string",
                      help="plink format .ped file")

    parser.add_option("--map-file",
                      dest="map_file",
                      type="string",
                      help="plink format .map file")

    parser.add_option("--freq-file",
                      dest="mafs",
                      type="string",
                      help="text file containing populations minor "
                      "allele frequencies of variants.  One row per "
                      "variant with ID MAF")

    parser.add_option("--groups-file",
                      dest="group_file",
                      type="string",
                      help="file containing group labels for individuals "
                      "in the provided ped file")

    parser.add_option("--ref-label",
                      dest="ref_label",
                      type="string",
                      help="group label to be used as the reference case")

    parser.add_option("--test-label",
                      dest="test_label",
                      type="string",
                      help="group label to be used as the test case")

    parser.add_option("--subset",
                      dest="subset",
                      type="choice",
                      choices=["cases", "gender"],
                      help="subset the "
                      "data by either case/control or gender")

    parser.add_option("--take-last",
                      dest="take",
                      action="store_true",
                      help="if use duplicates will take the last variant, "
                      "default behaviour is to take the first")

    parser.add_option("--outfile-pattern",
                      dest="out_pattern",
                      type="string",
                      help="outfile pattern to use for finding duplicates "
                      "and triallelic variants")

    parser.add_option("--snp-set",
                      dest="snp_subset",
                      type="string",
                      help="list of SNPs to include")

    parser.set_defaults(mafs=None, subset=None, take=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.task == "mafs":
        mafs = gwas.countByVariantAllele(options.ped_file, options.map_file)

        mafs.to_csv(options.stdout, sep="\t", index=False)

    elif options.task == "penetrance":
        summary, pens = gwas.calcPenetrance(options.ped_file,
                                            options.map_file,
                                            subset=options.subset,
                                            mafs=options.mafs,
                                            snpset=options.snp_subset)

        pens.to_csv(options.stdout, sep="\t", index_label="SNP")
        summary.to_csv("/".join([os.getcwd(), "penetrance_summary.txt"]),
                       sep="\t",
                       index_label="SNP")

    elif options.task == "allele_diff":
        allele_diffs = gwas.calcMaxAlleleFreqDiff(
            ped_file=options.ped_file,
            map_file=options.map_file,
            group_file=options.group_file,
            test=options.test_label,
            ref=options.ref_label)

        allele_diffs.to_csv(options.stdout, sep="\t")

    elif options.task == "detect_duplicates":
        # find variants with duplicated positions and shared reference
        # alleles, indicative of triallelic variants - also same ID.
        # output each to a filter list
        infile = argv[-1]
        dups, tris, oves = gwas.findDuplicateVariants(bim_file=infile,
                                                      take_last=options.take)

        # os.path.abspath leaves an already-absolute path unchanged,
        # so both cases collapse to a single code path
        outpattern = os.path.abspath(options.out_pattern)

        with open(outpattern + ".triallelic", "w") as otfile:
            for tvar in tris:
                otfile.write("%s\n" % tvar)

        with open(outpattern + ".duplicates", "w") as odfile:
            for dvar in dups:
                odfile.write("%s\n" % dvar)

        with open(outpattern + ".overlapping", "w") as ovfile:
            for ovar in oves:
                ovfile.write("%s\n" % ovar)

    # write footer and output benchmark information.
    E.stop()
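One bug fixed above deserves a note: optparse defaults must be registered before parsing (here, before E.start), because the returned options object is populated at parse time. A minimal stdlib demonstration:

from optparse import OptionParser

parser = OptionParser()
parser.add_option("--take-last", dest="take", action="store_true")

parser.set_defaults(take=False)        # effective: set before parsing
options, args = parser.parse_args([])
print(options.take)                    # False

parser.set_defaults(take=True)         # too late: `options` already built
print(options.take)                    # still False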

Example #3

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=["cases_explained", "probability_phenotype"],
                      help="Which results to report, either the proportion "
                      "of cases explained or the probability of the "
                      "phenotype given the number of alleles carried")

    parser.add_option("--map-file",
                      dest="map_file",
                      type="string",
                      help="plink .map file with SNP positions")

    parser.add_option("--ped-file",
                      dest="ped_file",
                      type="string",
                      help="plink ped file with phenotype and "
                      "genotype data - A2 major allele coded")

    parser.add_option("--gwas-file",
                      dest="gwas",
                      type="string",
                      help="gwas results file, assumes Plink "
                      "output format.  Must contain SNP, BP, "
                      "OR column headers.  Assumes results relate "
                      "to the A1 allele")

    parser.add_option("--flip-alleles",
                      dest="flip",
                      action="store_true",
                      help="force alleles to flip if OR < 1")

    parser.add_option("--plot-statistic",
                      dest="plot_stat",
                      type="choice",
                      choices=["frequency", "cumulative"],
                      help="plot either cases frequency or cumulative "
                      "frequency of cases")

    parser.add_option("--plot-path",
                      dest="plot_path",
                      type="string",
                      help="save path for plot")

    parser.add_option("--flag-explained-recessive",
                      dest="explained",
                      action="store_true",
                      help="flag individuals explained by carriage of "
                      "2 risk alleles - NOT IMPLIMENTED")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # required files are .ped file, .map file and gwas results file
    E.info("reading GWAS results file: %s" % options.gwas)
    snp_df = pd.read_table(options.gwas, sep="\t", header=0, index_col=None)
    snp_list = snp_df["SNP"].values

    # parse ped file
    E.info("Reading ped file: %s" % options.ped_file)
    ped_df = gwas.parsePed(options.ped_file, compound_geno=True)

    # parse map file and get SNP indices that correspond to
    # ped file genotypes
    E.info("Fetching SNPs from map file: %s" % options.map_file)
    snp_index = gwas.getSNPs(options.map_file, snp_list)

    E.info("SNPs found: %i" % len(snp_index))
    # extract SNPs and ORs as key, value pairs
    or_dict = snp_df.loc[:, ["SNP", "OR"]].to_dict(orient='list')
    snp_or = dict(zip(or_dict["SNP"], or_dict["OR"]))

    if options.flip:
        E.info("Flipping major alleles to risk alleles")
        flipped_genos = gwas.flipRiskAlleles(snp_index=snp_index,
                                             snp_results=snp_or,
                                             genos=ped_df["GENOS"].tolist())
        # merge flipped genotypes with pedigree frame to get phenotypes
        geno_df = pd.DataFrame(flipped_genos, index=ped_df["FID"])
    else:
        # split genos into a dataframe
        genos = np.array(ped_df["GENOS"].tolist())
        geno_df = pd.DataFrame(genos, index=ped_df["FID"])

    merged = pd.merge(geno_df, ped_df, left_index=True, right_on="FID")

    # TODO: discount individuals with > 1% missing genotypes

    # frequencies of number of risk alleles by trait frequency
    E.info("count #risk alleles per individual")
    risk_results = gwas.countRiskAlleles(ped_frame=merged,
                                         snp_index=snp_index.values(),
                                         report=options.method,
                                         flag=options.explained)
    risk_freqs = risk_results["freqs"]
    cumulative = risk_results["cumulative"]
    # select results up to and including cumulative freq = 1.0
    max_indx = [fx for fx, fy in enumerate(cumulative) if fy == 1.0][0]
    max_freqs = risk_freqs[:max_indx + 1]
    max_cum = cumulative[:max_indx + 1]
    bins = list(range(max_indx + 1))

    # plot!
    # need to add number of individuals into each bin as point size
    if options.plot_stat == "frequency":
        E.info("Generating plot of #risk alleles vs. P(Phenotype)")
        hist_df = gwas.plotRiskFrequency(
            bins=bins,
            frequencies=max_freqs,
            counts=risk_results["cases"][:max_indx + 1],
            savepath=options.plot_path,
            ytitle="P(Phenotype)")
    elif options.plot_stat == "cumulative":
        E.info("Generating plot of #risk alleles vs. cumulative frequency")
        hist_df = gwas.plotRiskFrequency(
            bins=bins,
            frequencies=max_cum,
            counts=risk_results["cases"][:max_indx + 1],
            savepath=options.plot_path,
            ytitle="Cumulative frequency cases")

    hist_df["freq"] = risk_results["freqs"][:max_indx + 1]
    hist_df["cumulative"] = risk_results["cumulative"][:max_indx + 1]
    hist_df["cases"] = risk_results["cases"][:max_indx + 1]
    hist_df["controls"] = risk_results["controls"][:max_indx + 1]
    hist_df["total"] = hist_df["cases"] + hist_df["controls"]
    hist_df.to_csv(options.stdout, sep="\t", index=False)

    # write footer and output benchmark information.
    E.stop()
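The truncation step above keeps bins up to and including the first one whose cumulative case frequency reaches 1.0. A compact numpy sketch of the same selection (the data values are illustrative only), with np.isclose guarding against float rounding where the list comprehension above relies on exact equality:

import numpy as np

cumulative = np.array([0.20, 0.55, 0.80, 1.00, 1.00])
freqs = np.array([0.20, 0.35, 0.25, 0.20, 0.00])

# index of the first bin whose cumulative frequency reaches 1.0
max_indx = int(np.argmax(np.isclose(cumulative, 1.0)))
max_freqs = freqs[:max_indx + 1]
max_cum = cumulative[:max_indx + 1]
bins = list(range(max_indx + 1))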

Example #4
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--score-method",
                      dest="method",
                      type="choice",
                      choices=[
                          "PICS", "LDscore", "ABF", "R2_rank", "get_eigen",
                          "calc_prior", "credible_set", "summarise"
                      ],
                      help="SNP scoring/prioritisation method to apply.")

    parser.add_option("--database",
                      dest="database",
                      type="string",
                      help="SQL database containing LD information "
                      "in table format. Expects columns SNP_A, "
                      "SNP_B, R2, BP_A and BP_B (Plink --r2 output)")

    parser.add_option("--ld-directory",
                      dest="ld_dir",
                      type="string",
                      help="directory containing tabix-index BGZIP "
                      "LD files.  Assumes Plink used to calculate LD")

    parser.add_option("--table-name",
                      dest="table",
                      type="string",
                      help="name of the SQL table containing the LD"
                      "values")

    parser.add_option("--chromosome",
                      dest="chromosome",
                      type="string",
                      help="chromosome to subset the association results "
                      "file on")

    parser.add_option("--ld-threshold",
                      dest="ld_threshold",
                      type="float",
                      help="the threshold of LD above which variants will "
                      "be taken forward.")

    parser.add_option("--rank-threshold",
                      dest="rank_threshold",
                      type="float",
                      help="the threshold in terms of the top n% SNPs to "
                      "output based on the ranking metric. e.g. "
                      "--rank-threshold=0.01 is the top 1% SNPs")

    parser.add_option("--credible-interval",
                      dest="interval",
                      type="float",
                      help="The credible set interval size to generate the "
                      "credible set of SNPs")

    parser.add_option("--prior-variance",
                      dest="prior_var",
                      type="float",
                      help="the prior variance used to weight the SNP "
                      "variance")

    parser.add_option("--fine-map-window",
                      dest="map_window",
                      type="int",
                      help="the region size to included around the index "
                      "SNP as the fine-mapping region.")

    parser.add_option("--eigen-score-directory",
                      dest="eigen_dir",
                      type="string",
                      help="PATH to directory containing tabix indexed "
                      "eigen score files")

    parser.add_option("--flat-prior",
                      dest="flat_prior",
                      action="store_true",
                      help="Ignore functional annotation information and "
                      "use an uninformative prior on each SNP")

    parser.add_option("--snp-set",
                      dest="snp_set",
                      type="string",
                      help="Pre-defined SNP set as a list of SNP IDs."
                      "If used to calculate priors contains column of scores.")

    parser.add_option(
        "--distribution",
        dest="dist",
        type="choice",
        choices=["normal", "t", "gamma", "lognormal", "exponential"],
        help="distribution from which to draw prior "
        "probabilities")

    parser.add_option("--distribution-parameters",
                      dest="dist_params",
                      type="string",
                      help="distribution parameters as a comma-separated list")

    parser.add_option("--lead-snp-id",
                      dest="lead_snp",
                      type="int",
                      help="0-based item number in filename")

    parser.add_option("--filename-separator",
                      dest="separator",
                      type="string",
                      help="filename separator to extract information")

    parser.add_option("--snp-column",
                      dest="snp_col",
                      type="int",
                      help="0-based index of SNP ID column number")

    parser.add_option("--probability-column",
                      dest="prob_col",
                      type="int",
                      help="0-based index of posterior probabilities column"
                      " number")

    parser.set_defaults(
        ld_dir=None,
        dist="normal",
        dist_params=None,
        snp_set=None,
        prior_var=0.04,
        interval=0.99,
        eigen_dir=None,
        map_window=100000,
        ld_threshold=0.5,
        database=None,
        table=None,
        flat_prior=False,
        lead_snp=2,
        separator="_",
        snp_col=0,
        prob_col=1,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    infile = argv[-1]

    if len(infile.split(",")) > 1:
        # multiple files are handled by the individual tasks;
        # assume they have already been cleaned
        clean = True
    else:
        peek = pd.read_table(infile, nrows=5, sep=r"\s*", header=0)
        try:
            # results that still contain non-additive tests need cleaning
            if (peek["TEST"] != "ADD").any():
                clean = False
            else:
                clean = True
        except KeyError:
            clean = True

    if options.method == "LDscore":
        snpscores = gwas.snpPriorityScore(gwas_results=infile,
                                          database=options.database,
                                          table_name=options.table,
                                          chromosome=options.chromosome,
                                          ld_dir=options.ld_dir,
                                          clean=clean)
        # keep only the top 1% of SNPs; reporting all SNPs
        # achieves nothing useful
        ranks = int(len(snpscores.index) * 0.01)
        snpscores = snpscores.iloc[:ranks]

    elif options.method == "PICS":
        snp_list = {}
        if options.snp_set and not options.flat_prior:
            with iotools.open_file(options.snp_set, "r") as sfile:
                for line in sfile.readlines():
                    snp = line.split("\t")[0]
                    try:
                        score = float(line.split("\t")[-1].rstrip("\n"))
                    except ValueError:
                        score = 0
                    snp_list[snp] = float(score)

            # get the parameter estimates for the distribution
            # if they have not been provided
            if not options.dist_params:
                dist_params = gwas.estimateDistributionParameters(
                    data=snp_list.values(), distribution=options.dist)
            else:
                dist_params = tuple(
                    [float(fx) for fx in options.dist_params.split(",")])

            E.info("Calculating priors on SNPs")
            priors = gwas.calcPriorsOnSnps(snp_list=snp_list,
                                           distribution=options.dist,
                                           params=dist_params)

        elif options.snp_set and options.flat_prior:
            with iotools.open_file(options.snp_set, "r") as sfile:
                for line in sfile.readlines():
                    snp = line.split("\t")[0]
                    snp_list[snp] = 1.0

            priors = snp_list

        else:
            # allow for no priors or scores to be set; priors will
            # then be ignored, i.e. when prior and likelihood are
            # not from conjugate distributions
            priors = None

        # PICS scores expects the gwas results file to
        # only contain the region of interest, which
        # represents an independent association signal
        # if a SNP has not been genotyped,
        # but it is in strong LD, it will cause problems
        # downstream <- only allow SNPs that
        # are present in the analysis
        snpscores = gwas.PICSscore(gwas_results=infile,
                                   database=options.database,
                                   table_name=options.table,
                                   chromosome=options.chromosome,
                                   priors=priors,
                                   clean=clean,
                                   ld_dir=options.ld_dir,
                                   ld_threshold=options.ld_threshold)

        snpscores.columns = ["SNP", "PICS"]
        snpscores.sort_values(by="PICS", ascending=False, inplace=True)

        # accumulate SNPs until 99% of the posterior mass is covered
        posterior_sum = 0
        post_snps = []
        for snp in snpscores.index:
            if posterior_sum < 99.0:
                posterior_sum += snpscores.loc[snp, "PICS"]
                post_snps.append(snp)
            else:
                break

        snpscores = snpscores.loc[post_snps]

        snpscores.drop_duplicates(inplace=True)

    elif options.method == "R2_rank":
        # rank SNPs based on their LD with the lead
        # SNP, take the top n% SNPs
        snpscores = gwas.LdRank(gwas_results=infile,
                                database=options.database,
                                table_name=options.table,
                                ld_dir=options.ld_dir,
                                chromosome=options.chromosome,
                                ld_threshold=options.ld_threshold,
                                top_snps=options.rank_threshold,
                                clean=clean)

    elif options.method == "ABF":
        snpscores = gwas.ABFScore(gwas_results=infile,
                                  region_size=options.map_window,
                                  chromosome=options.chromosome,
                                  prior_variance=options.prior_var,
                                  clean=clean)
    elif options.method == "get_eigen":
        E.info("Fetching Eigen scores")
        snpscores = gwas.getEigenScores(eigen_dir=options.eigen_dir,
                                        bim_file=infile,
                                        snp_file=options.snp_set)
        snpscores = pd.DataFrame(snpscores).T

    elif options.method == "credible_set":
        E.info("Creating credible set")

        snpscores = gwas.makeCredibleSet(probs_file=infile,
                                         credible_set=options.interval,
                                         lead_snp_indx=options.lead_snp,
                                         filename_sep=options.separator,
                                         snp_column=options.snp_col,
                                         probs_column=options.prob_col)

    elif options.method == "summarise":
        E.info("Collating SNP prioritisation resuslts")
        file_list = infile.split(",")
        snpscores = gwas.summariseResults(file_list=file_list)

    snpscores.to_csv(options.stdout, index_label="SNP", sep="\t")

    # write footer and output benchmark information.
    E.stop()
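The PICS branch above accumulates SNPs one by one until 99% of the posterior mass is covered. The same credible-set selection can be written without an explicit loop; a sketch, assuming a numeric series of PICS scores indexed by SNP (credible_set is a hypothetical helper, not part of the script):

import pandas as pd

def credible_set(scores: pd.Series, mass: float = 99.0) -> pd.Series:
    """Smallest set of top-ranked SNPs whose scores sum to `mass`."""
    ranked = scores.sort_values(ascending=False)
    # keep every SNP whose running total is still below `mass`,
    # plus the first one that pushes the total past it
    n_keep = int((ranked.cumsum() < mass).sum()) + 1
    return ranked.iloc[:n_keep]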