Python PipelineGWASの例

プログラミング言語: Python

クラス/型: PipelineGWAS

hotexamples.comのコード掲載数: 12

Python PipelineGWAS - 12件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのPipelineGWASの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

GCTA(2)

GWASResults(2)

FileGroup(2)

Plink2(2)

PlinkDev(2)

ABFScore(1)

parseFlashPCA(1)

getSNPs(1)

makeCredibleSet(1)

mergeQcExclusions(1)

plotPCA(1)

parsePed(1)

plotMapPhenotype(1)

flipRiskAlleles(1)

plotPhenotype(1)

plotRiskFrequency(1)

snpPriorityScore(1)

getEigenScores(1)

flagGender(1)

flagRelated(1)

flagInbred(1)

flagExcessHets(1)

findDuplicateVariants(1)

estimateDistributionParameters(1)

countRiskAlleles(1)

countByVariantAllele(1)

calcPriorsOnSnps(1)

calcPenetrance(1)

calcMaxAlleleFreqDiff(1)

PICSscore(1)

LdRank(1)

summariseResults(1)

コード例 #1

ファイルを表示

ファイル: snpPriority.py プロジェクト: MikeDMorgan/gwas_pipeline

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    The last element of *argv* is used as the input file name (or a
    comma-separated list of file names).  The scoring method selected with
    ``--score-method`` is applied and the resulting table is written,
    tab-separated, to ``options.stdout``.

    Raises ValueError if ``--score-method`` is missing or names a method
    that this script has no implementation for.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--score-method", dest="method", type="choice",
                      choices=["PICS", "LDscore", "ABF", "R2_rank",
                               "get_eigen", "calc_prior", "credible_set",
                               "summarise"],
                      help="SNP scoring/prioritisation method to apply.")

    parser.add_option("--database", dest="database", type="string",
                      help="SQL database containing LD information "
                      "in table format. Expects columns SNP_A, "
                      "SNP_B, R2, BP_A and BP_B (Plink --r2 output)")

    parser.add_option("--ld-directory", dest="ld_dir", type="string",
                      help="directory containing tabix-index BGZIP "
                      "LD files.  Assumes Plink used to calculate LD")

    # fixed: the two help fragments used to be joined without a space
    parser.add_option("--table-name", dest="table", type="string",
                      help="name of the SQL table containing the LD "
                      "values")

    parser.add_option("--chromosome", dest="chromosome", type="string",
                      help="chromosome to subset the association results "
                      "file on")

    parser.add_option("--ld-threshold", dest="ld_threshold", type="float",
                      help="the threshold of LD above which variants will "
                      "be taken forward.")

    parser.add_option("--rank-threshold", dest="rank_threshold", type="float",
                      help="the threshold in terms of the top n% SNPs to "
                      "output based on the ranking metric. e.g. "
                      "--rank-threshold=0.01 is the top 1% SNPs")

    parser.add_option("--credible-interval", dest="interval", type="float",
                      help="The credible set interval size to generate the "
                      "credible set of SNPs")

    parser.add_option("--prior-variance", dest="prior_var", type="float",
                      help="the prior variance used to weight the SNP "
                      "variance")

    parser.add_option("--fine-map-window", dest="map_window", type="int",
                      help="the region size to included around the index "
                      "SNP as the fine-mapping region.")

    parser.add_option("--eigen-score-directory", dest="eigen_dir", type="string",
                      help="PATH to directory containing tabix indexed "
                      "eigen score files")

    parser.add_option("--flat-prior", dest="flat_prior", action="store_true",
                      help="Ignore functional annotation information and "
                      "use an uninformative prior on each SNP")

    # fixed: the two help fragments used to be joined without a space
    parser.add_option("--snp-set", dest="snp_set", type="string",
                      help="Pre-defined SNP set as a list of SNP IDs. "
                      "If used to calculate priors contains column of scores.")

    parser.add_option("--distribution", dest="dist", type="choice",
                      choices=["normal", "t", "gamma", "lognormal",
                               "exponential"],
                      help="distribution from which to draw prior "
                      "probabilities")

    parser.add_option("--distribution-parameters", dest="dist_params", type="string",
                      help="distribution parameters as a comma-separated list")

    parser.add_option("--lead-snp-id", dest="lead_snp", type="int",
                      help="0-based item number in filename")

    parser.add_option("--filename-separator", dest="separator", type="string",
                      help="filename separator to extract information")

    parser.add_option("--snp-column", dest="snp_col", type="int",
                      help="0-based index of SNP ID column number")

    parser.add_option("--probability-column", dest="prob_col", type="int",
                      help="0-based index of posterior probabilities column"
                      " number")

    parser.set_defaults(ld_dir=None,
                        dist="normal",
                        dist_params=None,
                        snp_set=None,
                        prior_var=0.04,
                        interval=0.99,
                        eigen_dir=None,
                        map_window=100000,
                        ld_threshold=0.5,
                        database=None,
                        table=None,
                        flat_prior=False,
                        lead_snp=2,
                        separator="_",
                        snp_col=0,
                        prob_col=1,
                        )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    # A comma-separated list of files cannot be peeked at as one table, so
    # fall back to the same value used when there is no TEST column.
    # NOTE(review): of the branches below only "summarise" splits the input
    # on commas, and it does not use `clean` - confirm True is the right
    # fallback for the other methods.
    clean = True
    if len(infile.split(",")) == 1:
        # r"\s+" rather than "\s*": the latter can match the empty string
        peek = pd.read_table(infile, nrows=5, sep=r"\s+", header=0)
        try:
            # the results are only "clean" if no peeked row has a TEST
            # value other than ADD
            clean = not (peek["TEST"] != "ADD").any()
        except KeyError:
            # no TEST column at all
            clean = True

    if options.method == "LDscore":
        snpscores = gwas.snpPriorityScore(gwas_results=infile,
                                          database=options.database,
                                          table_name=options.table,
                                          chromosome=options.chromosome,
                                          ld_dir=options.ld_dir,
                                          clean=clean)
        # take top 1%, all SNPs doesn't achieve anything useful
        ranks = int(len(snpscores.index) * 0.01)
        snpscores = snpscores.iloc[:ranks]

    elif options.method == "PICS":
        snp_list = {}
        if options.snp_set and not options.flat_prior:
            # first column is the SNP ID, last column its score;
            # unparseable scores are treated as 0
            with IOTools.openFile(options.snp_set, "r") as sfile:
                for line in sfile:
                    fields = line.split("\t")
                    snp = fields[0]
                    try:
                        score = float(fields[-1].rstrip("\n"))
                    except ValueError:
                        score = 0
                    snp_list[snp] = float(score)

            # get the parameter estimates for the distribution
            # if they have not been provided
            if not options.dist_params:
                dist_params = gwas.estimateDistributionParameters(
                    data=snp_list.values(),
                    distribution=options.dist)
            else:
                dist_params = tuple(
                    float(fx) for fx in options.dist_params.split(","))

            E.info("Calculating priors on SNPs")
            priors = gwas.calcPriorsOnSnps(snp_list=snp_list,
                                           distribution=options.dist,
                                           params=dist_params)

        elif options.snp_set and options.flat_prior:
            # uninformative prior: every SNP in the set gets weight 1.0
            with IOTools.openFile(options.snp_set, "r") as sfile:
                for line in sfile:
                    snp = line.split("\t")[0]
                    snp_list[snp] = 1.0

            priors = snp_list

        else:
            # allow for no priors or scores to be set,
            # use of priors will be ignored,
            # i.e. when prior and likelihood are not from
            # conjugate distributions
            priors = None

        # PICS scores expects the gwas results file to
        # only contain the region of interest, which
        # represents an independent association signal
        snpscores = gwas.PICSscore(gwas_results=infile,
                                   database=options.database,
                                   table_name=options.table,
                                   chromosome=options.chromosome,
                                   priors=priors,
                                   clean=clean,
                                   ld_dir=options.ld_dir,
                                   ld_threshold=options.ld_threshold)

        # NOTE(review): the statements below treat `snpscores` partly as a
        # two-column frame and partly as a Series (sort_values without
        # `by`); kept as in the original - confirm the type returned by
        # gwas.PICSscore.
        snpscores.columns = ["SNP", "PICS"]
        posterior_sum = 0
        snpscores.sort_values(ascending=False,
                              inplace=True)
        # accumulate SNPs in descending score order until the running
        # total reaches 99.0
        post_snps = []
        for snp in snpscores.index:
            if posterior_sum < 99.0:
                posterior_sum += snpscores.loc[snp]
                post_snps.append(snp)
            else:
                break

        snpscores = snpscores.loc[post_snps]

        snpscores.drop_duplicates(inplace=True)

    elif options.method == "R2_rank":
        # rank SNPs based on their LD with the lead
        # SNP, take the top n% SNPs
        snpscores = gwas.LdRank(gwas_results=infile,
                                database=options.database,
                                table_name=options.table,
                                ld_dir=options.ld_dir,
                                chromosome=options.chromosome,
                                ld_threshold=options.ld_threshold,
                                top_snps=options.rank_threshold,
                                clean=clean)

    elif options.method == "ABF":
        snpscores = gwas.ABFScore(gwas_results=infile,
                                  region_size=options.map_window,
                                  chromosome=options.chromosome,
                                  prior_variance=options.prior_var,
                                  clean=clean)

    elif options.method == "get_eigen":
        E.info("Fetching Eigen scores")
        snpscores = gwas.getEigenScores(eigen_dir=options.eigen_dir,
                                        bim_file=infile,
                                        snp_file=options.snp_set)
        snpscores = pd.DataFrame(snpscores).T

    elif options.method == "credible_set":
        E.info("Creating credible set")

        snpscores = gwas.makeCredibleSet(probs_file=infile,
                                         credible_set=options.interval,
                                         lead_snp_indx=options.lead_snp,
                                         filename_sep=options.separator,
                                         snp_column=options.snp_col,
                                         probs_column=options.prob_col)

    elif options.method == "summarise":
        E.info("Collating SNP prioritisation resuslts")
        file_list = infile.split(",")
        snpscores = gwas.summariseResults(file_list=file_list)

    else:
        # Reached when --score-method is omitted, or for "calc_prior",
        # which is an accepted choice but has no branch here.  Previously
        # this surfaced as a NameError on `snpscores` below.
        raise ValueError("--score-method %r is missing or not implemented"
                         % (options.method,))

    snpscores.to_csv(options.stdout, index_label="SNP",
                     sep="\t")

    # write footer and output benchmark information.
    E.Stop()

コード例 #2

ファイルを表示

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    Builds a ``gwas.FileGroup`` from the input files, wraps it in the
    program object selected with ``--program``, applies every ``filt_*``
    option that was set as a filter, queues the task selected with
    ``--method`` and finally calls ``build_statement`` on the program
    object.

    Raises ValueError if ``--program`` is not supplied.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--program",
                      dest="program",
                      type="choice",
                      choices=["plink2", "gcta", "plinkdev"],
                      help="program to execute genome-wide analysis")

    parser.add_option("--input-file-pattern",
                      dest="infile_pattern",
                      type="string",
                      help="file prefix that identifies a group of files")

    parser.add_option("--input-file-format",
                      dest="file_format",
                      type="choice",
                      choices=[
                          "plink", "plink_binary", "oxford", "oxford_binary",
                          "vcf", "GRM_binary", "GRM_gz"
                      ],
                      help="format of input files")

    parser.add_option("--phenotypes-file",
                      dest="pheno_file",
                      type="string",
                      help="text file of additional phenotypes")

    parser.add_option("--pheno",
                      dest="pheno",
                      type="string",
                      help="either phenotype file column header or number")

    parser.add_option("--covariates-file",
                      dest="covariate_file",
                      type="string",
                      help="file containing covariates")

    parser.add_option("--covariate-column",
                      dest="covar_col",
                      type="string",
                      help="column number(s) or header(s) to include in "
                      "association model")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=[
                          "ld_prune", "summary", "flag_hets",
                          "remove_relations", "check_gender", "IBD"
                      ],
                      help="method to apply to genome-wide data")

    parser.add_option("--IBD-parameter",
                      dest="ibd_param",
                      type="choice",
                      choices=["norm", "relatives", "full"],
                      help="param "
                      "to pass to IBD calculations")

    parser.add_option("--principal-components",
                      dest="num_pcs",
                      type="int",
                      help="the number of principal components to output")

    parser.add_option("--matrix-shape",
                      dest="matrix_shape",
                      type="choice",
                      choices=["triangle", "square", "square0"],
                      help="output matrix shape.",
                      default="triangle")

    parser.add_option("--matrix-compression",
                      dest="matrix_compress",
                      type="choice",
                      choices=["gz", "bin", "bin4"],
                      help="compression to apply to output matrix file",
                      default="gz")

    parser.add_option("--matrix-form",
                      dest="matrix_form",
                      type="choice",
                      choices=["distance", "grm"],
                      help="type of relationship matrix to calculate")

    parser.add_option(
        "--matrix-metric",
        dest="matrix_metric",
        type="choice",
        choices=["fhat", "cov", "ibc2", "ibc3", "ibs", "genomic", "hamming"],
        help="value to calculate for diagonal elements of the "
        "grm. Default is fhat for grm and hamming for distance.")

    parser.add_option(
        "--matrix-options",
        dest="matrix_options",
        type="string",
        help="modifiers of matrix output, see plink documentation "
        "for details")

    parser.add_option("--strand-flip-subset",
                      dest="flip_subset",
                      action="store_true",
                      help="apply strand flipping to a subset of samples")

    parser.add_option("--flip-scan-type",
                      dest="scan_param",
                      type="choice",
                      choices=["default", "window", "threshold"],
                      help="strand flipping scan to apply to SNPs")

    parser.add_option("--sort-type",
                      dest="sort_type",
                      type="choice",
                      choices=["none", "natural", "ascii", "file"],
                      help="sort type to input files")

    parser.add_option("--merge-file-format",
                      dest="merge_format",
                      type="choice",
                      choices=["plink", "binary_plink"],
                      help="format of input files to be merged")

    parser.add_option(
        "--merge-mode",
        dest="merge_mode",
        type="choice",
        choices=[
            "default", "original_missing", "new_nonmissing", "no_overwrite",
            "force", "report_all", "report_nonmissing"
        ],
        help="merge mode to apply to dealing with merge conflicts")

    parser.add_option("--duplicates-method",
                      dest="dup_method",
                      type="choice",
                      choices=["same_ref", "id_match", "suppress_first"],
                      help="method for identifying and dealing with duplicate "
                      "variants")

    parser.add_option("--summary-method",
                      dest="summary_method",
                      type="choice",
                      choices=[
                          "allele_frequency", "missing_data", "hardy_weinberg",
                          "mendel_errors", "inbreeding", "inbreeding_coef",
                          "gender_checker", "wrights_fst"
                      ],
                      help="summary statistics to calculate")

    parser.add_option("--summary-parameter",
                      dest="sum_param",
                      type="string",
                      help="optional parameters that can be passed to summary "
                      "statistics methods")

    parser.add_option(
        "--genotype-rate",
        dest="filt_genotype_rate",
        type="string",
        help="genotyping rate threshold.  SNPs below this threshold "
        "will be excluded from analysis")

    parser.add_option("--indiv-missing",
                      dest="filt_missingness",
                      type="string",
                      help="individual missingness rate.  Individuals below "
                      "this threshold will be excluded from analysis")

    parser.add_option("--hardy-weinberg",
                      dest="filt_hwe",
                      type="string",
                      help="hardy-weinberg p-value threshold for SNPs.  SNPs "
                      "with a 2df chisquared p-value below this will be "
                      "filtered out")

    parser.add_option(
        "--min-allele-frequency",
        dest="filt_min_allele_frequency",
        type="string",
        help="only include SNPs with an allele frequency equal to "
        "or above this threshold")

    parser.add_option(
        "--max-allele-frequency",
        dest="filt_max_allele_frequency",
        type="string",
        help="only include SNPs with an allele frequency equal to "
        "or below this threshold")

    parser.add_option(
        "--mendelian-error",
        dest="filt_mendelian_error",
        type="string",
        help="exclude individuals/trios with mendelian errors that "
        "exceed this value")

    # dest was misspelled "filt_min_qaul_score", which produced the filter
    # key "min_qaul_score"; now spelled like its "max" sibling below.
    # NOTE(review): confirm apply_filters expects "min_qual_score".
    parser.add_option("--min-quality-score",
                      dest="filt_min_qual_score",
                      type="string",
                      help="reset the minimum low bound of quality scores for "
                      "variants in a VCF file.  Default is 0")

    parser.add_option(
        "--max-quality-score",
        dest="filt_max_qual_score",
        type="string",
        help="reset the maximum upper bound of quality scores for "
        "a VCCF file.  Default is Inf")

    parser.add_option("--allow-no-gender",
                      dest="filt_allow_no_sex",
                      type="string",
                      help="allow individuals with gender missing")

    parser.add_option("--enforce-gender",
                      dest="filt_enforce_sex",
                      type="string",
                      help="only include individuals with non-missing gender "
                      "information")

    parser.add_option("--keep-individuals",
                      dest="filt_keep",
                      type="string",
                      help="a file containing individuals IDs to keep, "
                      "one per row")

    parser.add_option("--remove-individuals",
                      dest="filt_remove",
                      type="string",
                      help="a file of individual IDs to remove, one per row")

    parser.add_option("--subset-filter",
                      dest="filt_subset_filter",
                      type="choice",
                      choices=[
                          "cases", "controls", "males", "females", "founders",
                          "nonfounders"
                      ],
                      help="only apply filters to the specific subset of "
                      "individuals supplied")

    parser.add_option(
        "--extract-snps",
        dest="filt_extract",
        type="string",
        help="text file of variant IDs to include in the analysis, "
        "ignoring all others")

    parser.add_option("--exclude-snps",
                      dest="filt_exclude",
                      type="string",
                      help="a file of variant IDs to exclude from analysis")

    parser.add_option("--restrict-chromosome",
                      dest="filt_chromosome",
                      type="string",
                      help="restict analysis to either a single chromosome, "
                      "or a comma-separated list of chromosomes")

    parser.add_option("--exclude-chromosomes",
                      dest="filt_exclude_chromosome",
                      type="string",
                      help="exclude all variants on these "
                      "chromosome(s)")

    parser.add_option(
        "--autosome-only",
        dest="filt_autosome",
        action="store_true",
        help="if present only autosomal variants will be analysed")

    parser.add_option(
        "--pseudo-autosome",
        dest="filt_pseudo_autosome",
        action="store_true",
        help="include on the pseudo-autosomal region of chromosome X")

    parser.add_option("--ignore-indels",
                      dest="filt_ignore_indels",
                      action="store_true",
                      help="only include bi-allelic single nucleotide "
                      "variants in analysis")

    parser.add_option(
        "--snp-range",
        dest="filt_snp_bp_range",
        type="string",
        help="comma separated list of from, to genome co-ordinates "
        "within which to include variants for analysis")

    parser.add_option("--snp-id-range",
                      dest="filt_snp_id_range",
                      type="string",
                      help="comma separate list of IDs from, to within which "
                      "to include variants for analysis.")

    parser.add_option("--snp-id",
                      dest="filt_specific_snp",
                      type="string",
                      help="include a single snp in the analysis given by "
                      "it's variant ID.")

    parser.add_option("--exclude-variant",
                      dest="filt_exclude_snp",
                      type="string",
                      help="exclude a single variant from the analysis, "
                      "given by it's variant ID")

    parser.add_option(
        "--covariate-filter",
        dest="filt_covariate_filter",
        type="string",
        help="covariate column headers or column numbers on which "
        "to filter on. Requries --covariate-file")

    parser.add_option(
        "--filter-parameter",
        dest="param",
        type="string",
        help="parameter values to be passed to filtering function")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="string",
                      help="alters the behaviour of the --snp-range and "
                      "--include/exclude snp options.  variants within +/- "
                      "half * window_size (kb) are included")

    parser.add_option(
        "--range-resolution",
        dest="filt_range_resolution",
        type="choice",
        choices=["bp", "kb", "mb"],
        help="alters the (from, to) range resolution to either bp, "
        "kb or mb")

    parser.add_option(
        "--output-file-pattern",
        dest="out_pattern",
        type="string",
        help="output file pattern prefix. file suffixes are dependent "
        "on the task executed")

    parser.add_option("--threads",
                      dest="threads",
                      type="int",
                      help="the number of threads to use for multi-threaded "
                      "processes")

    parser.add_option("--use-kb",
                      dest="kb",
                      action="store_true",
                      help="if present uses a kb sized window for LD pruning")

    parser.add_option("--prune-method",
                      dest="prune_method",
                      type="choice",
                      choices=["R2", "VIF"],
                      help="type of LD pruning to "
                      "perform, pair-wise LD or variance inflation factor")

    parser.add_option("--step-size",
                      dest="step",
                      type="string",
                      help="step size to advance window by")

    parser.add_option("--threshold",
                      dest="threshold",
                      type="string",
                      help="threshold on which to filter results")

    parser.add_option("--parallel",
                      dest="parallel",
                      type="int",
                      help="number of jobs to split task into")

    parser.add_option("--memory",
                      dest="memory",
                      type="string",
                      help="amount of memory to reserve for the task")

    # Defaults must be registered BEFORE the command line is parsed;
    # previously this call came after E.Start and therefore had no effect
    # on `options` (e.g. options.memory was None instead of "60G").
    parser.set_defaults(sum_param=None,
                        dup_method="same_ref",
                        matrix_shape="triangle",
                        matrix_options=None,
                        matrix_compress="gz",
                        kb=False,
                        random_seed=random.randint(0, 19999),
                        memory="60G",
                        parallel=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # NOTE(review): the pattern branch passes a plain string while the
    # fallback passes a list - confirm gwas.FileGroup accepts both.
    if not options.infile_pattern:
        infiles = (argv[-1]).split(",")
    else:
        infiles = options.infile_pattern

    # create a new filegroup object
    geno_files = gwas.FileGroup(files=infiles,
                                file_format=options.file_format,
                                genotype_format="imputed")
    if options.pheno_file:
        geno_files.set_phenotype(pheno_file=options.pheno_file,
                                 pheno=options.pheno)

    # add FileGroup object to the gwas program object
    program_classes = {"plink2": gwas.Plink2,
                       "plinkdev": gwas.PlinkDev,
                       "gcta": gwas.GCTA}
    if options.program not in program_classes:
        # previously fell through and failed later with a NameError on
        # `gwas_object`
        raise ValueError("--program must be one of: %s"
                         % ", ".join(sorted(program_classes)))

    gwas_object = program_classes[options.program](files=geno_files)
    gwas_object.program_call(infiles=geno_files,
                             outfile=options.out_pattern)

    # collect filtering options from options: every option whose dest
    # starts with "filt_" and that has a truthy value
    prefix = "filt_"
    opt_dict = vars(options)
    filter_dict = {key: value for key, value in opt_dict.items()
                   if key.startswith(prefix) and value}

    # iteratively add all filters to GWASProgram object
    for fkey, filter_value in filter_dict.items():
        gwas_object.apply_filters(filter_type=fkey[len(prefix):],
                                  filter_value=filter_value)

    # handle summary statistics
    if options.method == "ld_prune":
        # NOTE(review): kb is hard-coded to True here, so --use-kb has no
        # effect on pruning; kept as in the original - confirm intent.
        gwas_object._qc_methods(ld_prune=options.prune_method,
                                kb=True,
                                window=options.window_size,
                                step=options.step,
                                threshold=options.threshold)
    elif options.method == "IBD":
        # pass the IBD estimate parameter through: norm, full or relatives
        gwas_object._qc_methods(ibd=options.ibd_param)
    elif options.method == "summary":
        # the summary method name doubles as the keyword argument name;
        # the option parser restricts it to the supported choices
        if options.summary_method:
            gwas_object._output_statistics(
                **{options.summary_method: options.sum_param})
    elif options.method == "remove_relations":
        gwas_object._run_tasks(remove_relations="cutoff",
                               parameter=options.threshold)
    elif options.method == "check_gender":
        gwas_object._run_tasks(check_gender="")

    gwas_object.build_statement(infiles=geno_files,
                                outfile=options.out_pattern,
                                threads=options.threads,
                                memory=options.memory,
                                parallel=options.parallel)

    # write footer and output benchmark information.
    E.Stop()

コード例 #3

ファイルを表示

ファイル: pheno2plot.py プロジェクト: MikeDMorgan/gwas_pipeline

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    The last command line argument is taken as the input file.  The
    plot that is generated depends on ``--plot-type``:

    * ``map`` - the input table and the ``--coordinate-file`` table are
      passed to ``gwas.plotMapPhenotype``
    * ``pca`` - the input file and ``--fam-file`` are parsed with
      ``gwas.parseFlashPCA`` and passed to ``gwas.plotPCA``
    * any other plot type - the input table is passed to
      ``gwas.plotPhenotype``
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("-p", "--plot-type", dest="plot_type", type="choice",
                      choices=["histogram", "barplot", "density",
                               "boxplot", "scatter", "map",
                               "pca"],
                      help="the plot type to generate")

    parser.add_option("--plot-n-pc", dest="n_pcs", type="int",
                      help="The number of principal components to "
                      "plot")

    parser.add_option("-g", "--group-by", dest="group_by", type="string",
                      help="column header to group observations by")

    parser.add_option("-x", "--x-column", dest="x_col", type="string",
                      help="column to plot on X axis")

    parser.add_option("-y", "--y-column", dest="y_col", type="string",
                      help="column to plot on y axis")

    parser.add_option("-i", "--index_column", dest="indx", type="string",
                      help="column number that refers to the row index")

    parser.add_option("--output-file", dest="outfile", type="string",
                      help="path and filename to save plot to")

    parser.add_option("--labels", dest="labels", type="string",
                      help="a comma-separated list of axis labels. "
                      "The first 2 correspond to the X and Y-axis, "
                      "respectively, and the third is the plot title")

    parser.add_option("--metadata-file", dest="meta_file", type="string",
                      help="file containing metadata for annotating "
                      "plots with. Use `--group-labels` to define table "
                      "columns to use")

    parser.add_option("--fam-file", dest="fam_file", type="string",
                      help="Plink .fam file containing file IDs")

    parser.add_option("--xvar-labels", dest="xvar_labs", type="string",
                      help="a comma-separated list of variable X labels"
                      "only applies when X is a discrete or categorical "
                      "variable. The labels must be in the correct order")

    parser.add_option("--group-labels", dest="group_labs", type="string",
                      help="a comma-separated list of grouping variable "
                      "labels.  Can only apply when the grouping variable "
                      "is discrete or categorical.  The labels must be "
                      "input in the order of the data")

    parser.add_option("--yvar-labels", dest="yvar_labs", type="string",
                      help="a comma-separated list of variable Y labels"
                      "only applies when Y is a discrete or categorical "
                      "variable")

    parser.add_option("--var-type", dest="var_type", type="choice",
                      choices=["continuous", "categorical", "integer"],
                      help="The data type of the variables to be plotted."
                      "The default is continuous")

    parser.add_option("--coordinate-file", dest="coordinates", type="string",
                      help="file containing co-ordinates data")

    parser.add_option("--coords-id-col", dest="coord_ids", type="string",
                      help="column header containing individual IDs")

    parser.add_option("--lattitude-column", dest="lat_col", type="string",
                      help="column header containing lattitude co-ordinates")

    parser.add_option("--longitude-column", dest="long_col", type="string",
                      help="column header containing longitude co-ordinates")

    parser.add_option("--reference-value", dest="ref_val", type="string",
                      help="categorical variable level to dichotomise on")

    # defaults have to be registered before the command line is parsed,
    # otherwise they never reach the parsed ``options``
    parser.set_defaults(y_col=None,
                        group_by=None,
                        indx=None,
                        labels="X,Y,title",
                        xvar_labs=None,
                        yvar_labs=None,
                        var_type="continuous")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # the input file is expected as the last command line argument
    infile = argv[-1]

    # the input table is only read in the branches that use it; the
    # pca branch hands the file path to the FlashPCA parser instead
    if options.plot_type == "map":
        df = pd.read_table(infile, sep="\t", index_col=options.indx,
                           header=0)

        coords_df = pd.read_table(options.coordinates, sep="\t",
                                  header=0, index_col=options.indx)
        gwas.plotMapPhenotype(data=df,
                              coords=coords_df,
                              coord_id_col=options.coord_ids,
                              lat_col=options.lat_col,
                              long_col=options.long_col,
                              save_path=options.outfile,
                              xvar=options.x_col,
                              var_type=options.var_type,
                              xlabels=options.xvar_labs,
                              level=options.ref_val)

    elif options.plot_type == "pca":
        data = gwas.parseFlashPCA(pcs_file=infile,
                                  fam_file=options.fam_file)

        gwas.plotPCA(data=data,
                     nPCs=options.n_pcs,
                     point_labels=options.group_labs,
                     save_path=options.outfile,
                     headers=False,
                     metadata=options.meta_file,
                     multiplot=True)
    else:
        df = pd.read_table(infile, sep="\t", index_col=options.indx,
                           header=0)

        gwas.plotPhenotype(data=df,
                           plot_type=options.plot_type,
                           x=options.x_col,
                           y=options.y_col,
                           group=options.group_by,
                           save_path=options.outfile,
                           labels=options.labels,
                           xlabels=options.xvar_labs,
                           ylabels=options.yvar_labs,
                           glabels=options.group_labs,
                           var_type=options.var_type)

    # write footer and output benchmark information.
    E.Stop()

コード例 #4

ファイルを表示

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    The last command line argument is a single association results
    file or a comma-separated list of them.  The action depends on
    ``--task``:

    * ``get_hits`` - for each region returned by
      ``GWASResults.getHits`` write a tab-separated file into
      ``--output-directory``, named after the chromosome and the
      position and ID of the top ranked SNP
    * ``extract_results`` - write the results for the SNPs listed in
      ``--snp-set`` to ``options.stdout``
    * ``merge_freq`` - merge the results with the ``.frq`` files in
      ``--frequency-directory`` and write them to ``options.stdout``

    Raises IOError if no input file name is found in the last argument.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--task",
                      dest="task",
                      type="choice",
                      choices=["get_hits", "extract_results", "merge_freq"],
                      help="task to perform")

    parser.add_option("--p-threshold",
                      dest="p_threshold",
                      type="float",
                      help="threshold for association p-value, below "
                      "which results will be output")

    parser.add_option("--output-directory",
                      dest="outdir",
                      type="string",
                      help="output file directory")

    parser.add_option("--snp-set",
                      dest="snpset",
                      type="string",
                      help="file containing list of SNP per row to "
                      "extract from GWAS results")

    parser.add_option(
        "--frequency-directory",
        dest="freq_dir",
        type="string",
        help="Directory containing plink .frq files corresponding"
        " to all chromosomes")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # if the input is a list of files, split them.  Empty entries
    # (empty argument, stray commas) are discarded so that the
    # missing-input error below can actually be triggered
    infile = argv[-1]
    infiles = [fname for fname in infile.split(",") if fname]
    if len(infiles) > 1:
        results = gwas.GWASResults(assoc_file=infiles)
    elif len(infiles) == 1:
        results = gwas.GWASResults(assoc_file=infiles[0])
    else:
        raise IOError("no input files detected, please specifiy association "
                      "results files as the last command line argument")

    if options.task == "get_hits":
        hits = results.getHits(float(options.p_threshold))
        for name, region in hits:
            # rank the variants by the first test statistic column
            # that is available: CHISQ, then |STAT|, then |T|
            if "CHISQ" in region.columns:
                top_reg = region.sort_values(by="CHISQ", ascending=False)
            else:
                stat_col = "STAT" if "STAT" in region.columns else "T"
                # NOTE(review): as before, the absolute statistic is
                # stored in the STAT column of ``region`` itself and is
                # therefore part of the table written out below
                top_reg = region
                top_reg.loc[:, "STAT"] = abs(top_reg[stat_col])
                # always sort on the absolute value; sorting on the
                # signed T column would miss strong negative effects
                top_reg = top_reg.sort_values(by="STAT", ascending=False)

            top_bp = top_reg.iloc[0]["BP"]
            top_snp = top_reg.iloc[0]["SNP"]

            outname = "_".join(
                ["chr%s" % str(name),
                 str(top_bp), top_snp, "significant"])

            outfile = outname + ".tsv"
            out_file = "/".join([options.outdir, outfile])
            E.info("output association results from Chr%s to %s" %
                   (str(name), out_file))
            # this keeps outputing the first column as unamed: 0,
            # need to remove this.  Dropping is best-effort: a failure
            # is reported but does not stop the output being written
            try:
                region.drop([region.columns[0]], inplace=True, axis=1)
            except Exception as error:
                E.info("could not drop first column of results for "
                       "Chr%s: %s" % (str(name), error))

            region.to_csv(out_file, sep="\t", index=None)

    elif options.task == "extract_results":
        with IOTools.openFile(options.snpset, "r") as sfile:
            snpset = sfile.readlines()
            snpset = [snp.rstrip("\n") for snp in snpset]

        snp_df = results.extractSNPs(snpset)
        snp_df.dropna(axis=0, how='all', inplace=True)
        snp_df.drop_duplicates(subset=["SNP"], inplace=True)
        snp_df.to_csv(options.stdout, sep="\t", index=None)

    elif options.task == "merge_freq":
        # sequentially merge GWAS result with frequency data
        # to make file for GCTA joint analysis
        # raw string avoids the invalid "\S" escape; the dot is escaped
        # so that only a literal ".frq" suffix matches
        regex = re.compile(r"(\S+)\.frq$")
        cojo_df = results.mergeFrequencyResults(options.freq_dir,
                                                file_regex=regex)
        cojo_df.to_csv(options.stdout, sep="\t", index=None)
    else:
        pass

    # write footer and output benchmark information.
    E.Stop()

コード例 #5

ファイルを表示

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    The action depends on ``--task``:

    * ``mafs`` - ``gwas.countByVariantAllele`` on the ped/map files,
      written to ``options.stdout``
    * ``penetrance`` - ``gwas.calcPenetrance``; the penetrances are
      written to ``options.stdout`` and the summary to
      ``penetrance_summary.txt`` in the current working directory
    * ``allele_diff`` - ``gwas.calcMaxAlleleFreqDiff`` between the
      test and reference groups, written to ``options.stdout``
    * ``detect_duplicates`` - ``gwas.findDuplicateVariants`` on the
      file given as last command line argument; the three returned
      variant lists are written to ``<outfile-pattern>.triallelic``,
      ``.duplicates`` and ``.overlapping``
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option(
        "--task",
        dest="task",
        type="choice",
        choices=["mafs", "penetrance", "detect_duplicates", "allele_diff"],
        help="task to perform")

    parser.add_option("--ped-file",
                      dest="ped_file",
                      type="string",
                      help="plink format .ped file")

    parser.add_option("--map-file",
                      dest="map_file",
                      type="string",
                      help="plink format .map file")

    parser.add_option("--freq-file",
                      dest="mafs",
                      type="string",
                      help="text file containing populations minor "
                      "allele frequencies of variants.  One row per "
                      "variant with ID MAF")

    parser.add_option("--groups-file",
                      dest="group_file",
                      type="string",
                      help="file containing group labels for individuals "
                      "in the provided ped file")

    parser.add_option("--ref-label",
                      dest="ref_label",
                      type="string",
                      help="group label to be used as the reference case")

    parser.add_option("--test-label",
                      dest="test_label",
                      type="string",
                      help="group label to be used as the test case")

    parser.add_option("--subset",
                      dest="subset",
                      type="choice",
                      choices=["cases", "gender"],
                      help="subset the "
                      "data by either case/control or gender")

    parser.add_option("--take-last",
                      dest="take",
                      action="store_true",
                      help="if use duplicates will take the last variant, "
                      "default behaviour is to take the first")

    parser.add_option("--outfile-pattern",
                      dest="out_pattern",
                      type="string",
                      help="outfile pattern to use for finding duplicates "
                      "and triallelic variants")

    parser.add_option("--snp-set",
                      dest="snp_subset",
                      type="string",
                      help="list of SNPs to include")

    # defaults have to be registered before the command line is parsed
    # and must use the option's ``dest`` (``take`` for --take-last)
    parser.set_defaults(mafs=None, subset=None, take=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.task == "mafs":
        mafs = gwas.countByVariantAllele(options.ped_file, options.map_file)

        # ``index_col`` is a reader argument, not a ``to_csv`` one, and
        # is rejected by current pandas versions
        mafs.to_csv(options.stdout, sep="\t")

    elif options.task == "penetrance":
        summary, pens = gwas.calcPenetrance(options.ped_file,
                                            options.map_file,
                                            subset=options.subset,
                                            mafs=options.mafs,
                                            snpset=options.snp_subset)

        pens.to_csv(options.stdout, sep="\t", index_label="SNP")
        summary.to_csv("/".join([os.getcwd(), "penetrance_summary.txt"]),
                       sep="\t",
                       index_label="SNP")

    elif options.task == "allele_diff":
        allele_diffs = gwas.calcMaxAlleleFreqDiff(
            ped_file=options.ped_file,
            map_file=options.map_file,
            group_file=options.group_file,
            test=options.test_label,
            ref=options.ref_label)

        allele_diffs.to_csv(options.stdout, sep="\t")

    elif options.task == "detect_duplicates":
        # find variants with duplicated position and shared reference
        # allele indicative of triallelic variants - also same ID
        # ouput to a filter list
        infile = argv[-1]
        dups, tris, oves = gwas.findDuplicateVariants(bim_file=infile,
                                                      take_last=options.take)

        # absolute patterns are used as given, relative ones are
        # resolved against the current working directory
        if os.path.isabs(options.out_pattern):
            outpattern = options.out_pattern
        else:
            outpattern = os.path.abspath(options.out_pattern)

        # one filter list per class of variant, one variant per line
        for suffix, variants in ((".triallelic", tris),
                                 (".duplicates", dups),
                                 (".overlapping", oves)):
            with open(outpattern + suffix, "w") as ofile:
                for var in variants:
                    ofile.write("%s\n" % var)

    # write footer and output benchmark information.
    E.Stop()

コード例 #6

ファイルを表示

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    Runs the quality control step selected with ``--task`` through the
    matching ``gwas`` function.  Every task except ``flag_relations``
    writes the returned table tab-separated to ``options.stdout``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # option flags with their settings, registered in this order
    option_table = (
        (("-t", "--test"),
         dict(dest="test", type="string",
              help="supply help")),
        (("--task",),
         dict(dest="task", type="choice",
              choices=["merge_exclusions", "flag_hets",
                       "find_inbreds", "flag_relations",
                       "discordant_gender"],
              help="task to execute on phenotype file(s)")),
        (("--gender-check-file",),
         dict(dest="gender_check", type="string",
              help="output from gender checking "
              "by Plink, suffix should be .sexcheck")),
        (("--relationship-file",),
         dict(dest="relations", type="string",
              help="output file from IBS "
              "calculation.  Should contain all pairwise "
              "relationships.")),
        (("--inbreeding-coef-file",),
         dict(dest="inbreed_file", type="string",
              help="file containing either Plink "
              "or GCTA estimates of F, inbreeding coefficient")),
        (("--inbreeding-coefficient",),
         dict(dest="inbred_coeff", type="choice",
              choices=["Fhat1", "Fhat2", "Fhat3", "F",
                       "ibc"],
              help="inbreeding coefficient "
              "to use to identify highly inbred individuals")),
        (("--inbred-cutoff",),
         dict(dest="inbred_cutoff", type="float",
              help="threshold above which individuals are classed "
              "as inbred.")),
        (("--ibs-cutoff",),
         dict(dest="ibs_cutoff", type="float",
              help="IBS threshold to flag individuals as being "
              "closely related")),
        (("--trimmed-relationships",),
         dict(dest="rel_cutoff", type="string",
              help="output file from Plink "
              "--rel-cutoff with trimmed data set of unrelated "
              "individuals.")),
        (("--heterozygotes-file",),
         dict(dest="hets_file", type="string",
              help="file from heterozygote analysis containing observed "
              "homozygosity and F coefficients")),
        (("--auxillary-file",),
         dict(dest="aux_file", type="string",
              help="a file of IIDs and FIDs for individuals that are "
              "to be removed from analysis, unrelated to QC")),
        (("--plotting-path",),
         dict(dest="plot_path", type="string",
              help="PATH to save any plots to")),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    def write_table(table):
        # all tabular results go to stdout without the index column
        table.to_csv(options.stdout, index=None, sep="\t")

    task = options.task

    if task == "merge_exclusions":
        write_table(gwas.mergeQcExclusions(hets_file=options.hets_file,
                                           inbred_file=options.inbreed_file,
                                           related_file=options.relations,
                                           gender_file=options.gender_check,
                                           mask_file=options.aux_file))

    elif task == "flag_hets":
        # calculate heterozygosity rates, find and flag
        # individuals > 3 s.d. away from mean value
        # rate = (nonissing - homs) / nonmissing
        # i.e. non-homozygote rate
        write_table(gwas.flagExcessHets(options.hets_file,
                                        plot=True,
                                        plot_path=options.plot_path))

    elif task == "find_inbreds":
        write_table(gwas.flagInbred(
            inbred_file=options.inbreed_file,
            inbreeding_coefficient=options.inbred_coeff,
            ibc_threshold=options.inbred_cutoff,
            plot=True,
            plot_path=options.plot_path))

    elif task == "flag_relations":
        # the input file is likely to be huge! Ergo, read the file in chunks
        # calculate any related individuals and store them, store
        # an array of IBD values for plotting, drop the rest.
        # The returned value is not written out by this script.
        gwas.flagRelated(ibd_file=options.relations,
                         chunk_size=500000,
                         threshold=options.ibs_cutoff,
                         plot=True,
                         plotting_path=options.plot_path)

    elif task == "discordant_gender":
        write_table(gwas.flagGender(gender_file=options.gender_check,
                                    plot=True,
                                    plot_path=options.plot_path))

    # write footer and output benchmark information.
    E.Stop()

コード例 #7

ファイルを表示

ファイル: pheno2plot.py プロジェクト: MikeDMorgan/gwas_pipeline

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.

    The last command line argument is taken as the input file.  The
    plot that is generated depends on ``--plot-type``:

    * ``map`` - the input table and the ``--coordinate-file`` table are
      passed to ``gwas.plotMapPhenotype``
    * ``pca`` - the input file and ``--fam-file`` are parsed with
      ``gwas.parseFlashPCA`` and passed to ``gwas.plotPCA``
    * any other plot type - the input table is passed to
      ``gwas.plotPhenotype``
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("-p",
                      "--plot-type",
                      dest="plot_type",
                      type="choice",
                      choices=[
                          "histogram", "barplot", "density", "boxplot",
                          "scatter", "map", "pca"
                      ],
                      help="the plot type to generate")

    parser.add_option("--plot-n-pc",
                      dest="n_pcs",
                      type="int",
                      help="The number of principal components to "
                      "plot")

    parser.add_option("-g",
                      "--group-by",
                      dest="group_by",
                      type="string",
                      help="column header to group observations by")

    parser.add_option("-x",
                      "--x-column",
                      dest="x_col",
                      type="string",
                      help="column to plot on X axis")

    parser.add_option("-y",
                      "--y-column",
                      dest="y_col",
                      type="string",
                      help="column to plot on y axis")

    parser.add_option("-i",
                      "--index_column",
                      dest="indx",
                      type="string",
                      help="column number that refers to the row index")

    parser.add_option("--output-file",
                      dest="outfile",
                      type="string",
                      help="path and filename to save plot to")

    parser.add_option("--labels",
                      dest="labels",
                      type="string",
                      help="a comma-separated list of axis labels. "
                      "The first 2 correspond to the X and Y-axis, "
                      "respectively, and the third is the plot title")

    parser.add_option("--metadata-file",
                      dest="meta_file",
                      type="string",
                      help="file containing metadata for annotating "
                      "plots with. Use `--group-labels` to define table "
                      "columns to use")

    parser.add_option("--fam-file",
                      dest="fam_file",
                      type="string",
                      help="Plink .fam file containing file IDs")

    parser.add_option("--xvar-labels",
                      dest="xvar_labs",
                      type="string",
                      help="a comma-separated list of variable X labels"
                      "only applies when X is a discrete or categorical "
                      "variable. The labels must be in the correct order")

    parser.add_option("--group-labels",
                      dest="group_labs",
                      type="string",
                      help="a comma-separated list of grouping variable "
                      "labels.  Can only apply when the grouping variable "
                      "is discrete or categorical.  The labels must be "
                      "input in the order of the data")

    parser.add_option("--yvar-labels",
                      dest="yvar_labs",
                      type="string",
                      help="a comma-separated list of variable Y labels"
                      "only applies when Y is a discrete or categorical "
                      "variable")

    parser.add_option("--var-type",
                      dest="var_type",
                      type="choice",
                      choices=["continuous", "categorical", "integer"],
                      help="The data type of the variables to be plotted."
                      "The default is continuous")

    parser.add_option("--coordinate-file",
                      dest="coordinates",
                      type="string",
                      help="file containing co-ordinates data")

    parser.add_option("--coords-id-col",
                      dest="coord_ids",
                      type="string",
                      help="column header containing individual IDs")

    parser.add_option("--lattitude-column",
                      dest="lat_col",
                      type="string",
                      help="column header containing lattitude co-ordinates")

    parser.add_option("--longitude-column",
                      dest="long_col",
                      type="string",
                      help="column header containing longitude co-ordinates")

    parser.add_option("--reference-value",
                      dest="ref_val",
                      type="string",
                      help="categorical variable level to dichotomise on")

    # defaults have to be registered before the command line is parsed,
    # otherwise they never reach the parsed ``options``
    parser.set_defaults(y_col=None,
                        group_by=None,
                        indx=None,
                        labels="X,Y,title",
                        xvar_labs=None,
                        yvar_labs=None,
                        var_type="continuous")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # the input file is expected as the last command line argument
    infile = argv[-1]

    # the input table is only read in the branches that use it; the
    # pca branch hands the file path to the FlashPCA parser instead
    if options.plot_type == "map":
        df = pd.read_table(infile, sep="\t", index_col=options.indx, header=0)

        coords_df = pd.read_table(options.coordinates,
                                  sep="\t",
                                  header=0,
                                  index_col=options.indx)
        gwas.plotMapPhenotype(data=df,
                              coords=coords_df,
                              coord_id_col=options.coord_ids,
                              lat_col=options.lat_col,
                              long_col=options.long_col,
                              save_path=options.outfile,
                              xvar=options.x_col,
                              var_type=options.var_type,
                              xlabels=options.xvar_labs,
                              level=options.ref_val)

    elif options.plot_type == "pca":
        data = gwas.parseFlashPCA(pcs_file=infile, fam_file=options.fam_file)

        gwas.plotPCA(data=data,
                     nPCs=options.n_pcs,
                     point_labels=options.group_labs,
                     save_path=options.outfile,
                     headers=False,
                     metadata=options.meta_file,
                     multiplot=True)
    else:
        df = pd.read_table(infile, sep="\t", index_col=options.indx, header=0)

        gwas.plotPhenotype(data=df,
                           plot_type=options.plot_type,
                           x=options.x_col,
                           y=options.y_col,
                           group=options.group_by,
                           save_path=options.outfile,
                           labels=options.labels,
                           xlabels=options.xvar_labs,
                           ylabels=options.yvar_labs,
                           glabels=options.group_labs,
                           var_type=options.var_type)

    # write footer and output benchmark information.
    E.Stop()

コード例 #8

ファイルを表示

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The input files are wrapped in a ``gwas.FileGroup``, which is handed
    to the program object selected with ``--program``.  Every option whose
    ``dest`` starts with ``filt_`` and has a truthy value is forwarded to
    ``apply_filters``, the task selected with ``--method`` is registered
    on the program object, and finally ``build_statement`` is called.

    Raises
    ------
    ValueError
        if ``--program`` is not one of plink2, plinkdev or gcta.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--program",
                      dest="program",
                      type="choice",
                      choices=["plink2", "gcta", "plinkdev"],
                      help="program to execute genome-wide analysis")

    parser.add_option("--input-file-pattern",
                      dest="infile_pattern",
                      type="string",
                      help="file prefix that identifies a group of files")

    parser.add_option("--input-file-format",
                      dest="file_format",
                      type="choice",
                      choices=[
                          "plink", "plink_binary", "oxford", "oxford_binary",
                          "vcf", "GRM_binary", "GRM_gz", "GRM_plink"
                      ],
                      help="format of input files")

    parser.add_option("--phenotypes-file",
                      dest="pheno_file",
                      type="string",
                      help="text file of additional phenotypes")

    parser.add_option("--pheno",
                      dest="pheno",
                      type="string",
                      help="either phenotype file column header or number")

    parser.add_option("--covariates-file",
                      dest="covariate_file",
                      type="string",
                      help="file containing covariates.  Used as the "
                      "continuous covariates in GCTA-based analyses")

    parser.add_option("--covariate-column",
                      dest="covar_col",
                      type="string",
                      help="column number(s) or header(s) to include in "
                      "association model")

    parser.add_option("--discrete-covariates-file",
                      dest="covariate_discrete",
                      type="string",
                      help="file containing discrete covariates "
                      "to adjust for in GCTA-based analyses")

    parser.add_option("--association-model",
                      dest="assoc_model",
                      type="choice",
                      choices=["recessive", "dominant", "genotype"],
                      help="model to report from association analysis")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=[
                          "association", "summary", "format", "matrix", "reml",
                          "bivariate_reml", "pca", "lmm", "simulation",
                          "epistasis", "ld", "estimate_haplotypes"
                      ],
                      help="method to apply to genome-wide data")

    parser.add_option("--reml-method",
                      dest="reml_method",
                      type="choice",
                      choices=[
                          "standard", "priors", "reml_algorithm",
                          "unconstrained", "GxE", "LRT", "BLUP_EBV", "snpBLUP",
                          "no_residual", "fixed_cor"
                      ],
                      help="method for REML estimate of heritability method "
                      "including either single or dual phenotypes")

    parser.add_option("--reml-parameters",
                      dest="reml_param",
                      type="string",
                      help="comma separated list of parameters to pass to "
                      "REML variance components analysis")

    parser.add_option("--prevalence",
                      dest="prevalence",
                      type="float",
                      help="binary trait prevalence in a cohort study. "
                      "Used to estimate h2 on the liability threshold "
                      "scale.")

    parser.add_option("--lmm-method",
                      dest="lmm_method",
                      type="choice",
                      choices=["standard", "loco", "no_covar"],
                      help="type of linear mixed model analysis to run")

    parser.add_option("--grm-prefix",
                      dest="grm_prefix",
                      type="string",
                      help="prefix of the pre-computed GRM files to use "
                      "in the linear mixed model analysis")

    parser.add_option(
        "--epistasis-method",
        dest="epi_method",
        type="choice",
        choices=["fast_epistasis", "epistasis", "two_locus", "adjusted"],
        help="epistasis method to use")

    parser.add_option("--epistasis-parameter",
                      dest="epi_param",
                      type="string",
                      help="modifiers of epistasis functions")

    parser.add_option("--epistasis-threshold",
                      dest="epi_sig",
                      type="string",
                      help="statistical significance threshold for counting "
                      "interactions")

    parser.add_option("--epistasis-report-threshold",
                      dest="epi_report",
                      type="string",
                      help="threshold used to count the "
                      "proportion of statistically significant interactions")

    parser.add_option("--set-file",
                      dest="set_file",
                      type="string",
                      help="file containing variant sets as per Plink "
                      ".set file specification")

    parser.add_option("--set-method",
                      dest="set_method",
                      type="choice",
                      choices=["set-by-all", "set-by-set"],
                      help="set method to use when `set_file` provided")

    parser.add_option("--principal-components",
                      dest="num_pcs",
                      type="int",
                      help="the number of principal components to output")

    parser.add_option("--matrix-shape",
                      dest="matrix_shape",
                      type="choice",
                      choices=["triangle", "square", "square0"],
                      help="output matrix shape.",
                      default="triangle")

    parser.add_option("--matrix-compression",
                      dest="matrix_compress",
                      type="choice",
                      choices=["gz", "bin", "bin4"],
                      help="compression to apply to output matrix")

    parser.add_option("--matrix-form",
                      dest="matrix_form",
                      type="choice",
                      choices=["distance", "grm"],
                      help="type of relationship matrix to calculate")

    parser.add_option(
        "--matrix-metric",
        dest="matrix_metric",
        type="choice",
        choices=["fhat", "cov", "ibc2", "ibc3", "ibs", "genomic", "hamming"],
        help="value to calculate for diagonal elements of the "
        "grm. Default is fhat for grm and hamming for distance.")

    parser.add_option(
        "--matrix-options",
        dest="matrix_options",
        type="string",
        help="modifiers of matrix output, see plink documentation "
        "for details")

    parser.add_option("--association-method",
                      dest="assoc_method",
                      type="choice",
                      choices=["linear", "logistic", "assoc", "qassoc"],
                      help="association analysis to run")

    parser.add_option(
        "--permutation",
        dest="permutation",
        action="store_true",
        help="perform association testing by permutation analysis")

    parser.add_option("--repeats",
                      dest="n_perms",
                      type="int",
                      help="number of repetitions for permutation analysis")

    parser.add_option("--association-options",
                      dest="assoc_option",
                      type="string",
                      help="association analysis modifiers")

    parser.add_option("--format-method",
                      dest="format_method",
                      type="choice",
                      choices=[
                          "change_format", "change_missing_values",
                          "update_variants", "update_samples", "flip_strands",
                          "flip_scan", "sort", "merge", "find_duplicates"
                      ],
                      help="file formatting to apply to input files")

    parser.add_option("--format-parameter",
                      dest="format_param",
                      type="string",
                      help="formatting parameter, where appropriate")

    parser.add_option(
        "--reformat-type",
        dest="reformat",
        type="choice",
        choices=["plink", "plink_binary", "oxford", "oxford_binary", "raw"],
        help="new format of input files to be reformatted to")

    parser.add_option("--apply-missing",
                      dest="apply_missing",
                      type="choice",
                      choices=["genotype", "phenotype"],
                      help="genotype or phenotype missing values to alter")

    parser.add_option("--update-variant-attribute",
                      dest="variant_update",
                      type="choice",
                      choices=[
                          "variant_ids", "missing_id", "chromosome",
                          "centimorgan", "name", "alleles", "map"
                      ],
                      help="update variant attributes")

    parser.add_option("--update-sample-attribute",
                      dest="sample_update",
                      type="choice",
                      choices=["sample_ids", "parents", "gender"],
                      help="sample attributes to be updated")

    parser.add_option("--strand-flip-subset",
                      dest="flip_subset",
                      action="store_true",
                      help="apply strand flipping to a subset of samples")

    parser.add_option("--flip-scan-type",
                      dest="scan_param",
                      type="choice",
                      choices=["default", "window", "threshold"],
                      help="strand flipping scan to apply to SNPs")

    parser.add_option("--sort-type",
                      dest="sort_type",
                      type="choice",
                      choices=["none", "natural", "ascii", "file"],
                      help="sort type to input files")

    parser.add_option("--merge-file-format",
                      dest="merge_format",
                      type="choice",
                      choices=["plink", "plink_binary"],
                      help="format of input files to be merged")

    parser.add_option(
        "--merge-mode",
        dest="merge_mode",
        type="choice",
        choices=[
            "default", "original_missing", "new_nonmissing", "no_overwrite",
            "force", "report_all", "report_nonmissing"
        ],
        help="merge mode to apply to dealing with merge conflicts")

    parser.add_option("--duplicates-method",
                      dest="dup_method",
                      type="choice",
                      choices=["same_ref", "id_match", "suppress_first"],
                      help="method for identifying and dealing with duplicate "
                      "variants")

    parser.add_option("--summary-method",
                      dest="summary_method",
                      type="choice",
                      choices=[
                          "allele_frequency", "missing_data", "hardy_weinberg",
                          "mendel_errors", "inbreeding", "gender_checker",
                          "wrights_fst", "case_control_fst"
                      ],
                      help="summary statistics to calculate")

    parser.add_option("--summary-parameter",
                      dest="sum_param",
                      type="string",
                      help="optional parameters that can be passed to summary "
                      "statistics methods")

    parser.add_option("--haplotype-frequency",
                      dest="filt_haplotype_frequency",
                      type="string",
                      help="min allele frequency for SNPs to be "
                      "considered for a haplotype")

    parser.add_option("--haplotype-size",
                      dest="filt_haplotype_size",
                      type="string",
                      help="maximum genomic size of "
                      "haplotypes")

    parser.add_option("--ld-statistic",
                      dest="ld_stat",
                      type="choice",
                      choices=["r", "r2"],
                      help="compute either the raw "
                      "inter variant allele count correlation, R, or the "
                      "squared correlation, R^2")

    parser.add_option("--ld-min",
                      dest="ld_min",
                      type="string",
                      help="minimum value to report for pair-wise LD "
                      "calculations.  Beware output files may be very "
                      "large if `ld_min` is very small.")

    parser.add_option("--ld-window",
                      dest="ld_window",
                      type="string",
                      help="distance between SNPs, beyond which LD will "
                      "not be calculated")

    parser.add_option("--ld-format-output",
                      dest="ld_shape",
                      type="choice",
                      choices=["square", "table", "triangle", "square0"],
                      help="output either as table, or matrix format with a "
                      "specific shape.")

    parser.add_option(
        "--genotype-rate",
        dest="filt_genotype_rate",
        type="string",
        help="genotyping rate threshold.  SNPs below this threshold "
        "will be excluded from analysis")

    parser.add_option("--indiv-missing",
                      dest="filt_missingness",
                      type="string",
                      help="individual missingness rate.  Individuals below "
                      "this threshold will be excluded from analysis")

    parser.add_option("--hardy-weinberg",
                      dest="filt_hwe",
                      type="string",
                      help="hardy-weinberg p-value threshold for SNPs.  SNPs "
                      "with a 2df chisquared p-value below this will be "
                      "filtered out")

    parser.add_option(
        "--min-allele-frequency",
        dest="filt_min_allele_frequency",
        type="string",
        help="only include SNPs with an allele frequency equal to "
        "or above this threshold")

    parser.add_option(
        "--max-allele-frequency",
        dest="filt_max_allele_frequency",
        type="string",
        help="only include SNPs with an allele frequency equal to "
        "or below this threshold")

    parser.add_option(
        "--mendelian-error",
        dest="filt_mendelian_error",
        type="string",
        help="exclude individuals/trios with mendelian errors that "
        "exceed this value")

    parser.add_option("--keep-individuals",
                      dest="filt_keep",
                      type="string",
                      help="a file containing individuals IDs to keep, "
                      "one per row")

    parser.add_option("--remove-individuals",
                      dest="filt_remove",
                      type="string",
                      help="a file of individual IDs to remove, one per row")

    # NOTE(review): dest is spelled "qaul" (cf. "filt_max_qual_score"
    # below), so the filter name forwarded to apply_filters is
    # "min_qaul_score" - confirm which spelling apply_filters expects
    # before renaming.
    parser.add_option("--min-quality-score",
                      dest="filt_min_qaul_score",
                      type="string",
                      help="reset the minimum low bound of quality scores for "
                      "variants in a VCF file.  Default is 0")

    parser.add_option(
        "--max-quality-score",
        dest="filt_max_qual_score",
        type="string",
        help="reset the maximum upper bound of quality scores for "
        "a VCCF file.  Default is Inf")

    parser.add_option("--allow-no-gender",
                      dest="filt_allow_no_sex",
                      type="string",
                      help="allow individuals with gender missing")

    parser.add_option("--enforce-gender",
                      dest="filt_enforce_sex",
                      type="string",
                      help="only include individuals with non-missing gender "
                      "information")

    parser.add_option("--subset-filter",
                      dest="filt_subset_filter",
                      type="choice",
                      choices=[
                          "cases", "controls", "males", "females", "founders",
                          "nonfounders"
                      ],
                      help="only apply filters to the specific subset of "
                      "individuals supplied")

    parser.add_option(
        "--extract-snps",
        dest="filt_extract",
        type="string",
        help="text file of variant IDs to include in the analysis, "
        "ignoring all others")

    parser.add_option("--exclude-snps",
                      dest="filt_exclude",
                      type="string",
                      help="a file of variant IDs to exclude from analysis")

    parser.add_option("--restrict-chromosome",
                      dest="filt_chromosome",
                      type="string",
                      help="restict analysis to either a single chromosome, "
                      "or a comma-separated list of chromosomes")

    parser.add_option("--exclude-chromosomes",
                      dest="filt_exclude_chromosome",
                      type="string",
                      help="exclude all variants on these "
                      "chromosome(s)")

    parser.add_option(
        "--autosome-only",
        dest="filt_autosome",
        action="store_true",
        help="if present only autosomal variants will be analysed")

    parser.add_option(
        "--pseudo-autosome",
        dest="filt_pseudo_autosome",
        action="store_true",
        help="include on the pseudo-autosomal region of chromosome X")

    parser.add_option("--ignore-indels",
                      dest="filt_ignore_indels",
                      action="store_true",
                      help="only include bi-allelic single nucleotide "
                      "variants in analysis")

    parser.add_option(
        "--snp-range",
        dest="filt_snp_bp_range",
        type="string",
        help="comma separated list of from, to genome co-ordinates "
        "within which to include variants for analysis")

    parser.add_option(
        "--conditional-snp",
        dest="filt_conditional_snp",
        type="string",
        help="condition the analysis on this SNP ID.  Can only be "
        "used in the linear and logistic regression models.")

    parser.add_option("--snp-id-range",
                      dest="filt_snp_id_range",
                      type="string",
                      help="comma separate list of IDs from, to within which "
                      "to include variants for analysis.")

    parser.add_option("--snp-id",
                      dest="filt_specific_snp",
                      type="string",
                      help="include a single snp in the analysis given by "
                      "it's variant ID.")

    parser.add_option("--exclude-variant",
                      dest="filt_exclude_snp",
                      type="string",
                      help="exclude a single variant from the analysis, "
                      "given by it's variant ID")

    parser.add_option(
        "--covariate-filter",
        dest="filt_covariate_filter",
        type="string",
        help="covariate column headers or column numbers on which "
        "to filter on. Requries --covariate-file")

    parser.add_option(
        "--filter-parameter",
        dest="param",
        type="string",
        help="parameter values to be passed to filtering function")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="string",
                      help="alters the behaviour of the --snp-range and "
                      "--include/exclude snp options.  variants within +/- "
                      "half * window_size (kb) are included")

    parser.add_option(
        "--range-resolution",
        dest="filt_range_resolution",
        type="choice",
        choices=["bp", "kb", "mb"],
        help="alters the (from, to) range resolution to either bp, "
        "kb or mb")

    parser.add_option(
        "--output-file-pattern",
        dest="out_pattern",
        type="string",
        help="output file pattern prefix. file suffixes are dependent "
        "on the task executed")

    parser.add_option("--threads",
                      dest="threads",
                      type="int",
                      help="the number of threads to use for multi-threaded "
                      "processes")

    parser.add_option("--memory",
                      dest="memory",
                      type="string",
                      help="amount of memory to reserve for the task")

    parser.add_option("--parallel",
                      dest="parallel",
                      type="int",
                      help="number of jobs to split task into")

    # Defaults have to be registered before the command line is parsed;
    # values set afterwards never reach the returned ``options`` object
    # (``random_seed`` in particular has no add_option of its own and
    # would otherwise be missing altogether).
    parser.set_defaults(sum_param=None,
                        dup_method="same_ref",
                        n_perms=None,
                        permutation=False,
                        matrix_shape="triangle",
                        matrix_options=None,
                        matrix_compress="gz",
                        random_seed=random.randint(0, 19999),
                        sample_update=None,
                        memory="60G",
                        parallel=None,
                        covariate_file=None,
                        covar_col=None,
                        epi_report=0.001,
                        epi_sig=0.001)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # without --input-file-pattern the last command line argument is
    # taken as a comma-separated list of input files
    if not options.infile_pattern:
        infiles = (argv[-1]).split(",")
    else:
        infiles = options.infile_pattern

    # create a new filegroup object
    geno_files = gwas.FileGroup(files=infiles,
                                file_format=options.file_format,
                                genotype_format="imputed")
    if options.pheno_file:
        geno_files.set_phenotype(pheno_file=options.pheno_file,
                                 pheno=options.pheno)

    # add FileGroup object to the gwas program object
    program_classes = {"plink2": gwas.Plink2,
                       "plinkdev": gwas.PlinkDev,
                       "gcta": gwas.GCTA}
    if options.program not in program_classes:
        # fail here with a clear message instead of a NameError on
        # ``gwas_object`` further down
        raise ValueError("--program must be one of plink2, plinkdev or "
                         "gcta, got %r" % (options.program,))

    gwas_object = program_classes[options.program](files=geno_files)
    gwas_object.program_call(infiles=geno_files,
                             outfile=options.out_pattern)

    # collect filtering options from options
    opt_dict = options.__dict__
    prefix = "filt_"
    filter_dict = {k: v for k, v in opt_dict.items()
                   if k.startswith(prefix) and v}

    # iteratively add genotype filters to GWASProgram object
    for fkey, filter_value in filter_dict.items():
        # slice the prefix off: str.lstrip() removes a *set of characters*
        # and would turn "filt_ignore_indels" into "gnore_indels"
        gwas_object.apply_filters(filter_type=fkey[len(prefix):],
                                  filter_value=filter_value)

    # handle summary statistics
    if options.method == "summary":
        # the --summary-method choices are exactly the keyword names
        # understood by _output_statistics
        if options.summary_method:
            gwas_object._output_statistics(
                **{options.summary_method: options.sum_param})
    elif options.method == "pca":
        gwas_object.PCA(n_pcs=options.num_pcs)
    elif options.method == "ld":
        gwas_object.calc_ld(ld_statistic=options.ld_stat,
                            ld_threshold=float(options.ld_min),
                            ld_shape=options.ld_shape)
    elif options.method == "association":
        gwas_object.run_association(association=options.assoc_method,
                                    permutation=options.permutation,
                                    n_perms=options.n_perms,
                                    random_seed=options.random_seed,
                                    covariates_file=options.covariate_file,
                                    covariates=options.covar_col)
    elif options.method == "estimate_haplotypes":
        gwas_object._run_tasks(estimate_haplotypes="haplotype")
    elif options.method == "lmm":
        print(options.lmm_method)
        gwas_object.mixed_model(lmm_method=options.lmm_method,
                                grm=options.grm_prefix,
                                qcovar=options.covariate_file,
                                dcovar=options.covariate_discrete)
    elif options.method == "epistasis":
        gwas_object._detect_interactions(
            method=options.epi_method,
            modifier=options.epi_param,
            set_file=options.set_file,
            set_mode=options.set_method,
            report_threshold=options.epi_report,
            sig_threshold=options.epi_sig,
            covariates_file=options.covariate_file,
            covariates=options.covar_col)
    elif options.method == "reml":
        gwas_object.reml_analysis(method=options.reml_method,
                                  parameters=options.reml_param,
                                  prevalence=options.prevalence,
                                  qcovariates=options.covariate_file,
                                  discrete_covar=options.covariate_discrete)
    elif options.method == "format":
        if options.format_method == "change_format":
            # adding filtering options to plink requires the --make-bed flag
            if options.sample_update:
                E.info("updating samples from %s" % options.format_param)
            gwas_object._run_tasks(change_format=options.reformat,
                                   parameter=options.format_param)
            if options.sample_update:
                gwas_object._run_tasks(
                    update_samples=options.sample_update,
                    parameter=options.format_param)
        elif options.format_method == "change_missing_values":
            gwas_object._run_tasks(change_missing_values=options.apply_missing,
                                   parameter=options.format_param)
        elif options.format_method == "update_variants":
            gwas_object._run_tasks(update_variants=options.variant_update,
                                   parameter=options.format_param)
            gwas_object._run_tasks(change_format=options.file_format)
        elif options.format_method == "update_samples":
            gwas_object._run_tasks(update_samples=options.sample_update,
                                   parameter=options.format_param)
        elif options.format_method == "flip_strands":
            if options.flip_subset:
                gwas_object._run_tasks(flip_strands="subset",
                                       parameter=options.format_param)
            else:
                gwas_object._run_tasks(flip_strands="all_samples",
                                       parameter=options.format_param)
        elif options.format_method == "flip_scan":
            gwas_object._run_tasks(flip_scan=options.scan_param,
                                   parameter=options.format_param)
        elif options.format_method == "sort":
            gwas_object._run_tasks(sort=options.sort_type,
                                   parameter=options.format_param)
        elif options.format_method == "merge":
            if options.merge_mode:
                gwas_object._run_tasks(merge_mode=options.merge_mode,
                                       parameter=options.format_param)
            else:
                gwas_object._run_tasks(merge=options.merge_format,
                                       parameter=options.format_param)
        elif options.format_method == "find_duplicates":
            gwas_object._run_tasks(find_duplicates=options.dup_method,
                                   parameter=options.format_param)
    elif options.method == "matrix":
        if options.matrix_form == "distance":
            if options.matrix_metric == "hamming":
                gwas_object.hamming_matrix(shape=options.matrix_shape,
                                           compression=options.matrix_compress,
                                           options=options.matrix_options)
            elif options.matrix_metric == "ibs":
                gwas_object.ibs_matrix(shape=options.matrix_shape,
                                       compression=options.matrix_compress,
                                       options=options.matrix_options)
            elif options.matrix_metric == "genomic":
                gwas_object.genome_matrix(shape=options.matrix_shape,
                                          compression=options.matrix_compress,
                                          options=options.matrix_options)
        elif options.matrix_form == "grm":
            gwas_object.genetic_relationship_matrix(
                shape=options.matrix_shape,
                compression=options.matrix_compress,
                metric=options.matrix_metric,
                options=options.matrix_options)

    gwas_object.build_statement(infiles=geno_files,
                                outfile=options.out_pattern,
                                threads=options.threads,
                                memory=options.memory,
                                parallel=options.parallel)

    # write footer and output benchmark information.
    E.Stop()

コード例 #9

ファイルを表示

def main(argv=None):
    """Plot GWAS association results (script entry point).

    Parses command line options in sys.argv, unless *argv* is given.
    The last command line argument is taken as the association results
    file, or a comma-separated list of such files.

    Depending on ``--plot-type`` a Manhattan, QQ or epistasis plot is
    generated.  Only the Manhattan plot returns a table, which is then
    written tab-separated to ``options.stdout``.

    Raises
    ------
    IOError
        if the last command line argument names no input file.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--plot-type", dest="plot_type", type="choice",
                      choices=["manhattan", "qqplot", "epistasis"],
                      help="plot type to generate")

    parser.add_option("--resolution", dest="resolution", type="choice",
                      choices=["genome_wide", "chromosome",
                               "fine_map"],
                      help="the resolution of plotting, whether the plot "
                      "depicts the whole genome, a single chromosome or "
                      "a specific locus")

    parser.add_option("--save-path", dest="save_path", type="string",
                      help="path and filename to save image to")

    # defaults have to be registered *before* the command line is parsed;
    # set afterwards they never make it into ``options``
    parser.set_defaults(resolution="genome_wide",
                        plot_type="manhattan")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # if the input is a list of files, split them; empty entries are
    # dropped so that a missing input is actually detected below
    infile = argv[-1]
    infiles = [fname for fname in infile.split(",") if fname]

    # need to parse epistasis output slightly differently
    epi = options.plot_type == "epistasis"

    if not infiles:
        raise IOError("no input files detected, please specify association "
                      "results files as the last command line argument")
    elif len(infiles) > 1:
        results = gwas.GWASResults(assoc_file=infiles,
                                   epistasis=epi)
    else:
        # a single file is passed as a plain string, not a list
        results = gwas.GWASResults(assoc_file=infiles[0],
                                   epistasis=epi)

    # only the Manhattan plot hands back a table of results
    df = None
    if options.plot_type == "manhattan":
        df = results.plotManhattan(resolution=options.resolution,
                                   save_path=options.save_path)
    elif options.plot_type == "qqplot":
        results.plotQQ(save_path=options.save_path,
                       resolution=options.resolution)
    elif options.plot_type == "epistasis":
        results.plotEpistasis(save_path=options.save_path,
                              resolution=options.resolution)

    # only output appended results for Manhattan plot, not qqplot.
    # ``index_col`` is a read_csv keyword that to_csv does not accept,
    # so it is dropped here; the index is written with pandas' default.
    # NOTE(review): confirm whether the index column is wanted downstream.
    if df is not None:
        df.to_csv(options.stdout, sep="\t")

    # write footer and output benchmark information.
    E.Stop()

コード例 #10

ファイルを表示

ファイル: snps2architecture.py プロジェクト: MikeDMorgan/gwas_pipeline

def main(argv=None):
    """Summarise and plot risk allele carriage against phenotype.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads a GWAS results table (``--gwas-file``), a Plink ped file
    (``--ped-file``) and map file (``--map-file``), counts risk alleles
    per individual via ``gwas.countRiskAlleles``, plots the statistic
    selected with ``--plot-statistic`` and writes the accompanying
    table tab-separated to ``options.stdout``.

    Raises
    ------
    ValueError
        if ``--plot-statistic`` was not supplied; the output table is
        built from the plotting result, so one of the two is required.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=["cases_explained", "probability_phenotype"],
                      help="Which results to report, either the proportion "
                      "of cases explained or the probability of the "
                      "phenotype given the number of alleles carried")

    parser.add_option("--map-file",
                      dest="map_file",
                      type="string",
                      help="plink .map file with SNP positions")

    parser.add_option("--ped-file",
                      dest="ped_file",
                      type="string",
                      help="plink ped file with phenotype and "
                      "genotype data - A2 major allele coded")

    parser.add_option("--gwas-file",
                      dest="gwas",
                      type="string",
                      help="gwas results file, assumes Plink "
                      "output format.  Must contain SNP, BP, "
                      "OR column headers.  Assumes results relate "
                      "to the A1 allele")

    parser.add_option("--flip-alleles",
                      dest="flip",
                      action="store_true",
                      help="force alleles to flip if OR < 1")

    parser.add_option("--plot-statistic",
                      dest="plot_stat",
                      type="choice",
                      choices=["frequency", "cumulative"],
                      help="plot either cases frequency or cumulative "
                      "frequency of cases")

    parser.add_option("--plot-path",
                      dest="plot_path",
                      type="string",
                      help="save path for plot")

    parser.add_option("--flag-explained-recessive",
                      dest="explained",
                      action="store_true",
                      help="flag individuals explained by carriage of "
                      "2 risk alleles - NOT IMPLEMENTED")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # fail early and clearly: without a plot statistic there is no
    # plotting result to hang the output table on
    if options.plot_stat not in ("frequency", "cumulative"):
        raise ValueError("--plot-statistic is required, choose either "
                         "'frequency' or 'cumulative'")

    # required files are .ped file, .map file and gwas results file
    E.info("reading GWAS results file: %s" % options.gwas)
    snp_df = pd.read_table(options.gwas, sep="\t", header=0, index_col=None)
    snp_list = snp_df["SNP"].values

    # parse ped file
    E.info("Reading ped file: %s" % options.ped_file)
    ped_df = gwas.parsePed(options.ped_file, compound_geno=True)

    # parse map file and get SNP indices that correspond to
    # ped file genotypes
    E.info("Fetching SNPs from map file: %s" % options.map_file)
    snp_index = gwas.getSNPs(options.map_file, snp_list)

    E.info("SNPs found: %i" % len(snp_index))
    # extract SNPs and ORs as key, value pairs
    or_dict = snp_df.loc[:, ["SNP", "OR"]].to_dict(orient='list')
    snp_or = dict(zip(or_dict["SNP"], or_dict["OR"]))

    if options.flip:
        E.info("Flipping major alleles to risk alleles")
        flipped_genos = gwas.flipRiskAlleles(snp_index=snp_index,
                                             snp_results=snp_or,
                                             genos=ped_df["GENOS"].tolist())
        # merge flipped genotypes with pedigree frame to get phenotypes
        geno_df = pd.DataFrame(flipped_genos, index=ped_df["FID"])
    else:
        # split genos into a dataframe
        genos = np.array(ped_df["GENOS"].tolist())
        geno_df = pd.DataFrame(genos, index=ped_df["FID"])

    merged = pd.merge(geno_df, ped_df, left_index=True, right_on="FID")

    # need to discount missing genotypes > 1%

    # frequencies of number of risk alleles by trait frequency
    E.info("count #risk alleles per individual")
    risk_results = gwas.countRiskAlleles(ped_frame=merged,
                                         snp_index=snp_index.values(),
                                         report=options.method,
                                         flag=options.explained)
    risk_freqs = risk_results["freqs"]
    cumulative = risk_results["cumulative"]

    # select results up to and including cumulative freq = 1.0.
    # A tight absolute tolerance is used because a running sum of
    # frequencies may only reach 1.0 up to floating point rounding; if
    # it is never reached, every bin is kept instead of raising IndexError
    reached_one = [fx for fx, fy in enumerate(cumulative)
                   if np.isclose(fy, 1.0, rtol=0.0, atol=1e-9)]
    max_indx = reached_one[0] if reached_one else len(cumulative) - 1
    max_freqs = risk_freqs[:max_indx + 1]
    max_cum = cumulative[:max_indx + 1]
    bins = list(range(max_indx + 1))

    # plot!
    if options.plot_stat == "frequency":
        E.info("Generating plot of #risk alleles vs. P(Phenotype)")
        hist_df = gwas.plotRiskFrequency(bins=bins,
                                         frequencies=max_freqs,
                                         savepath=options.plot_path,
                                         ytitle="P(Phenotype)")
    else:
        # options.plot_stat == "cumulative", validated above
        E.info("Generating plot of #risk alleles vs. cumulative frequency")
        hist_df = gwas.plotRiskFrequency(bins=bins,
                                         frequencies=max_cum,
                                         savepath=options.plot_path,
                                         ytitle="Cumulative frequency cases")

    hist_df["freq"] = risk_results["freqs"][:max_indx + 1]
    hist_df["cumulative"] = risk_results["cumulative"][:max_indx + 1]
    hist_df["cases"] = risk_results["cases"][:max_indx + 1]
    hist_df["controls"] = risk_results["controls"][:max_indx + 1]
    hist_df["total"] = hist_df["cases"] + hist_df["controls"]
    hist_df.to_csv(options.stdout, sep="\t", index=None)

    # write footer and output benchmark information.
    E.Stop()

コード例 #11

ファイルを表示

ファイル: snps2architecture.py プロジェクト: MikeDMorgan/gwas_pipeline

def main(argv=None):
    """Summarise and plot risk allele carriage against phenotype.

    Parses command line options in sys.argv, unless *argv* is given.

    The GWAS results (``--gwas-file``), Plink ped (``--ped-file``) and
    map (``--map-file``) files are combined, risk alleles are counted
    per individual with ``gwas.countRiskAlleles``, the statistic chosen
    by ``--plot-statistic`` is plotted and the matching table is written
    tab-separated to ``options.stdout``.

    Raises
    ------
    ValueError
        if ``--plot-statistic`` was not supplied; the output table is
        derived from the plotting result, so a choice is mandatory.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--method", dest="method", type="choice",
                      choices=["cases_explained", "probability_phenotype"],
                      help="Which results to report, either the proportion "
                      "of cases explained or the probability of the "
                      "phenotype given the number of alleles carried")

    parser.add_option("--map-file", dest="map_file", type="string",
                      help="plink .map file with SNP positions")

    parser.add_option("--ped-file", dest="ped_file", type="string",
                      help="plink ped file with phenotype and "
                      "genotype data - A2 major allele coded")

    parser.add_option("--gwas-file", dest="gwas", type="string",
                      help="gwas results file, assumes Plink "
                      "output format.  Must contain SNP, BP, "
                      "OR column headers.  Assumes results relate "
                      "to the A1 allele")

    parser.add_option("--flip-alleles", dest="flip", action="store_true",
                      help="force alleles to flip if OR < 1")

    parser.add_option("--plot-statistic", dest="plot_stat", type="choice",
                      choices=["frequency", "cumulative"],
                      help="plot either cases frequency or cumulative "
                      "frequency of cases")

    parser.add_option("--plot-path", dest="plot_path", type="string",
                      help="save path for plot")

    parser.add_option("--flag-explained-recessive", dest="explained",
                      action="store_true",
                      help="flag individuals explained by carriage of "
                      "2 risk alleles - NOT IMPLEMENTED")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # the table written at the end is built on the plotting result, so
    # refuse to start the (expensive) analysis without a plot statistic
    if options.plot_stat not in ("frequency", "cumulative"):
        raise ValueError("--plot-statistic is required, choose either "
                         "'frequency' or 'cumulative'")

    # required files are .ped file, .map file and gwas results file
    E.info("reading GWAS results file: %s" % options.gwas)
    snp_df = pd.read_table(options.gwas, sep="\t", header=0,
                           index_col=None)
    snp_list = snp_df["SNP"].values

    # parse ped file
    E.info("Reading ped file: %s" % options.ped_file)
    ped_df = gwas.parsePed(options.ped_file,
                           compound_geno=True)

    # parse map file and get SNP indices that correspond to
    # ped file genotypes
    E.info("Fetching SNPs from map file: %s" % options.map_file)
    snp_index = gwas.getSNPs(options.map_file,
                             snp_list)

    E.info("SNPs found: %i" % len(snp_index))
    # extract SNPs and ORs as key, value pairs
    or_dict = snp_df.loc[:, ["SNP", "OR"]].to_dict(orient='list')
    snp_or = dict(zip(or_dict["SNP"], or_dict["OR"]))

    if options.flip:
        E.info("Flipping major alleles to risk alleles")
        flipped_genos = gwas.flipRiskAlleles(snp_index=snp_index,
                                             snp_results=snp_or,
                                             genos=ped_df["GENOS"].tolist())
        # merge flipped genotypes with pedigree frame to get phenotypes
        geno_df = pd.DataFrame(flipped_genos, index=ped_df["FID"])
    else:
        # split genos into a dataframe
        genos = np.array(ped_df["GENOS"].tolist())
        geno_df = pd.DataFrame(genos, index=ped_df["FID"])

    merged = pd.merge(geno_df, ped_df, left_index=True, right_on="FID")

    # need to discount missing genotypes > 1%

    # frequencies of number of risk alleles by trait frequency
    E.info("count #risk alleles per individual")
    risk_results = gwas.countRiskAlleles(ped_frame=merged,
                                         snp_index=snp_index.values(),
                                         report=options.method,
                                         flag=options.explained)
    risk_freqs = risk_results["freqs"]
    cumulative = risk_results["cumulative"]

    # select results up to and including cumulative freq = 1.0.
    # An exact ``== 1.0`` test can miss a running sum that is only 1.0
    # up to rounding, so compare with a tight absolute tolerance and
    # keep every bin if the total is never reached
    max_indx = len(cumulative) - 1
    for fx, fy in enumerate(cumulative):
        if np.isclose(fy, 1.0, rtol=0.0, atol=1e-9):
            max_indx = fx
            break

    max_freqs = risk_freqs[:max_indx + 1]
    max_cum = cumulative[:max_indx + 1]
    bins = list(range(max_indx + 1))

    # plot!
    if options.plot_stat == "frequency":
        E.info("Generating plot of #risk alleles vs. P(Phenotype)")
        hist_df = gwas.plotRiskFrequency(bins=bins,
                                         frequencies=max_freqs,
                                         savepath=options.plot_path,
                                         ytitle="P(Phenotype)")
    else:
        # options.plot_stat == "cumulative", validated above
        E.info("Generating plot of #risk alleles vs. cumulative frequency")
        hist_df = gwas.plotRiskFrequency(bins=bins,
                                         frequencies=max_cum,
                                         savepath=options.plot_path,
                                         ytitle="Cumulative frequency cases")

    hist_df["freq"] = risk_results["freqs"][:max_indx + 1]
    hist_df["cumulative"] = risk_results["cumulative"][:max_indx + 1]
    hist_df["cases"] = risk_results["cases"][:max_indx + 1]
    hist_df["controls"] = risk_results["controls"][:max_indx + 1]
    hist_df["total"] = hist_df["cases"] + hist_df["controls"]
    hist_df.to_csv(options.stdout, sep="\t", index=None)

    # write footer and output benchmark information.
    E.Stop()

コード例 #12

ファイルを表示

ファイル: snpPriority.py プロジェクト: MikeDMorgan/gwas_pipeline

def main(argv=None):
    """Score and prioritise SNPs from association results.

    Parses command line options in sys.argv, unless *argv* is given.
    The last command line argument is the input file (or, for the
    ``summarise`` method, a comma-separated list of files).  The method
    selected with ``--score-method`` is dispatched to the matching
    function in ``gwas`` and the resulting table is written
    tab-separated to ``options.stdout``.

    Raises
    ------
    ValueError
        if ``--score-method`` is missing or names a method that has no
        implementation in this script (e.g. ``calc_prior``).
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--score-method",
                      dest="method",
                      type="choice",
                      choices=[
                          "PICS", "LDscore", "ABF", "R2_rank", "get_eigen",
                          "calc_prior", "credible_set", "summarise"
                      ],
                      help="SNP scoring/prioritisation method to apply.")

    parser.add_option("--database",
                      dest="database",
                      type="string",
                      help="SQL database containing LD information "
                      "in table format. Expects columns SNP_A, "
                      "SNP_B, R2, BP_A and BP_B (Plink --r2 output)")

    parser.add_option("--ld-directory",
                      dest="ld_dir",
                      type="string",
                      help="directory containing tabix-index BGZIP "
                      "LD files.  Assumes Plink used to calculate LD")

    parser.add_option("--table-name",
                      dest="table",
                      type="string",
                      help="name of the SQL table containing the LD "
                      "values")

    parser.add_option("--chromosome",
                      dest="chromosome",
                      type="string",
                      help="chromosome to subset the association results "
                      "file on")

    parser.add_option("--ld-threshold",
                      dest="ld_threshold",
                      type="float",
                      help="the threshold of LD above which variants will "
                      "be taken forward.")

    parser.add_option("--rank-threshold",
                      dest="rank_threshold",
                      type="float",
                      help="the threshold in terms of the top n% SNPs to "
                      "output based on the ranking metric. e.g. "
                      "--rank-threshold=0.01 is the top 1% SNPs")

    parser.add_option("--credible-interval",
                      dest="interval",
                      type="float",
                      help="The credible set interval size to generate the "
                      "credible set of SNPs")

    parser.add_option("--prior-variance",
                      dest="prior_var",
                      type="float",
                      help="the prior variance used to weight the SNP "
                      "variance")

    parser.add_option("--fine-map-window",
                      dest="map_window",
                      type="int",
                      help="the region size to included around the index "
                      "SNP as the fine-mapping region.")

    parser.add_option("--eigen-score-directory",
                      dest="eigen_dir",
                      type="string",
                      help="PATH to directory containing tabix indexed "
                      "eigen score files")

    parser.add_option("--flat-prior",
                      dest="flat_prior",
                      action="store_true",
                      help="Ignore functional annotation information and "
                      "use an uninformative prior on each SNP")

    parser.add_option("--snp-set",
                      dest="snp_set",
                      type="string",
                      help="Pre-defined SNP set as a list of SNP IDs. "
                      "If used to calculate priors contains column of scores.")

    parser.add_option(
        "--distribution",
        dest="dist",
        type="choice",
        choices=["normal", "t", "gamma", "lognormal", "exponential"],
        help="distribution from which to draw prior "
        "probabilities")

    parser.add_option("--distribution-parameters",
                      dest="dist_params",
                      type="string",
                      help="distribution parameters as a comma-separated list")

    parser.add_option("--lead-snp-id",
                      dest="lead_snp",
                      type="int",
                      help="0-based item number in filename")

    parser.add_option("--filename-separator",
                      dest="separator",
                      type="string",
                      help="filename separator to extract information")

    parser.add_option("--snp-column",
                      dest="snp_col",
                      type="int",
                      help="0-based index of SNP ID column number")

    parser.add_option("--probability-column",
                      dest="prob_col",
                      type="int",
                      help="0-based index of posterior probabilities column"
                      " number")

    parser.set_defaults(
        ld_dir=None,
        dist="normal",
        dist_params=None,
        snp_set=None,
        prior_var=0.04,
        interval=0.99,
        eigen_dir=None,
        map_window=100000,
        ld_threshold=0.5,
        database=None,
        table=None,
        flat_prior=False,
        lead_snp=2,
        separator="_",
        snp_col=0,
        prob_col=1,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]

    # ``clean`` is forwarded to several scoring functions below.  It is
    # given a value on every path so that a comma-separated input list no
    # longer leaves it unbound.
    # NOTE(review): True mirrors the fallback used when no TEST column is
    # found - confirm this is the right default for multi-file input.
    clean = True

    if len(infile.split(",")) == 1:
        # peek at the first rows to see whether a TEST column is present.
        # The separator is a raw "\s+": the former "\s*" can match the
        # empty string, which regex splitting treats as a delimiter
        # between every character on Python >= 3.7
        peek = pd.read_table(infile, nrows=5, sep=r"\s+", header=0)
        try:
            # results with a TEST column (and at least one row) are
            # flagged as not clean; the values themselves are not
            # inspected, only the number of rows
            clean = len(peek["TEST"]) == 0
        except KeyError:
            clean = True

    if options.method == "LDscore":
        snpscores = gwas.snpPriorityScore(gwas_results=infile,
                                          database=options.database,
                                          table_name=options.table,
                                          chromosome=options.chromosome,
                                          ld_dir=options.ld_dir,
                                          clean=clean)
        # take top 1%, all SNPs doesn't achieve anything useful
        ranks = int(len(snpscores.index) * 0.01)
        snpscores = snpscores.iloc[:ranks]

    elif options.method == "PICS":
        snp_list = {}
        if options.snp_set and not options.flat_prior:
            # first column is the SNP ID, last column its score;
            # unparsable scores (e.g. a header line) count as 0
            with IOTools.openFile(options.snp_set, "r") as sfile:
                for line in sfile:
                    fields = line.split("\t")
                    snp = fields[0]
                    try:
                        score = float(fields[-1].rstrip("\n"))
                    except ValueError:
                        score = 0
                    snp_list[snp] = float(score)

            # get the parameter estimates for the distribution
            # if they have not been provided
            if not options.dist_params:
                dist_params = gwas.estimateDistributionParameters(
                    data=snp_list.values(), distribution=options.dist)
            else:
                dist_params = tuple(
                    [float(fx) for fx in options.dist_params.split(",")])

            E.info("Calculating priors on SNPs")
            priors = gwas.calcPriorsOnSnps(snp_list=snp_list,
                                           distribution=options.dist,
                                           params=dist_params)

        elif options.snp_set and options.flat_prior:
            # uninformative prior: every SNP in the set gets weight 1.0
            with IOTools.openFile(options.snp_set, "r") as sfile:
                for line in sfile:
                    snp = line.split("\t")[0]
                    snp_list[snp] = 1.0

            priors = snp_list

        else:
            # allow for no priors or scores to be set,
            # use of priors will be ignored,
            # i.e. when prior and likelihood are not from
            # conjugate distributions
            priors = None

        # PICS scores expects the gwas results file to
        # only contain the region of interest, which
        # represents an independent association signal
        snpscores = gwas.PICSscore(gwas_results=infile,
                                   database=options.database,
                                   table_name=options.table,
                                   chromosome=options.chromosome,
                                   priors=priors,
                                   clean=clean,
                                   ld_dir=options.ld_dir,
                                   ld_threshold=options.ld_threshold)

        # NOTE(review): the return type of gwas.PICSscore is not visible
        # here; the post-processing below is kept exactly as it was.
        snpscores.columns = ["SNP", "PICS"]
        posterior_sum = 0
        snpscores.sort_values(ascending=False, inplace=True)
        # keep the highest scoring SNPs until their running total
        # reaches 99.0
        post_snps = []
        for snp in snpscores.index:
            if posterior_sum < 99.0:
                posterior_sum += snpscores.loc[snp]
                post_snps.append(snp)
            else:
                break

        snpscores = snpscores.loc[post_snps]

        snpscores.drop_duplicates(inplace=True)

    elif options.method == "R2_rank":
        # rank SNPs based on their LD with the lead
        # SNP, take the top n% SNPs
        snpscores = gwas.LdRank(gwas_results=infile,
                                database=options.database,
                                table_name=options.table,
                                ld_dir=options.ld_dir,
                                chromosome=options.chromosome,
                                ld_threshold=options.ld_threshold,
                                top_snps=options.rank_threshold,
                                clean=clean)

    elif options.method == "ABF":
        snpscores = gwas.ABFScore(gwas_results=infile,
                                  region_size=options.map_window,
                                  chromosome=options.chromosome,
                                  prior_variance=options.prior_var,
                                  clean=clean)
    elif options.method == "get_eigen":
        E.info("Fetching Eigen scores")
        snpscores = gwas.getEigenScores(eigen_dir=options.eigen_dir,
                                        bim_file=infile,
                                        snp_file=options.snp_set)
        snpscores = pd.DataFrame(snpscores).T

    elif options.method == "credible_set":
        E.info("Creating credible set")

        snpscores = gwas.makeCredibleSet(probs_file=infile,
                                         credible_set=options.interval,
                                         lead_snp_indx=options.lead_snp,
                                         filename_sep=options.separator,
                                         snp_column=options.snp_col,
                                         probs_column=options.prob_col)

    elif options.method == "summarise":
        E.info("Collating SNP prioritisation results")
        file_list = infile.split(",")
        snpscores = gwas.summariseResults(file_list=file_list)

    else:
        # covers a missing --score-method as well as "calc_prior", which
        # is an accepted choice but has no branch above; previously this
        # fell through to a NameError on ``snpscores``
        raise ValueError("no implementation for --score-method=%s" %
                         options.method)

    snpscores.to_csv(options.stdout, index_label="SNP", sep="\t")

    # write footer and output benchmark information.
    E.Stop()