Example #1
0
    def test_gwas_from_source(self):
        """Load the scz2 GWAS fixture, unfiltered and filtered by SNP, and check the parsed columns."""
        #full format, OR+SE (which is like beta+se)
        gwas_format = {
            "column_snp":"SNPID",
            "column_non_effect_allele":"A2",
            "column_effect_allele":"A1",
            "column_or":"OR",
            "column_se":"SE",
            "column_chromosome":"HG19CHRC",
            "column_position":"BP"
        }
        path = "tests/_td/GWAS/scz2/scz2.gwas.results.txt.gz"

        # First pass: whole file, no SNP filter.
        source = GWASUtilities.gwas_filtered_source(path)
        gwas = GWAS.load_gwas(source, gwas_format)
        assert_gwas_zscore_fbse(self, gwas)

        # Second pass: restrict the source to three SNPs and compare every column.
        source = GWASUtilities.gwas_filtered_source(path, snps={"rs940550", "rs6650104", "rs61770173"}, snp_column_name="SNPID")
        gwas = GWAS.load_gwas(source, gwas_format)

        # The builtin `str` is used as dtype: `numpy.str` was only a deprecated
        # alias for it and no longer exists in current NumPy releases.
        numpy.testing.assert_array_equal(gwas[SNP], pandas.Series(["rs940550", "rs6650104", "rs61770173", ], dtype=str))
        numpy.testing.assert_array_equal(gwas[EFFECT_ALLELE], pandas.Series(["C", "T",  "A"], dtype=str))
        numpy.testing.assert_array_equal(gwas[NON_EFFECT_ALLELE], pandas.Series(["G", "C", "C"], dtype=str))
        numpy.testing.assert_array_equal(gwas[CHROMOSOME], pandas.Series(["chr1", "chr1",  "chr22"], dtype=str))
        numpy.testing.assert_allclose(gwas[ZSCORE], pandas.Series([-1.254557, 0.974874, -0.232505],dtype=numpy.float32), rtol=0.001)
        numpy.testing.assert_allclose(gwas[BETA], pandas.Series([-0.0217038334437866, 0.0193025022544974, -0.00369682484428976], dtype=numpy.float32), rtol=0.001)
        numpy.testing.assert_allclose(gwas[SE], pandas.Series([0.0173, 0.0198,  0.0159], dtype=numpy.float32), rtol=0.001)
    def test_gwas_from_data(self):
        """Build GWAS objects from three sample data sets and check each one."""
        first = GWASUtilities.gwas_from_data(SampleData.sample_gwas_data_1())
        assert_gwas_1(self, first)

        second = GWASUtilities.gwas_from_data(SampleData.sample_gwas_data_2())
        assert_gwas_2(self, second)

        # Third data set is loaded with two extra columns requested.
        # NOTE(review): the tuples look like (column name, column index) -- confirm.
        sample = SampleData.sample_gwas_data_1_e()
        extras = [("number", 6), ("character", 7)]
        extended = GWASUtilities.gwas_from_data(sample, extra_columns=extras)
        assert_gwas_1_e(self, extended)
Example #3
0
    def test_gwas_from_source(self):
        """Load the scz2 GWAS fixture with and without a SNP filter and verify the result."""
        #full format, OR+SE (which is like beta+se)
        gwas_format = {
            "column_snp": "SNPID",
            "column_non_effect_allele": "A2",
            "column_effect_allele": "A1",
            "column_or": "OR",
            "column_se": "SE",
            "column_chromosome": "HG19CHRC",
            "column_position": "BP"
        }
        gwas_path = "tests/_td/GWAS/scz2/scz2.gwas.results.txt.gz"

        # Unfiltered load of the whole file.
        source = GWASUtilities.gwas_filtered_source(gwas_path)
        gwas = GWAS.load_gwas(source, gwas_format)
        assert_gwas_zscore_fbse(self, gwas)

        # Filtered load: only the three listed SNPs are requested.
        source = GWASUtilities.gwas_filtered_source(
            gwas_path,
            snps={"rs940550", "rs6650104", "rs61770173"},
            snp_column_name="SNPID")
        gwas = GWAS.load_gwas(source, gwas_format)

        # String columns use the builtin `str` dtype; the `numpy.str` alias
        # was deprecated and later removed from NumPy.
        numpy.testing.assert_array_equal(
            gwas[SNP],
            pandas.Series(["rs940550", "rs6650104", "rs61770173"], dtype=str))
        numpy.testing.assert_array_equal(
            gwas[EFFECT_ALLELE],
            pandas.Series(["C", "T", "A"], dtype=str))
        numpy.testing.assert_array_equal(
            gwas[NON_EFFECT_ALLELE],
            pandas.Series(["G", "C", "C"], dtype=str))
        numpy.testing.assert_array_equal(
            gwas[CHROMOSOME],
            pandas.Series(["chr1", "chr1", "chr22"], dtype=str))

        # Numeric columns are compared with a relative tolerance.
        numpy.testing.assert_allclose(
            gwas[ZSCORE],
            pandas.Series([-1.254557, 0.974874, -0.232505],
                          dtype=numpy.float32),
            rtol=0.001)
        numpy.testing.assert_allclose(
            gwas[BETA],
            pandas.Series([
                -0.0217038334437866, 0.0193025022544974, -0.00369682484428976
            ],
                          dtype=numpy.float32),
            rtol=0.001)
        numpy.testing.assert_allclose(
            gwas[SE],
            pandas.Series([0.0173, 0.0198, 0.0159], dtype=numpy.float32),
            rtol=0.001)
    def test_gwas_from_data(self):
        """Check gwas_from_data against sample sets 1, 2 and 1_e (with extra columns)."""
        data_1 = SampleData.sample_gwas_data_1()
        assert_gwas_1(self, GWASUtilities.gwas_from_data(data_1))

        data_2 = SampleData.sample_gwas_data_2()
        assert_gwas_2(self, GWASUtilities.gwas_from_data(data_2))

        # The last sample is loaded with two additional columns requested.
        data_1_e = SampleData.sample_gwas_data_1_e()
        with_extras = GWASUtilities.gwas_from_data(
            data_1_e,
            extra_columns=[("number", 6), ("character", 7)])
        assert_gwas_1_e(self, with_extras)
Example #5
0
def build_betas(args, model, gwas_format, name):
    """Load one GWAS file and reduce it to its SNP, z-score and (if present) beta columns.

    Args:
        args: parsed command line options; this function reads `gwas_folder`,
            `model_db_path`, `snp_column`, `skip_until_header`, `separator`
            and `input_pvalue_fix`.
        model: prediction model, or None. When given, the input is restricted
            to the model's SNPs and the result is aligned to the model's alleles.
        gwas_format: column format description handed to `GWAS.load_gwas`.
        name: file name of the GWAS file inside `args.gwas_folder`.

    Returns:
        The loaded data with missing values replaced by the string "NA",
        keeping only the SNP and z-score columns plus beta when available.
    """
    logging.info("Building beta for %s and %s", name,
                 args.model_db_path if args.model_db_path else "no database")
    load_from = os.path.join(args.gwas_folder, name)

    # Wrap the path in a filtered source only when there is something to
    # filter on (model SNPs) or a header to skip to.
    if model or args.skip_until_header:
        snps = model.snps() if model else None
        snp_column_name = args.snp_column if model else None
        load_from = GWASUtilities.gwas_filtered_source(
            load_from,
            snps=snps,
            snp_column_name=snp_column_name,
            skip_until_header=args.skip_until_header,
            separator=args.separator)

    # Raw string: "\s" is a regex class, not a Python string escape.
    sep = r'\s+' if args.separator is None else args.separator
    b = GWAS.load_gwas(load_from,
                       gwas_format,
                       sep=sep,
                       input_pvalue_fix=args.input_pvalue_fix)

    if model is not None:
        PF = PredictionModel.WDBQF
        base = model.weights[[
            PF.K_RSID, PF.K_EFFECT_ALLELE, PF.K_NON_EFFECT_ALLELE
        ]].drop_duplicates()
        b = align_data_to_alleles(b, base, Constants.SNP, PF.K_RSID)

    b = b.fillna("NA")
    keep = [GWAS.SNP, GWAS.ZSCORE]
    if GWAS.BETA in b:
        keep.append(GWAS.BETA)
    b = b[keep]
    return b
def readGWAS(args):
    """Read every matching GWAS file in `args.gwas_folder` into one data frame.

    Args:
        args: parsed command line options; `gwas_folder` and
            `gwas_file_pattern` select the input files, the rest is passed on
            to the format helpers and to `build_betas`.

    Returns:
        pandas.DataFrame with the `build_betas` result of each file,
        concatenated in sorted file-name order.

    Raises:
        Exceptions.ReportableException: if no file in the folder matches.
    """
    start = timer()
    validate(args)
    regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
    names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
    names.sort() #cosmetic, because different filesystems/OS yield folders in different order

    if not names:
        msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
        raise Exceptions.ReportableException(msg)

    # Function-call form of print: valid on Python 3 and prints the same
    # text on Python 2 (single parenthesized argument).
    print("INFO: Reading GWAS data")
    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    # Model loading is disabled here; build_betas always receives None.
    #model = PredictionModel.load_model(args.model_db_path) if args.model_db_path else None
    model = None
    # dataframe
    r = pandas.DataFrame()
    for name in names:
        b = build_betas(args, model, gwas_format, name)
        r = pandas.concat([r,b])
    end = timer()

    elapsed = str(end - start)
    logging.info("Successfully parsed input gwas in %s seconds", elapsed)
    print("Successfully parsed input gwas in %s seconds" % elapsed)
    return r
Example #7
0
def run(args):
    """Run GWAS input processing for the file(s) selected by `args`.

    Inputs are either every file in `args.gwas_folder` that matches
    `args.gwas_file_pattern`, or the single `args.gwas_file`.

    Args:
        args: parsed command line options.

    Returns:
        When `args.output_folder` is not set: one pandas.DataFrame holding the
        `build_betas` result of every input, concatenated. When it is set,
        each result is written to that folder as a tab-separated file and
        nothing is returned.

    Raises:
        Exceptions.ReportableException: if `args.gwas_folder` is given but no
            file in it matches the pattern.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(
            args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder,
                                                       regexp)
        #cosmetic, because different filesystems/OS yield folders in different order
        names.sort()

        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (
                args.gwas_folder,
                args.gwas_file_pattern,
            )
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(
        args.model_db_path,
        args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if ".gz" not in output_path:
                output_path += ".gz"
            # Existing outputs are never overwritten; they are skipped.
            if os.path.exists(output_path):
                logging.info(
                    "%s already exists, delete it if you want it to be done again",
                    output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # The path may have just gained a ".gz" suffix above, so the
            # final path is checked too: otherwise a plain-text input would be
            # written uncompressed into a file named "*.gz".
            gzipped = ".gz" in name or output_path.endswith(".gz")
            c = "gzip" if gzipped else None
            b.to_csv(output_path, sep="\t", index=False, compression=c)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds" %
                     (str(end - start)))
    else:
        r = pandas.DataFrame()
        for name in names:
            b = build_betas(args, model, gwas_format, name)
            r = pandas.concat([r, b])
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds" %
                     (str(end - start)))

        return r
Example #8
0
def run(args):
    """Process the selected GWAS input file(s) and either save or return the results.

    The inputs are the files in `args.gwas_folder` matching
    `args.gwas_file_pattern` when a folder is given, otherwise the single
    file `args.gwas_file`.

    Args:
        args: parsed command line options.

    Returns:
        A pandas.DataFrame concatenating the `build_betas` result of every
        input when `args.output_folder` is not set. With an output folder,
        results are written there as tab-separated files and the function
        returns nothing.

    Raises:
        Exceptions.ReportableException: if a folder is given and no file in
            it matches the pattern.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        names.sort() #cosmetic, because different filesystems/OS yield folders in different order

        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(args.model_db_path, args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            if ".gz" not in output_path:
                output_path += ".gz"
            if os.path.exists(output_path):
                # Finished outputs are left alone rather than recomputed.
                logging.info("%s already exists, delete it if you want it to be done again", output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # Compress whenever the file being written ends in ".gz": the
            # suffix may have been appended above even though `name` lacks
            # it, and such a file must not be written as plain text.
            c = "gzip" if (".gz" in name or output_path.endswith(".gz")) else None
            b.to_csv(output_path, sep="\t", index=False, compression=c)
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds" %(str(end - start)))
    else:
        r = pandas.DataFrame()
        for name in names:
            b = build_betas(args, model, gwas_format, name)
            r = pandas.concat([r,b])
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds"%(str(end-start)))

        return r
Example #9
0
                        help="weight dbs to be used")

    #GWAS betas
    parser.add_argument(
        "--gwas_folder",
        help=
        "name of folder containing GWAS data. All files in the folder are assumed to belong to a single study.",
        default="data/GWAS")

    parser.add_argument(
        "--gwas_file_pattern",
        help=
        "Pattern to recognice GWAS files in folders (in case there are extra files and you don't want them selected).",
        default=None)

    GWASUtilities.add_gwas_arguments_to_parser(parser)

    parser.add_argument(
        "--separator",
        help=
        "Character or string separating fields in input file. Defaults to any whitespace.",
        default=None)
    # Added to support GWAS utilities
    parser.add_argument(
        "--skip_until_header",
        help=
        "Some files may be malformed and contain unespecified bytes in the beggining."
        " Specify this option (string value) to identify a header up to which file contents should be skipped.",
        default=None)

    # ZScore calculation
Example #10
0
 def test_extract(self):
     """Extract rs3, rs6 and rs7 from sample data set 3 and check the result."""
     full_gwas = GWASUtilities.gwas_from_data(SampleData.sample_gwas_data_3())
     wanted_snps = ["rs3", "rs6", "rs7"]
     extracted = GWAS.extract(full_gwas, wanted_snps)
     assert_gwas_extracted_from_data_3(self, extracted)
Example #11
0
same organization. In these cases, users should pay careful attention to the
arguments --covariance_directory and --covariance_suffix. """)

    parser.add_argument("--model_db_snp_key", help="Specify a key to use as snp_id")

    parser.add_argument("-v", "--version", help="Report the version", action="store_true", default=False)
#weight db model
    parser.add_argument('weight_dbs', metavar='DB', type=argparse.FileType('r'), nargs='+', help="weight dbs to be used")

#GWAS betas
    parser.add_argument("--gwas_file", help="Load a single GWAS file. (Alternative to providing a gwas_folder and gwas_file_pattern)")

    parser.add_argument("--gwas_folder", help="name of folder containing GWAS data. All files in the folder are assumed to belong to a single study.")
    parser.add_argument("--gwas_file_pattern", help="Pattern to recognize GWAS files in folders (in case there are extra files and you don't want them selected).")

    GWASUtilities.add_gwas_arguments_to_parser(parser)

# ZScore calculation
    parser.add_argument("--stream_covariance", help="Option to better handle large covariances, slower but less memory consuming", action="store_true")
    parser.add_argument("--single_snp_model", action="store_true", help="Models are comprised of a single snp per gene", default=False)
    parser.add_argument("--covariance_directory", help="directory where covariance files can be found (or SAME if covariance sits beside the .db file", default="SAME")
    parser.add_argument("--covariance_suffix", help="Suffix associated with the covariance files. covext-dbext (where ..dbext is the portion of the db file to be replaced by the coviarance extension. )", default=".txt.gz.._0.5.db")
    parser.add_argument("--output_directory", help="name of output file", default="results")
    parser.add_argument("--output_file_prefix", help="name of output file", default="results")
    parser.add_argument("--additional_output", help="If set, will output additional information.", action="store_true", default=False)
    parser.add_argument("--remove_ens_version", help="If set, will keep the -version- postfix in gene id.", action="store_true", default=False)
    parser.add_argument("--overwrite", help="If set, will overwrite the results file if it exists.", action="store_true", default=False)
    parser.add_argument("--verbosity", help="Log verbosity level. 1 is everything being logged. 10 is only high level messages, above 10 will hardly log anything", default = "10")
    parser.add_argument("--throw", action="store_true", help="Throw exception on error", default=False)
    parser.add_argument("--MAX_R", help="Run only for the first R genes", type=int, default=None)
Example #12
0
 def test_extract(self):
     """GWAS.extract on sample data set 3 with three SNP ids gives the expected subset."""
     sample = SampleData.sample_gwas_data_3()
     source_gwas = GWASUtilities.gwas_from_data(sample)
     # Only these three ids are requested from the loaded data.
     subset = GWAS.extract(source_gwas, ["rs3", "rs6", "rs7"])
     assert_gwas_extracted_from_data_3(self, subset)