def test_gwas_from_source(self):
    """Load the scz2 GWAS fixture through a filtered source and check it.

    The fixture is first loaded unfiltered and handed to
    ``assert_gwas_zscore_fbse``; it is then reloaded restricted to three
    SNPs, and every parsed column is compared against literal values.
    """
    # full format, OR+SE (which is like beta+se)
    gwas_format = {
        "column_snp": "SNPID",
        "column_non_effect_allele": "A2",
        "column_effect_allele": "A1",
        "column_or": "OR",
        "column_se": "SE",
        "column_chromosome": "HG19CHRC",
        "column_position": "BP",
    }
    fixture_path = "tests/_td/GWAS/scz2/scz2.gwas.results.txt.gz"

    # Unfiltered load.
    source = GWASUtilities.gwas_filtered_source(fixture_path)
    gwas = GWAS.load_gwas(source, gwas_format)
    assert_gwas_zscore_fbse(self, gwas)

    # Load restricted to a whitelist of SNP ids.
    source = GWASUtilities.gwas_filtered_source(
        fixture_path,
        snps={"rs940550", "rs6650104", "rs61770173"},
        snp_column_name="SNPID")
    gwas = GWAS.load_gwas(source, gwas_format)

    # ``numpy.str`` was only an alias of the builtin ``str`` and no longer
    # exists in NumPy >= 1.24, so the builtin is used for the string dtype.
    numpy.testing.assert_array_equal(
        gwas[SNP],
        pandas.Series(["rs940550", "rs6650104", "rs61770173"], dtype=str))
    numpy.testing.assert_array_equal(
        gwas[EFFECT_ALLELE], pandas.Series(["C", "T", "A"], dtype=str))
    numpy.testing.assert_array_equal(
        gwas[NON_EFFECT_ALLELE], pandas.Series(["G", "C", "C"], dtype=str))
    numpy.testing.assert_array_equal(
        gwas[CHROMOSOME], pandas.Series(["chr1", "chr1", "chr22"], dtype=str))

    # Numeric columns are compared with a relative tolerance.
    numpy.testing.assert_allclose(
        gwas[ZSCORE],
        pandas.Series([-1.254557, 0.974874, -0.232505], dtype=numpy.float32),
        rtol=0.001)
    numpy.testing.assert_allclose(
        gwas[BETA],
        pandas.Series(
            [-0.0217038334437866, 0.0193025022544974, -0.00369682484428976],
            dtype=numpy.float32),
        rtol=0.001)
    numpy.testing.assert_allclose(
        gwas[SE],
        pandas.Series([0.0173, 0.0198, 0.0159], dtype=numpy.float32),
        rtol=0.001)
def test_gwas_from_data(self):
    """Build GWAS data from the in-memory sample datasets and verify each one."""
    # Sample dataset 1.
    rows = SampleData.sample_gwas_data_1()
    parsed = GWASUtilities.gwas_from_data(rows)
    assert_gwas_1(self, parsed)

    # Sample dataset 2.
    rows = SampleData.sample_gwas_data_2()
    parsed = GWASUtilities.gwas_from_data(rows)
    assert_gwas_2(self, parsed)

    # Sample dataset 1 with two extra columns requested.
    # NOTE(review): the pairs look like (column name, field index) -- confirm
    # against GWASUtilities.gwas_from_data.
    rows = SampleData.sample_gwas_data_1_e()
    extras = [("number", 6), ("character", 7)]
    parsed = GWASUtilities.gwas_from_data(rows, extra_columns=extras)
    assert_gwas_1_e(self, parsed)
def test_gwas_from_source(self):
    """Load the scz2 GWAS fixture through a filtered source and check it.

    The fixture is first loaded unfiltered and handed to
    ``assert_gwas_zscore_fbse``; it is then reloaded restricted to three
    SNPs, and every parsed column is compared against literal values.
    """
    # full format, OR+SE (which is like beta+se)
    gwas_format = {
        "column_snp": "SNPID",
        "column_non_effect_allele": "A2",
        "column_effect_allele": "A1",
        "column_or": "OR",
        "column_se": "SE",
        "column_chromosome": "HG19CHRC",
        "column_position": "BP",
    }
    fixture_path = "tests/_td/GWAS/scz2/scz2.gwas.results.txt.gz"

    # Unfiltered load.
    source = GWASUtilities.gwas_filtered_source(fixture_path)
    gwas = GWAS.load_gwas(source, gwas_format)
    assert_gwas_zscore_fbse(self, gwas)

    # Load restricted to a whitelist of SNP ids.
    source = GWASUtilities.gwas_filtered_source(
        fixture_path,
        snps={"rs940550", "rs6650104", "rs61770173"},
        snp_column_name="SNPID")
    gwas = GWAS.load_gwas(source, gwas_format)

    # ``numpy.str`` was only an alias of the builtin ``str`` and no longer
    # exists in NumPy >= 1.24, so the builtin is used for the string dtype.
    numpy.testing.assert_array_equal(
        gwas[SNP],
        pandas.Series(["rs940550", "rs6650104", "rs61770173"], dtype=str))
    numpy.testing.assert_array_equal(
        gwas[EFFECT_ALLELE], pandas.Series(["C", "T", "A"], dtype=str))
    numpy.testing.assert_array_equal(
        gwas[NON_EFFECT_ALLELE], pandas.Series(["G", "C", "C"], dtype=str))
    numpy.testing.assert_array_equal(
        gwas[CHROMOSOME], pandas.Series(["chr1", "chr1", "chr22"], dtype=str))

    # Numeric columns are compared with a relative tolerance.
    numpy.testing.assert_allclose(
        gwas[ZSCORE],
        pandas.Series([-1.254557, 0.974874, -0.232505], dtype=numpy.float32),
        rtol=0.001)
    numpy.testing.assert_allclose(
        gwas[BETA],
        pandas.Series(
            [-0.0217038334437866, 0.0193025022544974, -0.00369682484428976],
            dtype=numpy.float32),
        rtol=0.001)
    numpy.testing.assert_allclose(
        gwas[SE],
        pandas.Series([0.0173, 0.0198, 0.0159], dtype=numpy.float32),
        rtol=0.001)
def test_gwas_from_data(self):
    """Build GWAS data from the in-memory sample datasets and verify each one."""
    # Sample dataset 1.
    rows = SampleData.sample_gwas_data_1()
    parsed = GWASUtilities.gwas_from_data(rows)
    assert_gwas_1(self, parsed)

    # Sample dataset 2.
    rows = SampleData.sample_gwas_data_2()
    parsed = GWASUtilities.gwas_from_data(rows)
    assert_gwas_2(self, parsed)

    # Sample dataset 1 with two extra columns requested.
    # NOTE(review): the pairs look like (column name, field index) -- confirm
    # against GWASUtilities.gwas_from_data.
    rows = SampleData.sample_gwas_data_1_e()
    extras = [("number", 6), ("character", 7)]
    parsed = GWASUtilities.gwas_from_data(rows, extra_columns=extras)
    assert_gwas_1_e(self, parsed)
def build_betas(args, model, gwas_format, name):
    """Load one GWAS file and reduce it to the SNP / z-score (/ beta) columns.

    Args:
        args: parsed command-line options. Reads ``model_db_path``,
            ``gwas_folder``, ``skip_until_header``, ``separator``,
            ``snp_column`` and ``input_pvalue_fix``.
        model: prediction model, or ``None``. When given, the input is
            restricted to ``model.snps()`` and the loaded data is aligned to
            the alleles in ``model.weights``.
        gwas_format: column-mapping dict forwarded to ``GWAS.load_gwas``.
        name: GWAS file name; joined onto ``args.gwas_folder`` when a folder
            is configured, otherwise used as the path itself.

    Returns:
        The loaded table (presumably a pandas DataFrame) with missing values
        replaced by the string ``"NA"``, restricted to ``GWAS.SNP``,
        ``GWAS.ZSCORE`` and, when present, ``GWAS.BETA``.
    """
    logging.info("Building beta for %s and %s", name,
                 args.model_db_path if args.model_db_path else "no database")

    # A single file may be supplied without a folder; os.path.join would
    # raise on a None folder, so fall back to the bare name in that case.
    load_from = os.path.join(args.gwas_folder, name) if args.gwas_folder else name

    if model or args.skip_until_header:
        # Wrap the path in a filtering source: keep only the model's SNPs
        # and/or skip leading content up to the given header.
        snps = model.snps() if model else None
        snp_column_name = args.snp_column if model else None
        load_from = GWASUtilities.gwas_filtered_source(
            load_from,
            snps=snps,
            snp_column_name=snp_column_name,
            skip_until_header=args.skip_until_header,
            separator=args.separator)

    # Raw string: '\s' is not a valid escape in a normal string literal.
    sep = r'\s+' if args.separator is None else args.separator

    b = GWAS.load_gwas(load_from, gwas_format, sep=sep,
                       input_pvalue_fix=args.input_pvalue_fix)

    if model is not None:
        PF = PredictionModel.WDBQF
        base = model.weights[[
            PF.K_RSID, PF.K_EFFECT_ALLELE, PF.K_NON_EFFECT_ALLELE
        ]].drop_duplicates()
        b = align_data_to_alleles(b, base, Constants.SNP, PF.K_RSID)

    b = b.fillna("NA")

    keep = [GWAS.SNP, GWAS.ZSCORE]
    if GWAS.BETA in b:
        keep.append(GWAS.BETA)
    b = b[keep]
    return b
def readGWAS(args):
    """Read every matching GWAS file in ``args.gwas_folder`` into one table.

    Args:
        args: parsed command-line options. Reads ``gwas_folder`` and
            ``gwas_file_pattern`` here; the rest is consumed by the helpers
            it is passed to.

    Returns:
        A pandas DataFrame: the concatenation of ``build_betas`` output for
        each file, in sorted file-name order.

    Raises:
        Exceptions.ReportableException: if no file in the folder matches.
    """
    start = timer()
    validate(args)

    regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
    names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
    names.sort()  # cosmetic, because different filesystems/OS yield folders in different order

    if len(names) == 0:
        msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
        raise Exceptions.ReportableException(msg)

    # Function-call form prints the same text on Python 2 and is valid on
    # Python 3, unlike the print statement.
    print("INFO: Reading GWAS data")

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)

    # No prediction model is loaded in this entry point; build_betas receives
    # None for it.
    model = None

    # Collect the per-file frames and concatenate once instead of re-copying
    # the accumulated frame on every iteration.
    frames = [build_betas(args, model, gwas_format, name) for name in names]
    r = pandas.concat(frames) if frames else pandas.DataFrame()

    end = timer()
    logging.info("Successfully parsed input gwas in %s seconds", str(end - start))
    print("Successfully parsed input gwas in %s seconds" % (str(end - start)))
    return r
def run(args):
    """Process GWAS input files, writing them out or returning them.

    Input files come from ``args.gwas_folder`` (optionally filtered by
    ``args.gwas_file_pattern``) or, when no folder is given, from the single
    ``args.gwas_file``.

    Args:
        args: parsed command-line options. Reads ``gwas_folder``,
            ``gwas_file_pattern``, ``gwas_file``, ``model_db_path``,
            ``model_db_snp_key`` and ``output_folder`` here.

    Returns:
        ``None`` when ``args.output_folder`` is set (one tab-separated file
        is written per input, and existing outputs are skipped); otherwise a
        pandas DataFrame concatenating the ``build_betas`` output of every
        input.

    Raises:
        Exceptions.ReportableException: if a folder is given but no file in
            it matches.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        names.sort()  # cosmetic, because different filesystems/OS yield folders in different order
        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (
                args.gwas_folder,
                args.gwas_file_pattern,
            )
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)

    model = PredictionModel.load_model(
        args.model_db_path, args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            appended_gz = ".gz" not in output_path
            if appended_gz:
                output_path += ".gz"

            if os.path.exists(output_path):
                logging.info(
                    "%s already exists, delete it if you want it to be done again",
                    output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # Compress whenever the input name carried ".gz" or the suffix
            # was appended above, so a ".gz" output is never plain text.
            compression = "gzip" if (appended_gz or ".gz" in name) else None
            b.to_csv(output_path, sep="\t", index=False, compression=compression)

        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds",
                     str(end - start))
    else:
        # Collect the per-file frames and concatenate once instead of
        # re-copying the accumulated frame on every iteration.
        frames = [build_betas(args, model, gwas_format, name) for name in names]
        r = pandas.concat(frames) if frames else pandas.DataFrame()

        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds",
                     str(end - start))
        return r
def run(args):
    """Process GWAS input files, writing them out or returning them.

    Input files come from ``args.gwas_folder`` (optionally filtered by
    ``args.gwas_file_pattern``) or, when no folder is given, from the single
    ``args.gwas_file``.

    Args:
        args: parsed command-line options. Reads ``gwas_folder``,
            ``gwas_file_pattern``, ``gwas_file``, ``model_db_path``,
            ``model_db_snp_key`` and ``output_folder`` here.

    Returns:
        ``None`` when ``args.output_folder`` is set (one tab-separated file
        is written per input, and existing outputs are skipped); otherwise a
        pandas DataFrame concatenating the ``build_betas`` output of every
        input.

    Raises:
        Exceptions.ReportableException: if a folder is given but no file in
            it matches.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
        names.sort()  # cosmetic, because different filesystems/OS yield folders in different order
        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)

    model = PredictionModel.load_model(
        args.model_db_path, args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            output_path = os.path.join(args.output_folder, name)
            appended_gz = ".gz" not in output_path
            if appended_gz:
                output_path += ".gz"

            if os.path.exists(output_path):
                logging.info("%s already exists, delete it if you want it to be done again", output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # Compress whenever the input name carried ".gz" or the suffix
            # was appended above, so a ".gz" output is never plain text.
            compression = "gzip" if (appended_gz or ".gz" in name) else None
            b.to_csv(output_path, sep="\t", index=False, compression=compression)

        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds",
                     str(end - start))
    else:
        # Collect the per-file frames and concatenate once instead of
        # re-copying the accumulated frame on every iteration.
        frames = [build_betas(args, model, gwas_format, name) for name in names]
        r = pandas.concat(frames) if frames else pandas.DataFrame()

        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds",
                     str(end - start))
        return r
help="weight dbs to be used") #GWAS betas parser.add_argument( "--gwas_folder", help= "name of folder containing GWAS data. All files in the folder are assumed to belong to a single study.", default="data/GWAS") parser.add_argument( "--gwas_file_pattern", help= "Pattern to recognice GWAS files in folders (in case there are extra files and you don't want them selected).", default=None) GWASUtilities.add_gwas_arguments_to_parser(parser) parser.add_argument( "--separator", help= "Character or string separating fields in input file. Defaults to any whitespace.", default=None) # Added to support GWAS utilities parser.add_argument( "--skip_until_header", help= "Some files may be malformed and contain unespecified bytes in the beggining." " Specify this option (string value) to identify a header up to which file contents should be skipped.", default=None) # ZScore calculation
def test_extract(self):
    """Extract three SNPs from sample dataset 3 and verify the result."""
    wanted_snps = ["rs3", "rs6", "rs7"]
    full_gwas = GWASUtilities.gwas_from_data(SampleData.sample_gwas_data_3())
    extracted = GWAS.extract(full_gwas, wanted_snps)
    assert_gwas_extracted_from_data_3(self, extracted)
same organization. In these cases, users should pay careful attention to the arguments --covariance_directory and --covariance_suffix. """) parser.add_argument("--model_db_snp_key", help="Specify a key to use as snp_id") parser.add_argument("-v", "--version", help="Report the version", action="store_true", default=False) #weight db model parser.add_argument('weight_dbs', metavar='DB', type=argparse.FileType('r'), nargs='+', help="weight dbs to be used") #GWAS betas parser.add_argument("--gwas_file", help="Load a single GWAS file. (Alternative to providing a gwas_folder and gwas_file_pattern)") parser.add_argument("--gwas_folder", help="name of folder containing GWAS data. All files in the folder are assumed to belong to a single study.") parser.add_argument("--gwas_file_pattern", help="Pattern to recognize GWAS files in folders (in case there are extra files and you don't want them selected).") GWASUtilities.add_gwas_arguments_to_parser(parser) # ZScore calculation parser.add_argument("--stream_covariance", help="Option to better handle large covariances, slower but less memory consuming", action="store_true") parser.add_argument("--single_snp_model", action="store_true", help="Models are comprised of a single snp per gene", default=False) parser.add_argument("--covariance_directory", help="directory where covariance files can be found (or SAME if covariance sits beside the .db file", default="SAME") parser.add_argument("--covariance_suffix", help="Suffix associated with the covariance files. covext-dbext (where ..dbext is the portion of the db file to be replaced by the coviarance extension. 
)", default=".txt.gz.._0.5.db") parser.add_argument("--output_directory", help="name of output file", default="results") parser.add_argument("--output_file_prefix", help="name of output file", default="results") parser.add_argument("--additional_output", help="If set, will output additional information.", action="store_true", default=False) parser.add_argument("--remove_ens_version", help="If set, will keep the -version- postfix in gene id.", action="store_true", default=False) parser.add_argument("--overwrite", help="If set, will overwrite the results file if it exists.", action="store_true", default=False) parser.add_argument("--verbosity", help="Log verbosity level. 1 is everything being logged. 10 is only high level messages, above 10 will hardly log anything", default = "10") parser.add_argument("--throw", action="store_true", help="Throw exception on error", default=False) parser.add_argument("--MAX_R", help="Run only for the first R genes", type=int, default=None)