def run(args):
    """Filter a genotype's variant metadata by MAF and save it as a dataframe.

    Reads dosages through ModelTraining.dosage_generator, optionally drops
    variants below a MAF threshold, and optionally collapses multi-allelic
    variants to the top entry by frequency.
    """
    if os.path.exists(args.output):
        logging.info("Output exists. Nope")
        return
    # args.filter entries are sequences like ("MAF", value, ...): first token
    # names the filter, the rest are its parameters.
    filters = {x[0]: x[1:] for x in args.filter}
    maf_filter = float(filters["MAF"][0]) if "MAF" in filters else None
    logging.info("Loading GTEX variant map")
    gtex_snp_key = GTExMisc.load_gtex_variant_to_rsid(args.annotation[0])
    logging.info("Processing genotype")
    m = []
    for mean, metadata, ids in ModelTraining.dosage_generator(args.genotype, gtex_snp_key, dosage_conversion=ModelTraining._mean, do_none=True):
        if maf_filter:
            # mean dosage lies in [0, 2]; allele frequency is mean/2 and the
            # minor allele frequency is the smaller of f and 1-f.
            f = mean / 2 if mean < 1 else 1 - mean / 2
            if f < maf_filter:
                continue
        m.append(metadata)
    m = Utilities.to_dataframe(m, [x[1] for x in Genotype.MetadataTFE.order])
    if "TOP_CHR_POS_BY_FREQ" in filters:
        logging.info("Simplifying multi-allelic variants")
        m = Genotype._monoallelic_by_frequency(m)
    logging.info("Saving...")
    Utilities.save_dataframe(m, args.output)
    logging.info("Finished")
def run(args):
    """Count occurrences of the first whitespace-separated column's values.

    Streams the input file (skipping the header row), tallies each key in
    first-seen order, and writes a key/count dataframe to the output file.
    """
    if os.path.exists(args.output_file):
        logging.info("Output %s exists. Nope", args.output_file)
        return
    counts = {}
    first_seen_order = []
    logging.info("Streaming file for groups")
    for line_number, line in Utilities.iterate_file(args.input_file):
        if line_number == 0:
            continue  # header row
        key = line.strip().split()[0]
        if key not in counts:
            first_seen_order.append(key)
            counts[key] = 0
        logging.log(9, "Key: %s", str(key))
        counts[key] += 1
    logging.info("Producing output")
    rows = [(key, counts[key]) for key in first_seen_order]
    frame = pandas.DataFrame(rows, columns=["key", "count"])
    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output_file)
    Utilities.save_dataframe(frame, args.output_file)
    logging.info("Finished.")
def sink(self, cov, ids, region):
    """Serialize a region's covariance matrix to the configured output.

    NOTE(review): this method reads a module-level `args` object for its
    output configuration — confirm it is initialized before any call.
    """
    logging.log(9, "Serializing covariance")
    _region = "{}_{}_{}_{}".format(region.name, region.chr, region.start, region.stop)
    if args.text_output:
        if args.dapg_output:
            raise RuntimeError("Not supported for this option")
        else:
            # flatten to (id1, id2, value) records and delegate to wrapped sink
            cov = matrices._flatten_matrix_data([(_region, ids, cov)])
            self.of.sink(cov)
    elif args.text_output_folder:
        if args.dapg_output:
            # DAP-G layout: a dense tab-separated matrix file plus a
            # companion file listing the variant ids, one per line.
            f = os.path.join(args.text_output_folder, _region) + ".txt.gz"
            with gzip.open(f, "w") as o:
                for i in range(0, cov.shape[0]):
                    l = "\t".join(["{:0.4f}".format(x) for x in cov[i]]) + "\n"
                    o.write(l.encode())
            id = os.path.join(args.text_output_folder, _region) + ".id.txt.gz"
            with gzip.open(id, "w") as o:
                l = "\n".join(ids).encode()
                o.write(l)
        else:
            # long-format (id1, id2, value) table per region
            cov = matrices._flatten_matrix_data_2(ids, cov)
            cov = pandas.DataFrame(cov)[["id1", "id2", "value"]]
            f = os.path.join(args.text_output_folder, _region) + ".txt.gz"
            Utilities.save_dataframe(cov, f)
def process_original_gwas(args, imputed):
    """Merge the observed GWAS with imputed results and save the combination.

    Observed variants that were also imputed are dropped according to
    args.keep_criteria. Returns a one-column dataframe of kept
    panel_variant_id values.
    """
    logging.info("Processing GWAS file %s", args.gwas_file)
    g = pandas.read_table(args.gwas_file)
    g = g.assign(current_build="hg38", imputation_status="original")[COLUMN_ORDER]
    # Remember the palindromic snps are to be excluded from the input GWAS;
    logging.info("Read %d variants", g.shape[0])
    if not args.keep_all_observed:
        if args.keep_criteria == "GTEX_VARIANT_ID":
            # drop observed rows whose panel id already appears among imputed
            g = g.loc[~g.panel_variant_id.isin(imputed.panel_variant_id)]
        elif args.keep_criteria == "CHR_POS":
            # match on chromosome/position keys instead of panel ids
            g = g.assign(k=gwas_k(g))
            imputed = imputed.assign(k=gwas_k(imputed))
            g = g.loc[~g.k.isin({x for x in imputed.k})]
            g.drop("k", axis=1, inplace=True)
            imputed.drop("k", axis=1, inplace=True)
        else:
            raise RuntimeError("Unsupported keep option")
    logging.info("Kept %d variants as observed", g.shape[0])
    g = pandas.concat([g, imputed])[COLUMN_ORDER]
    logging.info("%d variants", g.shape[0])
    logging.info("Filling median")
    g = Genomics.fill_column_to_median(g, "sample_size", numpy.int32)
    logging.info("Sorting by chromosome-position")
    g = Genomics.sort(g)
    logging.info("Saving")
    Utilities.save_dataframe(g, args.output)
    return g[["panel_variant_id"]]
def run(args):
    """Run COLOC colocalization between a GWAS and a streamed eQTL file.

    Streams eQTL data one gene at a time, runs COLOC per gene against the
    GWAS, and saves all results as one dataframe.
    """
    Coloc.initialize(args.coloc_script)
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    start = timer()
    logging.info("Loading gwas")
    gwas = Coloc.read_gwas(args.gwas, args.gwas_sample_size, args.gwas_mode)
    streamer = Coloc.eqtl_streamer(args.eqtl, gwas)
    results = []
    # fixed log-message typo ("Beggining"); matches sibling commands' wording
    logging.info("Beginning process")
    MAX_N = args.MAX_N
    for i, d in enumerate(streamer):
        gene = d.gene_id.values[0]
        logging.log(9, "Processing gene %s", gene)
        eqtl = Coloc.get_eqtl(d, args.eqtl_sample_size, args.eqtl_mode)
        r = Coloc.coloc_on_gwas_eqtl(gene, gwas, eqtl, args.gwas_mode, args.eqtl_mode, args.p1, args.p2, args.p12)
        results.append(r)
        # optional development cap on the number of genes processed
        if MAX_N and i > MAX_N:
            logging.info("Early exit")
            break
    logging.info("Saving")
    results = Coloc.results_to_dataframe(results)
    Utilities.ensure_requisite_folders(args.output)
    Utilities.save_dataframe(results, args.output)
    end = timer()
    logging.info("Finished COLOC in %s seconds" % (str(end - start)))
def run(args):
    """Run DAP over every available gene and save per-gene run statistics."""
    start = timer()
    if os.path.exists(args.output_folder):
        logging.info("Output folder exists. Nope.")
        return
    if os.path.exists(args.intermediate_folder):
        logging.info("Intermediate folder exists. Nope.")
        return
    stats = []
    context = DAPUtilities.context_from_args(args)
    available_genes = context.get_available_genes()
    for i, gene in enumerate(available_genes):
        # optional cap on the number of genes processed
        if args.MAX_M and i == args.MAX_M:
            break
        _start = timer()
        logging.log(8, "Processing %i/%i:%s", i + 1, len(available_genes), gene)
        _stats = RunDAP.run_dap(context, gene)
        _end = timer()
        logging.log(7, "Elapsed: %s", str(_end - _start))
        stats.append(_stats)
    end = timer()
    logging.info("Ran DAP in %s seconds" % (str(end - start)))
    Utilities.ensure_requisite_folders(args.output_folder)
    stats_ = args.stats_name if args.stats_name else "stats.txt"
    stats_path = os.path.join(args.output_folder, stats_)
    stats = RunDAP.data_frame_from_stats(stats).fillna("NA")
    Utilities.save_dataframe(stats, stats_path)
def run(args):
    """Slice a GWAS into genomic regions and save in dapg or gtex_eqtl format.

    Each GWAS row is assigned to the region whose [start, stop) interval
    contains its position on the matching chromosome.
    """
    if os.path.exists(args.output):
        logging.info("%s exists. Nope.", args.output)
        return
    logging.info("Loading regions")
    regions = pandas.read_table(args.region_file).rename(columns={"chr": "chromosome"})
    regions.dropna(inplace=True)
    regions.start = regions.start.astype(int)
    regions.stop = regions.stop.astype(int)
    logging.info("Loading gwas")
    gwas = pandas.read_table(args.gwas_file, usecols=["panel_variant_id", "chromosome", "position", "zscore"])
    gwas.dropna(inplace=True)
    logging.info("Processing")
    sliced = []
    for i, region in enumerate(regions.itertuples()):
        logging.log(8, "Processing region %d", i + 1)
        if numpy.isnan(region.start) or numpy.isnan(region.stop) or \
                (type(region.chromosome) != str and numpy.isnan(region.chromosome)):
            logging.log(8, "skipping incomplete region")
            continue
        # local renamed from `slice` so the builtin is not shadowed
        region_slice = gwas[(gwas.chromosome == region.chromosome) & (gwas.position >= region.start) & (gwas.position < region.stop)]
        region_slice = region_slice.sort_values(by="position")
        if region_slice.shape[0] == 0:
            continue
        region_slice = region_slice.assign(region="region-{}-{}-{}".format(region.chromosome, region.start, region.stop), r=i)
        region_slice = region_slice[["panel_variant_id", "region", "r", "zscore"]]
        sliced.append(region_slice)
    sliced = pandas.concat(sliced).sort_values(by="r")
    if args.output_format == "dapg":
        sliced.region = sliced.r.apply(lambda x: "region{}".format(x))
        sliced = sliced.drop(["r"], axis=1)
        Utilities.save_dataframe(sliced, args.output, header=False)
    elif args.output_format == "gtex_eqtl":
        # emulate GTEx eQTL columns; zscore stands in for slope with unit se
        sliced = sliced.assign(gene_id=sliced.region, variant_id=sliced.panel_variant_id, tss_distance=numpy.nan, ma_samples=numpy.nan, ma_count=numpy.nan, maf=numpy.nan, pval_nominal=numpy.nan, slope=sliced.zscore, slope_se=1)
        sliced = sliced[["gene_id", "variant_id", "tss_distance", "ma_samples", "ma_count", "maf", "pval_nominal", "slope", "slope_se"]]
        Utilities.save_dataframe(sliced, args.output, header=True)
    logging.info("Finished slicing gwas")
def run(args):
    """Split a prediction model into its weights and extra tables on disk."""
    logging.info("Loading models")
    weights, extra = Models.read_model(args.input)
    # Persist each component next to the requested output prefix.
    for frame, suffix in ((weights, "_weights.txt.gz"), (extra, "_extra.txt.gz")):
        Utilities.save_dataframe(frame, args.output_prefix + suffix)
    logging.info("Done")
def save_study(study, selected_snps, simulated_gencode, prefix, _save):
    """Persist a simulated study: the study itself, its snps, and annotation.

    _save is the callable that knows how to write the study object; the snp
    list and the gene annotation are written next to the given prefix.
    """
    Utilities.ensure_requisite_folders(prefix)
    _save(study)
    Utilities.write_iterable_to_file(selected_snps, prefix + ".selected_snps.txt.gz")
    # The gene annotation goes in the prefix's folder under a fixed name.
    folder = os.path.split(prefix)[0]
    annotation_path = os.path.join(folder, "gene_annotation.txt.gz")
    Utilities.save_dataframe(simulated_gencode, annotation_path)
def run(args):
    """Run SuSiE fine-mapping per gene over a streamed eQTL file.

    Writes one dataframe of credible sets (args.cs_output) and one of
    per-variant results (args.var_output). Genes that raise inside SuSiE get
    sentinel rows so the run can continue.
    """
    if os.path.exists(args.cs_output) or os.path.exists(args.var_output):
        logging.info("Output exists. Nope.")
        return
    study, variants_whitelist = get_study(args.parquet_genotype_folder, args.parquet_genotype_pattern, args.parquet_genotype_metadata)
    #_skip = lambda x: x not in variants_whitelist
    columns = ["maf", "pval_nominal", "slope", "slope_se"]
    eqtl_streamer = DataFrameStreamer.data_frame_streamer(args.eqtl, sanitize=True, to_numeric=columns, sentinel_column="gene_id")
    individuals = None if not args.restrict_to_individuals else TextFileTools.load_list(args.restrict_to_individuals)
    genes = None if not args.restrict_to_genes else set(TextFileTools.load_list(args.restrict_to_genes))
    cs_results = []
    var_results = []
    logging.info("Beggining process")
    MAX_N = args.MAX_N
    n = args.sample_size
    for i, d in enumerate(eqtl_streamer):
        # optional cap on number of genes processed
        if MAX_N and i > MAX_N:
            logging.info("Early exit")
            break
        gene = d.gene_id.values[0]
        # whitelist comparison strips the gene id version suffix
        if genes is not None and gene.split('.')[0] not in genes:
            logging.log(9, "Skipping gene: %s", gene)
            continue
        logging.log(9, "Processing gene %i:%s", i + 1, gene)
        # keep only rows with usable slope and slope_se
        d = d.loc[(~d.slope_se.isnull()) & (d.slope != 0) & (~d.slope.isnull())]
        try:
            res_, d_ = _do_susie(d, study, variants_whitelist, n, individuals, args.mode)
            cs, vars = _process_result(res_, d_, gene)
        except Exception as e:
            # record sentinel rows for the failing gene and keep going
            logging.log(9, "Error while doing susie:\n%s", traceback.format_exc())
            cs = _void_cs("susie_error").assign(gene_id=gene, pp_sum=None)
            vars = _void_var().assign(gene_id=[gene], var_id=[None])
        cs_results.append(cs)
        #if vars.shape[1]>0:
        var_results.append(vars)
    if len(cs_results) > 0:
        logging.info("Saving")
        cs_results = pandas.concat(cs_results)[["gene_id", "cs", "cs_avg_r2", "cs_log10bf", "cs_min_r2", "var_id", "pp_sum", "status"]]
        Utilities.ensure_requisite_folders(args.cs_output)
        Utilities.save_dataframe(cs_results, args.cs_output)
    else:
        logging.info('No results')
    if len(var_results) > 0:
        var_results = pandas.concat(var_results)[["gene_id", "var_id", "cs", "variable_prob"]]
        Utilities.ensure_requisite_folders(args.var_output)
        Utilities.save_dataframe(var_results, args.var_output)
    logging.info("Ran susie")
def save_expression(intermediate_folder, gene, d_, features_data_):
    """Write per-dataset expression files for `gene` into its intermediate folder.

    d_: mapping of dataset name -> dataframe with an "individual" column and
    one column per gene. features_data_ must provide "individual" and "id"
    columns used to align expression rows to feature ids.
    """
    y_folder = _y_folder(intermediate_folder, gene)
    # reuse the already-computed path instead of recomputing it
    os.makedirs(y_folder)
    for k, v in d_.items():
        if gene not in v:
            logging.log(8, "%s not present in %s", gene, k)
            continue
        p = os.path.join(y_folder, k) + ".txt"
        # align expression values to feature ids via the individual column
        v = v.merge(features_data_[["individual", "id"]], on="individual")[["id", gene]]
        Utilities.save_dataframe(v, p, header=False)
def run(args):
    """Filter trained model summaries/weights and export a model db plus covariances.

    Keeps models with at least one snp that pass performance filters, writes a
    sqlite db, and streams matching covariance rows into one gzip file.
    """
    logging.info("Loading model summaries")
    extra = _read_2(args.input_prefix, "_summary.txt.gz")
    # keep models with at least one snp
    extra = extra[extra["n.snps.in.model"] > 0]
    if "rho_avg" in extra:
        # significance plus cross-validated correlation filter
        extra = extra[(extra["pred.perf.pval"] < 0.05) & (extra.rho_avg > 0.1)]
    else:
        extra = extra[(extra["pred.perf.pval"] < 0.05)]
        extra = extra.assign(rho_avg=None)
    if not "pred.perf.qval" in extra:
        extra["pred.perf.qval"] = None
    if "nested_cv_converged" in extra:
        extra.nested_cv_converged = extra.nested_cv_converged.astype(numpy.int32)
    logging.info("Loading weights")
    weights = _read_2(args.input_prefix, "_weights.txt.gz")
    # keep only weights of surviving models
    weights = weights[weights.gene.isin(extra.gene)]
    if args.output_prefix:
        logging.info("Saving dbs and covariance")
        db = args.output_prefix + ".db"
        logging.info("Saving db")
        Models.create_model_db(db, extra, weights)
        logging.info("Processing covariances")
        genes = {x for x in extra.gene}
        path_ = os.path.split(args.input_prefix)
        # covariance files share the input prefix's basename
        r = re.compile(path_[1] + "_covariance.txt.gz")
        files = sorted([x for x in os.listdir(path_[0]) if r.search(x)])
        files = [os.path.join(path_[0], x) for x in files]
        cov = args.output_prefix + ".txt.gz"
        with gzip.open(cov, "w") as cov_:
            cov_.write("GENE RSID1 RSID2 VALUE\n".encode())
            for nf, f in enumerate(files):
                logging.log(9, "file %i/%i: %s", nf, len(files), f)
                with gzip.open(f) as f_:
                    f_.readline()  # skip each file's header line
                    for l in f_:
                        gene = l.decode().strip().split()[0]
                        # only keep covariance rows for surviving genes
                        if not gene in genes:
                            continue
                        cov_.write(l)
    if args.output_prefix_text:
        logging.info("Saving text output")
        Utilities.save_dataframe(weights, args.output_prefix_text + "_t_weights.txt")
        Utilities.save_dataframe(extra, args.output_prefix_text + "_t_extra.txt")
    logging.info("Done")
def save_x(intermediate_folder, gene, features_, features_data_):
    """Write a gene's dosage matrix and its companion snp-info file."""
    # Dosage matrix: the individual column dropped, space-separated, no header.
    dosages = features_data_.drop("individual", axis=1)
    Utilities.save_dataframe(dosages, _x_path(intermediate_folder, gene), header=False, sep=" ")
    # Snp info: id/ref/alt under the column names the downstream tool expects.
    info = features_[["id", "allele_0", "allele_1"]]
    info = info.rename(columns={"id": "SNP", "allele_0": "REF.0.", "allele_1": "ALT.1."})
    Utilities.save_dataframe(info, _info_path(intermediate_folder, gene))
def run(args):
    """Run DAP-G per region from summary stats and parquet genotypes."""
    start = timer()
    if os.path.exists(args.output_folder):
        logging.info("Output folder exists. Nope.")
        return
    if os.path.exists(args.intermediate_folder):
        logging.info("Intermediate folder exists. Nope.")
        return
    os.makedirs(args.intermediate_folder)
    os.makedirs(args.output_folder)
    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.parquet_genotype_metadata).to_pandas()
    else:
        # parquet row groups are per chromosome; chromosome is 1-based
        features_metadata = pq.ParquetFile(args.parquet_genotype_metadata).read_row_group(args.chromosome - 1).to_pandas()
    logging.info("Opening features")
    features = pq.ParquetFile(args.parquet_genotype)
    logging.info("Opening summary stats")
    summary_stats = load_summary_stats(args.summary_stats)
    # keep only stats whose variants exist in the genotype metadata
    summary_stats = summary_stats[summary_stats.variant_id.isin(features_metadata.id)]
    regions = summary_stats[["region_id"]].drop_duplicates()
    if args.sub_batches is not None and args.sub_batch is not None:
        regions = PandasHelpers.sub_batch(regions, args.sub_batches, args.sub_batch)
    stats = []
    for i, region in enumerate(regions.itertuples()):
        logging.log(9, "Region %i/%i:%s", i, regions.shape[0], region.region_id)
        _stats = run_dapg(region, features, features_metadata, summary_stats, args.intermediate_folder, args.output_folder, args.options, args.dap_command, not args.keep_intermediate_folder)
        stats.append(_stats)
    stats_path = os.path.join(args.output_folder, "stats.txt")
    stats = RunDAP.data_frame_from_stats(stats).fillna("NA")
    Utilities.save_dataframe(stats, stats_path)
    end = timer()
    logging.info("Ran DAP in %s seconds" % (str(end - start)))
def run(args):
    """Concatenate per-trait/tissue result files into one annotated dataframe.

    Fixes: the original crashed with a TypeError when --trait_map was not
    provided (indexing None); the whitespace separator is now a raw string;
    both read branches reuse the already-parsed header names.
    """
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Acquiring files")
    logic = Utilities.file_logic_2(args.input_folder, args.input_pattern, args.name_subfield, args.input_filter)
    trait_map = None
    if args.trait_map:
        logging.info("Loading file mapping")
        trait_map = get_trait_map(args.trait_map)
    gene_id_map, gene_name_map = None, None
    if args.gene_annotation:
        logging.info("Loading gene annotation")
        gene_id_map, gene_name_map = get_gene_map(args.gene_annotation)
    logging.info("Processing files")
    r = []
    for f in logic.itertuples():
        logging.info("Processing %s", f.file)
        names = get_header_names(args.header_names)
        if args.separator == ",":
            d = pandas.read_csv(f.path, header='infer' if not names else None, names=names)
        elif args.separator is None:
            # whitespace separated; raw string avoids an invalid escape warning
            d = pandas.read_table(f.path, header='infer' if not names else None, names=names, sep=r"\s+")
        else:
            raise RuntimeError("Unsupported separator")
        if args.specific_post_processing == "FAST_ENLOC":
            d = fast_enloc_postprocessing(d, gene_id_map, gene_name_map)
        elif args.specific_post_processing:
            raise RuntimeError("Unsupported postprocessing option")
        # Without a trait map, keep the trait field parsed from the file name.
        trait = trait_map[f.trait] if trait_map else f.trait
        d = d.assign(trait=trait, tissue=f.tissue)
        r.append(d)
    r = pandas.concat(r)
    logging.info("Saving")
    Utilities.save_dataframe(r, args.output)
    logging.info("Finished processing.")
def run(args):
    """Parse, harmonize and save a GWAS file according to command-line options."""
    if os.path.exists(args.output):
        logging.info("output path %s exists. Nope.", args.output)
        return
    start = timer()
    logging.info("Parsing input GWAS")
    d = GWAS.load_gwas(args.gwas_file, args.output_column_map, force_special_handling=args.force_special_handling, skip_until_header=args.skip_until_header, separator=args.separator, handle_empty_columns=args.handle_empty_columns, input_pvalue_fix=args.input_pvalue_fix, enforce_numeric_columns=args.enforce_numeric_columns)
    logging.info("loaded %d variants", d.shape[0])
    d = pre_process_gwas(args, d)
    if args.fill_from_snp_info:
        d = fill_coords(args, d)
    if args.chromosome_format:
        # normalize chromosome labels to the "chr{N}" style
        d = d.assign(chromosome=Genomics.to_int(d.chromosome))
        d = d.assign(chromosome=["chr{}".format(x) for x in d.chromosome])
    if args.liftover:
        d = liftover(args, d)
    if args.snp_reference_metadata:
        d = fill_from_metadata(args, d, extra_col_dict=load_extra_col_key_value_pairs(args.meta_extra_col))
    if args.output_order:
        order = args.output_order
        # ensure every requested column exists, filling missing ones with NaN
        for c in order:
            if not c in d:
                d = d.assign(**{c: numpy.nan})
        d = d[order]
    d = clean_up(d)
    logging.info("Saving...")
    Utilities.save_dataframe(d, args.output, fill_na=True)
    end = timer()
    logging.info("Finished converting GWAS in %s seconds", str(end - start))
def process_imputed(args):
    """Stream imputed GWAS files into args.output, harmonized to COLUMN_ORDER.

    Returns the set of keys (panel ids or "chr_pos" strings) of the written
    variants so the observed-GWAS pass can exclude them.

    Fix: mode/header selection now uses a `written` flag instead of the file
    index — previously, if the FIRST file was empty, later files were
    appended with header=False and the output never received a header.
    """
    r = re.compile(args.pattern)
    files = sorted([x for x in os.listdir(args.folder) if r.search(x)])
    count = 0
    keys = set()
    written = False  # True once a chunk with the header has been written
    for i, file in enumerate(files):
        logging.info("Processing imputed %s", file)
        p = os.path.join(args.folder, file)
        g = pandas.read_table(p)
        if g.shape[0] == 0:
            logging.info("Empty set of results for %s", p)
            continue
        count += g.shape[0]
        #Fast dropping of observed values
        #g = g.merge(observed_ids, on="panel_variant_id", how="left", copy=False, indicator=True)
        #g = g[g._merge == "left_only"]
        g.drop(["n", "n_indep", "most_extreme_z"], axis=1, inplace=True)
        g.rename(columns={"effect_allele_frequency": "frequency", "status": "imputation_status"}, inplace=True)
        # two-sided p-value from the imputed zscore; the remaining statistics
        # are not available for imputed variants
        g = g.assign(pvalue=2 * stats.norm.sf(numpy.abs(g.zscore)), effect_size=numpy.nan, standard_error=numpy.nan, sample_size=numpy.nan, current_build="hg38")
        g = g[COLUMN_ORDER]
        Utilities.save_dataframe(g, args.output, mode="a" if written else "w", header=not written)
        written = True
        if not args.keep_all_observed:
            if args.keep_criteria == "GTEX_VARIANT_ID":
                keys.update(g.panel_variant_id.values)
            elif args.keep_criteria == "CHR_POS":
                chr_pos = g.apply(lambda x: "{}_{}".format(x.chromosome, int(x.position)), axis=1)
                keys.update(chr_pos)
            else:
                raise RuntimeError("Unsupported keep option")
    logging.info("Processed %d imputed variants", count)
    return keys
def run(args):
    """Convert a Gencode annotation into a tabular gene annotation file."""
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    # keys to extract from the gencode attributes field
    if args.output_column_map:
        selected = [x[0] for x in args.output_column_map]
    else:
        selected = [Gencode.GFTF.K_GENE_ID, Gencode.GFTF.K_GENE_NAME, Gencode.GFTF.K_GENE_TYPE]
    logging.info("Loading Gencode")
    gencode = Gencode.load(args.gencode_file, feature_type_whitelist={x for x in args.feature_type_whitelist}, gene_type_white_list={x for x in args.gene_type_whitelist}, transcript_type_whitelist={x for x in args.transcript_type_whitelist}, selected_key_value_pairs=selected)
    #gencode = _reformat(gencode)
    logging.info("Converting format")
    if args.output_column_map:
        gencode = gencode.rename(columns={x[0]: x[1] for x in args.output_column_map})
        # reattach the version suffix to the gene id when both are present
        if "gene_version" in gencode and "gene_id" in gencode:
            gencode["gene_id"] = gencode.gene_id + "." + gencode.gene_version
        keep = ["chromosome", "start_location", "end_location", "feature_type", "strand"] + [x[1] for x in args.output_column_map if x[1] not in {"gene_version"}]
        gencode = gencode[keep]
    else:
        # NOTE(review): this branch iterates args.output_column_map even though
        # it is falsy here — fine for an empty list, raises if it is None;
        # confirm the argparse default.
        gencode = gencode[["chromosome", "start_location", "end_location", "feature_type", "strand"] + [x[1] for x in args.output_column_map]]
    logging.info("Saving")
    Utilities.save_dataframe(gencode, args.output)
    logging.info("Finished")
def run(args):
    """Dispatch imputation by region or by variant, then save the results."""
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return
    start = timer()
    logging.info("Beginning process")
    # pick the processing strategy based on whether a region file was given
    runner = run_by_region if args.by_region_file else run_by_variant
    results = runner(args)
    Utilities.ensure_requisite_folders(args.output)
    Utilities.save_dataframe(results, args.output)
    end = timer()
    logging.info("Finished in %s seconds", str(end - start))
def run(args):
    """Count variants per region from parquet variant metadata and save a table."""
    logging.info("Starting process")
    vf = pq.ParquetFile(args.parquet_genotype_metadata)
    # count_variants carries per-chromosome state across calls, so thread the
    # loaded metadata and the last chromosome through the loop
    metadata = None
    chromosome = None
    rows = []
    for index, line in Utilities.iterate_file(args.regions):
        if index == 0:
            continue  # header row
        fields = line.strip().split()
        count, metadata, chromosome = count_variants(fields[0], fields[1], fields[2], vf, metadata, chromosome, args)
        rows.append((fields[0], fields[1], fields[2], count))
    frame = Utilities.to_dataframe(rows, ["chromosome", "start", "end", "count"])
    Utilities.save_dataframe(frame, args.output)
    logging.info("Finished process")
def run(args):
    """Add a gene_name column to the input table via a gene_id -> name map."""
    # Pick the reader/writer separator from the file extensions.
    reader = pandas.read_csv if ".csv" in args.input else pandas.read_table
    sep = "," if ".csv" in args.output else "\t"
    logging.info("Loading gene table")
    names_by_id = KeyedDataSource.load_data(args.gene_table, "gene_id", "gene_name")
    logging.info("Loading input")
    table = reader(args.input)
    # A gene id missing from the map raises KeyError, as the original loop did.
    table["gene_name"] = [names_by_id[gene] for gene in table.gene]
    logging.info("saving")
    Utilities.save_dataframe(table, args.output, sep=sep)
    logging.info("Done")
def process_original_gwas(args, imputed_keys):
    """Append observed GWAS rows to args.output, excluding imputed variants.

    imputed_keys: keys (panel_variant_id or "chr_pos") of variants already
    written by the imputed pass. Appends without a header, so the imputed
    pass must have created the file first. Returns the kept
    panel_variant_id column.
    """
    logging.info("Processing GWAS file %s", args.gwas_file)
    g = pandas.read_table(args.gwas_file)
    #Remember the palindromic snps are to be excluded from the input GWAS;
    logging.info("Read %d variants", g.shape[0])
    if not args.keep_all_observed:
        if args.keep_criteria == "GTEX_VARIANT_ID":
            g = g.loc[~g.panel_variant_id.isin(imputed_keys)]
        elif args.keep_criteria == "CHR_POS":
            # build a chromosome/position key per row and filter on it
            g["k"] = g.apply(_gwas_k, axis=1)
            g = g.loc[~g.k.isin(imputed_keys)]
            g.drop("k", axis=1, inplace=True)
        else:
            raise RuntimeError("Unsupported keep option")
    logging.info("Kept %d variants as observed", g.shape[0])
    g = g.assign(current_build="hg38", imputation_status="original")[COLUMN_ORDER]
    Utilities.save_dataframe(g, args.output, mode="a", header=False)
    return g[["panel_variant_id"]]
def run(args):
    """Run GEMMA/BSLMM per gene, accumulating whichever outputs were requested."""
    start = timer()
    if os.path.exists(args.intermediate_folder):
        logging.info("Intermediate folder already exists. Nope.")
        return
    else:
        os.makedirs(args.intermediate_folder)
    # at least one output kind must be requested
    if not (args.output_stats or args.output_weights or args.output_covariance or args.output_hyperparameters):
        logging.info("Specify at least one output")
        return
    if args.output_weights:
        Utilities.ensure_requisite_folders(args.output_weights)
    if args.output_stats:
        Utilities.ensure_requisite_folders(args.output_stats)
    if args.output_covariance:
        Utilities.ensure_requisite_folders(args.output_covariance)
    if args.output_hyperparameters:
        Utilities.ensure_requisite_folders(args.output_hyperparameters)
    weights = []
    covariance = []
    stats = []
    hyperparameters = []
    context = GEMMAUtilities.context_from_args(args)
    n = len(context.get_available_genes())
    for i, gene in enumerate(context.get_available_genes()):
        start_ = timer()
        logging.log(8, "Processing %d/%d:%s", i + 1, n, gene)
        weights_, covariance_, hyperparameters_, stats_ = RunGEMMA.run_gemma(context, gene)
        end_ = timer()
        logging.log(7, "Elapsed: %s", str(end_ - start_))
        # only collect what was requested, to keep memory down
        if args.output_weights:
            weights.append(weights_)
        if args.output_covariance:
            covariance.append(covariance_)
        if args.output_hyperparameters:
            hyperparameters.append(hyperparameters_)
        if args.output_stats:
            stats.append(stats_)
    if args.output_weights:
        weights = pandas.concat(weights)
        Utilities.save_dataframe(weights, args.output_weights)
    if args.output_stats:
        stats = RunGEMMA.dataframe_from_stats(stats).fillna("NA")
        Utilities.save_dataframe(stats, args.output_stats)
    if args.output_covariance:
        covariance = RunGEMMA.dataframe_from_covariance_data(covariance).fillna("NA")
        Utilities.save_dataframe(covariance, args.output_covariance)
    if args.output_hyperparameters:
        hyperparameters = RunGEMMA.dataframe_from_hyperparameters(hyperparameters).fillna("NA")
        Utilities.save_dataframe(hyperparameters, args.output_hyperparameters)
    # intermediate artifacts are only needed during the run
    shutil.rmtree(args.intermediate_folder)
    end = timer()
    logging.info("Ran BSLMM in %s seconds" % (str(end - start)))
def run(args):
    """Find duplicated entries in the input folder and write them as csv."""
    duplicates = duplicated_entries(args.input_folder)
    Utilities.ensure_requisite_folders(args.output)
    # QUOTE_NONE: values are written verbatim, with no quoting added.
    Utilities.save_dataframe(duplicates, args.output, sep=",", quoting=csv.QUOTE_NONE)
def run(args):
    """Train per-gene predictive models (elastic net or OLS) and stream results.

    Writes four gzip outputs under args.output_prefix: weights, summary,
    covariance, and (when --run_tag is given) a run-tag/seed file.

    Fix: the covariance and run existence checks previously tested the
    weights path (wp) again — copy-paste bug; they now test cp and r.
    """
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return
    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return
    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):  # was os.path.exists(wp)
        logging.info("covariance output exists already, delete it or move it")
        return
    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):  # was os.path.exists(wp)
        logging.info("run output exists already, delete it or move it")
        return
    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)
    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = {x for x in data.metadata.schema.names}
    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])
    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        # parquet row groups are per chromosome; chromosome is 1-based
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome - 1).to_pandas()
    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)
    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]
    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights, {x for x in features_metadata.id})
        logging.info("Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None
    logging.info("Opening features")
    features = pq.ParquetFile(args.features)
    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        # record the run tag and the seed used, for reproducibility
        d = pandas.DataFrame({"run": [args.run_tag], "cv_seed": [s]})[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)
    WEIGHTS_FIELDS = ["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS = ["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model", "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval", "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]
    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols
    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i, data_annotation_ in enumerate(data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i + 1, data_annotation.shape[0], data_annotation_.gene_id)
                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, j, args.nested_cv_folds)
                    else:
                        process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds)
    logging.info("Finished")
def _dump(p, d, cov):
    """Debug helper: write a dataframe and a raw matrix next to prefix `p`."""
    Utilities.save_dataframe(d, p + "_d.txt.gz")
    import gzip
    with gzip.open(p + "_m.txt.gz", "w") as f:
        for row in cov:
            line = "\t".join(map(str, row)) + "\n"
            f.write(line.encode())
def run(args):
    """Train per-gene models with extensive variant-metadata filtering options.

    Writes four gzip outputs under args.output_prefix: weights, summary,
    covariance, and (when --run_tag is given) a run-tag/seed file.

    Fix: the covariance and run existence checks previously tested the
    weights path (wp) again — copy-paste bug; they now test cp and r.
    """
    wp = args.output_prefix + "_weights.txt.gz"
    if os.path.exists(wp):
        logging.info("Weights output exists already, delete it or move it")
        return
    sp = args.output_prefix + "_summary.txt.gz"
    if os.path.exists(sp):
        logging.info("Summary output exists already, delete it or move it")
        return
    cp = args.output_prefix + "_covariance.txt.gz"
    if os.path.exists(cp):  # was os.path.exists(wp)
        logging.info("covariance output exists already, delete it or move it")
        return
    r = args.output_prefix + "_run.txt.gz"
    if os.path.exists(r):  # was os.path.exists(wp)
        logging.info("run output exists already, delete it or move it")
        return
    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output_prefix)
    logging.info("Opening data")
    data = pq.ParquetFile(args.data)
    available_data = {x for x in data.metadata.schema.names}
    logging.info("Loading data annotation")
    data_annotation = StudyUtilities.load_gene_annotation(args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch, args.simplify_data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.gene_whitelist:
        logging.info("Applying gene whitelist")
        data_annotation = data_annotation[data_annotation.gene_id.isin(set(args.gene_whitelist))]
    logging.info("Kept %i entries", data_annotation.shape[0])
    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        # parquet row groups are per chromosome; chromosome is 1-based
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome-1).to_pandas()
    if args.output_rsids:
        # duplicated rsids would make the emitted models ambiguous
        if not args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
            logging.warning("Several variants map to a same rsid (hint: multiple INDELS?).\n"
                            "Can't proceed. Consider the using the --keep_highest_frequency_rsid flag, or models will be ill defined.")
            return
    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)
        logging.info("Kept %d", features_metadata.shape[0])
    if args.variant_call_filter:
        logging.info("Filtering variants by average call rate")
        features_metadata = features_metadata[features_metadata.avg_call > args.variant_call_filter]
        logging.info("Kept %d", features_metadata.shape[0])
    if args.variant_r2_filter:
        logging.info("Filtering variants by imputation R2")
        features_metadata = features_metadata[features_metadata.r2 > args.variant_r2_filter]
        logging.info("Kept %d", features_metadata.shape[0])
    if args.variant_variance_filter:
        logging.info("Filtering variants by (dosage/2)'s variance")
        features_metadata = features_metadata[features_metadata["std"]/2 > numpy.sqrt(args.variant_variance_filter)]
        logging.info("Kept %d", features_metadata.shape[0])
    if args.discard_palindromic_snps:
        logging.info("Discarding palindromic snps")
        features_metadata = Genomics.discard_gtex_palindromic_variants(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])
    if args.rsid_whitelist:
        logging.info("Filtering features annotation for whitelist")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]
        logging.info("Kept %d", features_metadata.shape[0])
    if args.only_rsids:
        logging.info("discarding non-rsids")
        features_metadata = StudyUtilities.trim_variant_metadata_to_rsids_only(features_metadata)
        logging.info("Kept %d", features_metadata.shape[0])
    if args.keep_highest_frequency_rsid_entry and features_metadata[(features_metadata.rsid != "NA") & features_metadata.rsid.duplicated()].shape[0]:
        logging.info("Keeping only the highest frequency entry for every rsid")
        k = features_metadata[["rsid", "allele_1_frequency", "id"]]
        # fold frequencies above 0.5 so ranking is by minor allele frequency
        k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"] = 1 - k.loc[k.allele_1_frequency > 0.5, "allele_1_frequency"]
        k = k.sort_values(by=["rsid", "allele_1_frequency"], ascending=False)
        k = k.groupby("rsid").first().reset_index()
        features_metadata = features_metadata[features_metadata.id.isin(k.id)]
        logging.info("Kept %d", features_metadata.shape[0])
    else:
        logging.info("rsids are unique, no need to restrict to highest frequency entry")
    if args.features_weights:
        logging.info("Loading weights")
        x_weights = get_weights(args.features_weights, {x for x in features_metadata.id})
        logging.info("Filtering features metadata to those available in weights")
        features_metadata = features_metadata[features_metadata.id.isin(x_weights.id)]
        logging.info("Kept %d entries", features_metadata.shape[0])
    else:
        x_weights = None
    logging.info("Opening features")
    features = pq.ParquetFile(args.features)
    logging.info("Setting R seed")
    s = numpy.random.randint(1e8)
    set_seed(s)
    if args.run_tag:
        # record the run tag and the seed used, for reproducibility
        d = pandas.DataFrame({"run": [args.run_tag], "cv_seed": [s]})[["run", "cv_seed"]]
        Utilities.save_dataframe(d, r)
    WEIGHTS_FIELDS = ["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS = ["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model", "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval", "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]
    train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols
    available_individuals = check_missing(args, data, features)
    with gzip.open(wp, "w") as w:
        w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode())
        with gzip.open(sp, "w") as s:
            s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode())
            with gzip.open(cp, "w") as c:
                c.write("GENE RSID1 RSID2 VALUE\n".encode())
                for i, data_annotation_ in enumerate(data_annotation.itertuples()):
                    if args.MAX_M and i >= args.MAX_M:
                        logging.info("Early abort")
                        break
                    logging.log(9, "processing %i/%i:%s", i+1, data_annotation.shape[0], data_annotation_.gene_id)
                    if args.repeat:
                        for j in range(0, args.repeat):
                            logging.log(9, "%i-th reiteration", j)
                            process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, j, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)
                    else:
                        process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds, use_individuals=available_individuals)
    logging.info("Finished")
def run(args):
    """Build a variant metadata table from imputation info files and annotate it with rsids.

    Reads each gzipped, whitespace-delimited info file in ``args.info_files``
    (one header line each), parses variant coordinates/alleles from the first
    column ("chr:pos:ref:alt" layout — TODO confirm against the file producer),
    looks variants up in a UCSC dbSNP mapping, and saves the resulting table
    to ``args.output`` with columns: chromosome, position, id, allele_0,
    allele_1, allele_1_frequency, rsid.
    """
    Utilities.ensure_requisite_folders(args.output)
    logging.info("Loading db snp file")
    logging.info("processing")
    # Process files in natural chromosome order (chr_key defined elsewhere in this module)
    files = sorted(args.info_files, key=chr_key)
    rows = []
    # "chr_pos" -> list of variant ids at that position; consumed by load_dbsnp_mapping.
    # Built once here (the original rebuilt it redundantly from the dataframe).
    variant_key = {}
    for p in files:
        with gzip.open(p) as f:
            logging.info("%s", p)
            for i, line in enumerate(f):
                if i == 0:
                    continue  # skip header line
                comps = line.decode().strip().split()
                variant_comps = comps[0].split(":")
                chrom = "chr" + variant_comps[0]
                pos = variant_comps[1]
                ref = variant_comps[2]
                alt = variant_comps[3]
                # Copy-number alleles don't fit the SNP id scheme; drop them
                if "CN" in ref or "CN" in alt:
                    continue
                # NOTE(review): assumes column 3 holds the allele-1 frequency — confirm with the info-file spec
                freq = comps[3]
                variant_id = "{}_{}_{}_{}_b37".format(chrom, pos, ref, alt)
                rows.append((chrom, pos, variant_id, ref, alt, freq))
                variant_key.setdefault("{}_{}".format(chrom, pos), []).append(variant_id)
    r = pandas.DataFrame(data=rows, columns=[
        "chromosome", "position", "id", "allele_0", "allele_1", "allele_1_frequency"
    ])
    logging.info("looking for rsids in ucsc dbsnp file")
    dbsnp_mapping = load_dbsnp_mapping(args.dbsnp_file, variant_key)
    # Variants absent from dbSNP get the literal string "NA" as their rsid
    r["rsid"] = [dbsnp_mapping.get(v, "NA") for v in r.id]
    logging.info("Saving")
    Utilities.save_dataframe(r, args.output)
    logging.info("Done")
def run(args):
    """Train multi-tissue CTIMP prediction models, one gene at a time.

    For every gene in the annotation: extract its expression across all tissue
    parquet files and the genotype features in a window around the gene, write
    an intermediate workspace, run the external training script via bash, then
    stream per-tissue weights, summaries, and covariances into the gzip sinks
    created by setup_output(). With --skip_regression only the intermediate
    inputs are prepared.
    """
    Utilities.maybe_create_folder(args.intermediate_folder)
    Utilities.ensure_requisite_folders(args.output_prefix)
    logging.info("Opening data")
    # One parquet file per tissue; the name pattern's first capture group is the tissue name
    p_ = re.compile(args.data_name_pattern)
    f = [x for x in sorted(os.listdir(args.data_folder)) if p_.search(x)]
    tissue_names = [p_.search(x).group(1) for x in f]
    data = []
    for i in range(0, len(tissue_names)):
        logging.info("Loading %s", tissue_names[i])
        data.append((tissue_names[i], pq.ParquetFile(os.path.join(args.data_folder, f[i]))))
    # Keep tissue order deterministic (follows the sorted file listing)
    data = collections.OrderedDict(data)
    # Union of column names across all tissue files = genes with any data at all
    available_data = {x for p in data.values() for x in p.metadata.schema.names}
    logging.info("Preparing output")
    WEIGHTS_FIELDS = [
        "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"
    ]
    SUMMARY_FIELDS = [
        "gene", "genename", "gene_type", "alpha", "n_snps_in_window",
        "n.snps.in.model", "rho_avg", "pred.perf.R2", "pred.perf.pval"
    ]
    Utilities.ensure_requisite_folders(args.output_prefix)
    if args.skip_regression:
        weights, summaries, covariances = None, None, None
    else:
        # Per-tissue dicts of open gzip writers, keyed by tissue name
        weights, summaries, covariances = setup_output(args.output_prefix, tissue_names,
                                                       WEIGHTS_FIELDS, SUMMARY_FIELDS)
    logging.info("Loading data annotation")
    data_annotation = StudyUtilities._load_gene_annotation(
        args.data_annotation)
    # Only genes that actually appear in some tissue's data
    data_annotation = data_annotation[data_annotation.gene_id.isin(
        available_data)]
    if args.chromosome or (args.sub_batches and args.sub_batch):
        data_annotation = StudyUtilities._filter_gene_annotation(
            data_annotation, args.chromosome, args.sub_batches, args.sub_batch)
    logging.info("Kept %i entries", data_annotation.shape[0])
    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        # NOTE(review): assumes row group N-1 of the annotation parquet holds chromosome N — confirm with the writer
        features_metadata = pq.ParquetFile(
            args.features_annotation).read_row_group(args.chromosome -
                                                     1).to_pandas()
    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(
            features_metadata, data_annotation, args.window)
    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(
            whitelist)]
    logging.info("Opening features")
    features = pq.ParquetFile(args.features)
    logging.info("Setting R seed")
    # Random seed handed to the external training script for reproducibility
    seed = numpy.random.randint(1e8)
    if args.run_tag:
        # Record the run tag and seed next to each tissue's outputs
        d = pandas.DataFrame({
            "run": [args.run_tag],
            "cv_seed": [seed]
        })[["run", "cv_seed"]]
        for t in tissue_names:
            Utilities.save_dataframe(
                d, "{}_{}_runs.txt.gz".format(args.output_prefix, t))
    failed_run = False
    try:
        for i, data_annotation_ in enumerate(data_annotation.itertuples()):
            logging.log(9, "processing %i/%i:%s", i + 1,
                        data_annotation.shape[0], data_annotation_.gene_id)
            logging.log(8, "loading data")
            # tissue name -> dataframe with this gene's expression values
            d_ = {}
            for k, v in data.items():
                d_[k] = Parquet._read(v, [data_annotation_.gene_id],
                                      to_pandas=True)
            # Variants within args.window of the gene
            features_ = Genomics.entries_for_gene_annotation(
                data_annotation_, args.window, features_metadata)
            if features_.shape[0] == 0:
                logging.log(9, "No features available")
                continue
            features_data_ = Parquet._read(features,
                                           [x for x in features_.id.values],
                                           to_pandas=True)
            # Sequential integer ids expected by the training script's input layout
            features_data_["id"] = range(1, features_data_.shape[0] + 1)
            features_data_ = features_data_[["individual", "id"] +
                                            [x for x in features_.id.values]]
            logging.log(8, "training")
            # Write the per-gene intermediate workspace consumed by the bash script
            prepare_ctimp(args.script_path, seed, args.intermediate_folder,
                          data_annotation_, features_, features_data_, d_)
            # Free the large per-gene frames before spawning the training process
            del (features_data_)
            del (d_)
            if args.skip_regression:
                continue
            # Run the externally-generated training script; results land in the intermediate folder
            subprocess.call([
                "bash",
                _execution_script(args.intermediate_folder,
                                  data_annotation_.gene_id)
            ])
            w = pandas.read_table(_weights(args.intermediate_folder,
                                           data_annotation_.gene_id),
                                  sep="\s+")
            s = pandas.read_table(_summary(args.intermediate_folder,
                                           data_annotation_.gene_id),
                                  sep="\s+")
            # One summary row per tissue; emit that tissue's weights/summary/covariance
            for e_, entry in enumerate(s.itertuples()):
                entry_weights = w[["SNP", "REF.0.", "ALT.1.",
                                   entry.tissue]].rename(
                                       columns={
                                           "SNP": "varID",
                                           "REF.0.": "ref_allele",
                                           "ALT.1.": "eff_allele",
                                           entry.tissue: "weight"
                                       })
                # Keep only variants the model actually uses in this tissue
                entry_weights = entry_weights[entry_weights.weight != 0]
                entry_weights = entry_weights.assign(
                    gene=data_annotation_.gene_id)
                entry_weights = entry_weights.merge(features_,
                                                    left_on="varID",
                                                    right_on="id",
                                                    how="left")
                entry_weights = entry_weights[WEIGHTS_FIELDS]
                if args.output_rsids:
                    # Fall back to the variant id where no rsid is known
                    entry_weights.loc[entry_weights.rsid == "NA",
                                      "rsid"] = entry_weights.loc[
                                          entry_weights.rsid == "NA", "varID"]
                weights[entry.tissue].write(
                    entry_weights.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())
                entry_summary = s[s.tissue == entry.tissue].rename(
                    columns={
                        "zscore_pval": "pred.perf.pval",
                        "rho_avg_squared": "pred.perf.R2"
                    })
                entry_summary = entry_summary.assign(
                    gene=data_annotation_.gene_id,
                    alpha=0.5,
                    genename=data_annotation_.gene_name,
                    gene_type=data_annotation_.gene_type,
                    n_snps_in_window=features_.shape[0])
                entry_summary["n.snps.in.model"] = entry_weights.shape[0]
                #must repeat strings beause of weird pandas indexing issue
                entry_summary = entry_summary.drop(
                    ["R2", "n", "tissue"], axis=1)[[
                        "gene", "genename", "gene_type", "alpha",
                        "n_snps_in_window", "n.snps.in.model", "rho_avg",
                        "pred.perf.R2", "pred.perf.pval"
                    ]]
                summaries[entry.tissue].write(
                    entry_summary.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())
                # Sample covariance of the model's variants' dosages (rows = variants)
                features_data_ = Parquet._read(
                    features, [x for x in entry_weights.varID.values],
                    to_pandas=True)
                var_ids = [x for x in entry_weights.varID.values]
                cov = numpy.cov([features_data_[k] for k in var_ids], ddof=1)
                ids = [x for x in entry_weights.rsid.values
                       ] if args.output_rsids else var_ids
                cov = matrices._flatten_matrix_data([(data_annotation_.gene_id,
                                                      ids, cov)])
                for cov_ in cov:
                    l = "{} {} {} {}\n".format(cov_[0], cov_[1], cov_[2],
                                               cov_[3]).encode()
                    covariances[entry.tissue].write(l)
            if not args.keep_intermediate_folder:
                logging.info("Cleaning up")
                shutil.rmtree(
                    _intermediate_folder(args.intermediate_folder,
                                         data_annotation_.gene_id))
            if args.MAX_M and i >= args.MAX_M:
                logging.info("Early abort")
                break
    except Exception as e:
        # Keep going to the teardown; set_down is told the run failed
        logging.info("Exception running model training:\n%s",
                     traceback.format_exc())
        failed_run = True
    finally:
        pass
        # if not args.keep_intermediate_folder:
        #     shutil.rmtree(args.intermediate_folder)
    if not args.skip_regression:
        # Close all per-tissue sinks (and flag failure) even on partial runs
        set_down(weights, summaries, covariances, tissue_names, failed_run)
    logging.info("Finished")