def run(args):
    if os.path.exists(args.output_file):
        logging.info("Output %s exists. Nope", args.output_file)
        return

    results_order = []
    results = {}
    logging.info("Streaming file for groups")
    for i, line in Utilities.iterate_file(args.input_file):
        if i == 0:
            continue  # skip header
        comps = line.strip().split()
        key = comps[0]
        if key not in results:
            results_order.append(key)
            results[key] = 0
        logging.log(9, "Key: %s", str(key))
        results[key] += 1

    r = []
    logging.info("Producing output")
    for key in results_order:
        r.append((key, results[key]))
    r = pandas.DataFrame(r, columns=["key", "count"])

    logging.info("Saving")
    Utilities.ensure_requisite_folders(args.output_file)
    Utilities.save_dataframe(r, args.output_file)
    logging.info("Finished.")
def run(args): Utilities.ensure_requisite_folders(args.output) logging.info("starting lifting over.") liftover = pyliftover.LiftOver(args.liftover) with gzip.open(args.output, "w") as _o: with open(args.input) as _i: for i,line in enumerate(_i): if i ==0: line = "\t".join(line.strip().split()) + "\n" _o.write(line.encode()) continue try: comps = line.strip().split() chr = comps[0] start = int(comps[1]) end = int(comps[2]) _chrs, _s = _l(liftover, chr, start) _chre, _e = _l(liftover, chr, end) if _chrs != _chre: logging.warning("{}:{}:{} have different target chromosomes: {}/{}".format(chr, start, end, _chrs, _chre)) line = "{}\n".format("\t".join([_chrs, str(_s), str(_e)])) _o.write(line.encode()) except Exception as e: logging.info("Error for: %s", line) logging.info("Finished lifting over.")
def run(args): logging.info("Starting") Utilities.ensure_requisite_folders(args.output) logging.info("Read covariate") covariate = pq.read_table(args.covariate).to_pandas() logging.info("Read data") data = pq.read_table(args.data).to_pandas() logging.info("Processing") covariate_names = covariate.columns.values[1:] results = {"individual": data.individual.values} variables = [x for x in data.columns.values[1:]] for i, column in enumerate(variables): logging.log(9, "%i/%i:%s", i, len(variables), column) d = data[["individual", column]].rename(columns={ column: "y" }).merge(covariate, on="individual", how="inner").drop("individual", axis=1) y, X = dmatrices("y ~ {}".format(" + ".join(covariate_names)), data=d, return_type="dataframe") model = sm.OLS(y, X) result = model.fit() results[column] = result.resid results = pandas.DataFrame(results)[["individual"] + variables] Parquet.save_variable(args.output, results) logging.info("Finished")
def run(args):
    Coloc.initialize(args.coloc_script)

    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return

    start = timer()
    logging.info("Loading gwas")
    gwas = Coloc.read_gwas(args.gwas, args.gwas_sample_size, args.gwas_mode)
    streamer = Coloc.eqtl_streamer(args.eqtl, gwas)

    results = []
    logging.info("Beginning process")
    MAX_N = args.MAX_N
    for i, d in enumerate(streamer):
        gene = d.gene_id.values[0]
        logging.log(9, "Processing gene %s", gene)
        eqtl = Coloc.get_eqtl(d, args.eqtl_sample_size, args.eqtl_mode)
        r = Coloc.coloc_on_gwas_eqtl(gene, gwas, eqtl, args.gwas_mode, args.eqtl_mode, args.p1, args.p2, args.p12)
        results.append(r)
        if MAX_N and i > MAX_N:
            logging.info("Early exit")
            break

    logging.info("Saving")
    results = Coloc.results_to_dataframe(results)
    Utilities.ensure_requisite_folders(args.output)
    Utilities.save_dataframe(results, args.output)

    end = timer()
    logging.info("Finished COLOC in %s seconds", str(end - start))
def run(args): start = timer() Utilities.ensure_requisite_folders(args.output_prefix) logging.info("Loading SNP annotation") snp_key = KeyedDataSource.load_data(args.snp_annotation_file, "varID", "rsid_dbSNP150", should_skip=KeyedDataSource.skip_na) logging.info("Loading Genotype") genotype, individual_ids = ModelTraining.load_genotype_folder( args.input_genotype_folder, args.input_genotype_file_pattern, snp_key) logging.info("Saving Genotype") path_variant = args.output_prefix + ".variants.parquet" Parquet.save_variants(path_variant, genotype, individual_ids) path_metadata_variant = args.output_prefix + ".variants_metadata.parquet" Parquet.save_metadata(path_metadata_variant, genotype) logging.info("Processing Expression Phenotype") expression_logic = Utilities.file_logic( args.input_phenotype_folder, args.input_phenotype_expression_pattern) for row in expression_logic.itertuples(): logging.info("Phenotype: %s", row.name) process_phenotype(row.path, row.name, args.output_prefix) end = timer() logging.info("Finished in %s", str(end - start))
def __enter__(self): logging.info("initializing resources") logging.info("Loading regions") regions = load_regions(self.args.region_file, self.args.chromosome) if args.sub_batches and args.sub_batch is not None: logging.log(9, "Selecting target regions from sub-batches") regions = PandasHelpers.sub_batch(regions, args.sub_batches, args.sub_batch) self.regions = regions logging.info("Opening variants metadata") self.vmf = pq.ParquetFile(args.parquet_genotype_metadata) logging.info("Creating destination") if args.text_output: if os.path.exists(args.text_output): raise RuntimeError("Output exists. Nope.") Utilities.ensure_requisite_folders(args.text_output) self.of = TextFileTools.TextDataSink( args.text_output, [("region", "id1", "id2", "value")]) self.of.initialize() elif args.text_output_folder: Utilities.maybe_create_folder(args.text_output_folder) else: raise RuntimeError("Unrecognized output specification") if (args.parquet_genotype_folder and args.parquet_genotype_pattern): self.file_map = get_file_map(args) else: raise RuntimeError("Unrecognized genotype specification") return self
def run(args):
    start = timer()

    if os.path.exists(args.output_folder):
        logging.info("Output folder exists. Nope.")
        return

    if os.path.exists(args.intermediate_folder):
        logging.info("Intermediate folder exists. Nope.")
        return

    stats = []
    context = DAPUtilities.context_from_args(args)
    available_genes = context.get_available_genes()
    for i, gene in enumerate(available_genes):
        if args.MAX_M and i == args.MAX_M:
            break
        _start = timer()
        logging.log(8, "Processing %i/%i:%s", i + 1, len(available_genes), gene)
        _stats = RunDAP.run_dap(context, gene)
        _end = timer()
        logging.log(7, "Elapsed: %s", str(_end - _start))
        stats.append(_stats)

    end = timer()
    logging.info("Ran DAP in %s seconds", str(end - start))

    Utilities.ensure_requisite_folders(args.output_folder)
    stats_ = args.stats_name if args.stats_name else "stats.txt"
    stats_path = os.path.join(args.output_folder, stats_)
    stats = RunDAP.data_frame_from_stats(stats).fillna("NA")
    Utilities.save_dataframe(stats, stats_path)
def run(args): Utilities.ensure_requisite_folders(args.output_prefix) logging.info("Loading snp reference") key = KeyedDataSource.load_data(args.snp_reference_file, "variant_id", "rs_id_dbSNP150_GRCh38p7", value_conversion=KeyedDataSource.dot_to_na) logging.info("Loading samples") samples = TextFileTools.load_list(args.samples) genotype_format_string = "\t".join(["{}"] * (len(samples) + 1)) + "\n" og = args.output_prefix + "_genotype.txt.gz" oa = args.output_prefix + "_annotation.txt.gz" if os.path.exists(og) or os.path.exists(oa): logging.info("Output exists. Nope.") return logging.info("Processing") with gzip.open(args.genotype) as geno: with gzip.open(og, "w") as _og: _og.write(_to_gl(["varID"] + samples, genotype_format_string)) with gzip.open(oa, "w") as _oa: _oa.write( _to_al([ "chromosome", "position", "id", "allele_0", "allele_1", "allele_1_frequency", "rsid" ])) for i, line in enumerate(geno): comps = line.decode().strip().split() chr = "chr" + comps[0] pos = comps[2] ref = comps[3] alt = comps[4] af = comps[5] dosage = comps[6:] var_id = "{}_{}_{}_{}_b38".format(chr, pos, ref, alt) if var_id in key: id = key[var_id] comps[1] = var_id _og.write( _to_gl([var_id] + dosage, genotype_format_string)) _oa.write(_to_al([chr, pos, var_id, ref, alt, af, id])) next var_id = "{}_{}_{}_{}_b38".format(chr, pos, alt, ref) if var_id in key and len(ref) == 1 and len(alt) == 1: id = key[var_id] af = str(1 - float(af)) dosage = list(map(lambda x: str(2 - int(x)), comps[6:])) _og.write( _to_gl([var_id] + dosage, genotype_format_string)) _oa.write(_to_al([chr, pos, var_id, alt, ref, af, id])) next logging.info("Finished conversion")
def run(args): if os.path.exists(args.output): logging.info("output exists already, delete it or move it") return logging.info("Starting") Utilities.ensure_requisite_folders(args.output) logging.info("Loading data annotation") gene_annotation = StudyUtilities.load_gene_annotation(args.gene_annotation) gene_annotation = gene_annotation.rename( {"gene_name": "genename"}, axis=1)[["gene_id", "genename", "gene_type"]] logging.info("Loading variant annotation") features_metadata = pq.read_table(args.features_annotation).to_pandas() logging.info("Loading spec") weights = get_weights(args.spec) w = weights.merge(features_metadata[["id", "allele_0", "allele_1", "rsid"]], on="id", how="left") w = w.rename( { "allele_0": "ref_allele", "allele_1": "eff_allele", "id": "varID" }, axis=1) w["gene"] = w.gene_id.str.cat(w.cluster_id.astype(str), sep="_") w = w.drop(["w", "cluster_id"], axis=1) w = w.sort_values(by="gene").assign(weight=1) logging.info("Building models") with sqlite3.connect(args.output) as conn: w.drop("gene_id", axis=1).fillna("NA")[[ "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight" ]].to_sql("weights", conn, index=False) e = w[["gene_id", "gene"]].merge(gene_annotation, on="gene_id").drop("gene_id", axis=1) e["n_snps_in_window"] = None e["n.snps.in.model"] = 1 e["pred.perf.pval"] = None e["pred.perf.qval"] = None e["pred.perf.R2"] = None e = e[[ "gene", "genename", "gene_type", "n_snps_in_window", "n.snps.in.model", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval" ]] e.to_sql("extra", conn, index=False) Models.model_indexes(conn) logging.info("Finished")
def run(args):
    start = timer()
    Utilities.ensure_requisite_folders(args.parquet_output)

    logging.info("Loading variable")
    variables = ModelTraining.load_variable_file(args.variable_file)

    logging.info("Saving")
    Parquet.save_variable(args.parquet_output, variables)

    end = timer()
    logging.info("Finished in %s", str(end - start))
def save_study(study, selected_snps, simulated_gencode, prefix, _save):
    Utilities.ensure_requisite_folders(prefix)
    _save(study)

    selected_snps_ = prefix + ".selected_snps.txt.gz"
    Utilities.write_iterable_to_file(selected_snps, selected_snps_)

    gencode_path = os.path.join(os.path.split(prefix)[0], "gene_annotation.txt.gz")
    Utilities.save_dataframe(simulated_gencode, gencode_path)
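# Hypothetical usage of save_study: `_save` is a callback that knows how to
# persist the study object itself, so save_study only handles the side files.
# The `persist_study` helper below is illustrative, not from the source.
# save_study(study, selected_snps, simulated_gencode,
#            prefix="results/sim/study_1",
#            _save=lambda s: persist_study(s, "results/sim/study_1"))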
def run(args):
    if os.path.exists(args.cs_output) or os.path.exists(args.var_output):
        logging.info("Output exists. Nope.")
        return

    study, variants_whitelist = get_study(args.parquet_genotype_folder, args.parquet_genotype_pattern, args.parquet_genotype_metadata)
    #_skip = lambda x: x not in variants_whitelist
    columns = ["maf", "pval_nominal", "slope", "slope_se"]
    eqtl_streamer = DataFrameStreamer.data_frame_streamer(args.eqtl, sanitize=True, to_numeric=columns, sentinel_column="gene_id")
    individuals = None if not args.restrict_to_individuals else TextFileTools.load_list(args.restrict_to_individuals)
    genes = None if not args.restrict_to_genes else set(TextFileTools.load_list(args.restrict_to_genes))

    cs_results = []
    var_results = []
    logging.info("Beginning process")
    MAX_N = args.MAX_N
    n = args.sample_size
    for i, d in enumerate(eqtl_streamer):
        if MAX_N and i > MAX_N:
            logging.info("Early exit")
            break
        gene = d.gene_id.values[0]
        if genes is not None and gene.split('.')[0] not in genes:
            logging.log(9, "Skipping gene: %s", gene)
            continue
        logging.log(9, "Processing gene %i:%s", i + 1, gene)
        d = d.loc[(~d.slope_se.isnull()) & (d.slope != 0) & (~d.slope.isnull())]
        try:
            res_, d_ = _do_susie(d, study, variants_whitelist, n, individuals, args.mode)
            cs, vars = _process_result(res_, d_, gene)
        except Exception as e:
            logging.log(9, "Error while doing susie:\n%s", traceback.format_exc())
            cs = _void_cs("susie_error").assign(gene_id=gene, pp_sum=None)
            vars = _void_var().assign(gene_id=[gene], var_id=[None])
        cs_results.append(cs)
        #if vars.shape[1]>0:
        var_results.append(vars)

    if len(cs_results) > 0:
        logging.info("Saving")
        cs_results = pandas.concat(cs_results)[["gene_id", "cs", "cs_avg_r2", "cs_log10bf", "cs_min_r2", "var_id", "pp_sum", "status"]]
        Utilities.ensure_requisite_folders(args.cs_output)
        Utilities.save_dataframe(cs_results, args.cs_output)
    else:
        logging.info('No results')

    if len(var_results) > 0:
        var_results = pandas.concat(var_results)[["gene_id", "var_id", "cs", "variable_prob"]]
        Utilities.ensure_requisite_folders(args.var_output)
        Utilities.save_dataframe(var_results, args.var_output)

    logging.info("Ran susie")
def run(args): if os.path.exists(args.output): logging.info("Output exists. Nope.") return logging.info("Loading samples") samples = {x for x in TextFileTools.load_list(args.samples_whitelist)} logging.info("Processing file") Utilities.ensure_requisite_folders(args.output) Utilities.write_iterable_to_file(input_generator(args.input_file, samples), args.output) logging.info("Finished")
def run(args): if os.path.exists(args.output): logging.info("Output exists. Nope.") return Utilities.ensure_requisite_folders(args.output) logging.info("Acquiring files") logic = Utilities.file_logic_2(args.input_folder, args.input_pattern, args.name_subfield, args.input_filter) trait_map = None if args.trait_map: logging.info("Loading file mapping") trait_map = get_trait_map(args.trait_map) gene_id_map, gene_name_map = None, None if args.gene_annotation: logging.info("Loading gene annotation") gene_id_map, gene_name_map = get_gene_map(args.gene_annotation) logging.info("Processing files") r = [] for f in logic.itertuples(): logging.info("Processing %s", f.file) names = get_header_names(args.header_names) if args.separator == ",": d = pandas.read_csv(f.path, header='infer' if not names else None, names=names) elif args.separator is None: d = pandas.read_table(f.path, header='infer' if not names else None, names=get_header_names(args.header_names), sep="\s+") else: raise RuntimeError("Unsupported separator") if args.specific_post_processing == "FAST_ENLOC": d = fast_enloc_postprocessing(d, gene_id_map, gene_name_map) elif args.specific_post_processing: raise RuntimeError("Unsupported postprocessing option") d = d.assign(trait=trait_map[f.trait], tissue=f.tissue) r.append(d) r = pandas.concat(r) logging.info("Saving") Utilities.save_dataframe(r, args.output) logging.info("Finished processing.")
def run(args): start = timer() Utilities.ensure_requisite_folders(args.output_prefix) logging.info("Loading SNP annotation") #TODO: make more generic variant_key = get_variant_key(args) if args.split_by_chromosome: generate_multi_backend(args, variant_key) else: generate_single_backend(args, variant_key) end = timer() logging.info("Finished in %s", str(end - start))
def run(args): if os.path.exists(args.output): logging.info("Output exists. Nope.") return Utilities.ensure_requisite_folders(args.output) logging.info("Loading variant annotation") variants = KeyedDataSource.load_data(args.variant_annotation, "variant_id", args.rsid_column) logging.info("Loading data annotation") if len(args.data_annotation) == 1: data_annotation = pandas.read_table(args.data_annotation[0]) data_annotation = data_annotation[["gene_id", "gene_name", "feature_type", "gene_type"]][data_annotation.feature_type == "gene"].drop_duplicates() elif len(args.data_annotation) == 2: data_annotation = pandas.read_table(args.data_annotation[0]) data_annotation = data_annotation[["gene_id", "gene_name", "feature_type", "gene_type"]][ data_annotation.feature_type == args.data_annotation[1]].drop_duplicates() else: raise RuntimeError("Unsupported annotation length") logging.info("Loading model_input") data = pandas.read_table(args.model_input, usecols=["gene_id", "gene_name", "variant", "weight"]) logging.info("Processing") if args.model_filter and args.model_filter[1] == "PIP": w = Miscellaneous.dapg_signals(args.model_filter[0], float(args.model_filter[2]), variants) w = w.rename(columns={"gene":"gene_id", "variant_id":"variant"}) data = data.merge(w[["gene_id", "variant"]], on=["gene_id", "variant"]) v = pandas.DataFrame([(k,variants[k]) for k in data.variant.drop_duplicates()], columns=["variant", "rsid"]) v.loc[v.rsid == ".", "rsid"] = v.loc[v.rsid == ".", "variant"] weights = data.merge(v, on="variant") weights = weights.assign( ref_allele = weights.variant.str.replace("(.*)_(.*)_(.*)_(.*)_b38", lambda x: x.group(3)), eff_allele=weights.variant.str.replace("(.*)_(.*)_(.*)_(.*)_b38", lambda x: x.group(4))) weights = weights.rename(columns={"variant":"varID", "gene_id":"gene"})[["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]] extra = data.groupby("gene_id").size().to_frame("n.snps.in.model").reset_index() extra = extra.merge(data_annotation[["gene_id", "gene_name", "gene_type"]], on="gene_id") extra["pred.perf.pval"] = None extra["pred.perf.qval"] = None extra["pred.perf.R2"] = None extra = extra[["gene_id", "gene_name", "gene_type", "n.snps.in.model", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]].rename(columns={"gene_id":"gene", "gene_name":"genename"}) logging.info("Saving db") Models.create_model_db(args.output, extra, weights) logging.info("Done")
def run(args): logging.info("Loading annotation") annotation = pandas.read_table(args.input_annotation) logging.info("Loading region") regions, genes = build_regions(annotation, args.chromosome, args.sub_jobs, args.window) file_name = os.path.split(args.input_file)[1] name = file_name.split(".txt.gz")[0] logging.info("Saving gene lists") gene_outputs = [ os.path.join(args.output_folder, name) + "_{}_genes.txt.gz".format(i) for i in range(1, args.sub_jobs + 1) ] for i, p in enumerate(gene_outputs): with gzip.open(p, "w") as f: genes_ = genes[i] for gene in genes_: f.write("{}\n".format(gene).encode()) logging.info("Processing file") outputs = [ os.path.join(args.output_folder, name) + "_{}.txt.gz".format(i) for i in range(1, args.sub_jobs + 1) ] Utilities.ensure_requisite_folders(outputs[0]) output_files = [gzip.open(x, "w") for x in outputs] with gzip.open(args.input_file) as input_file: header = input_file.readline() for f in output_files: f.write(header) for i, line in enumerate(input_file): comps = line.decode().strip().split() pos = int(comps[0].split("_")[1]) targets = regions[(regions.start <= pos) & (pos < regions.end)] for target in targets.itertuples(): f = output_files[target.Index] f.write(line) logging.info("Finalizing output files") for f in output_files: f.close() logging.info("Finished")
def run(args): if os.path.exists(args.output): logging.info("Output exists. Nope.") return start = timer() logging.info("Beginning process") if args.by_region_file: results = run_by_region(args) else: results = run_by_variant(args) Utilities.ensure_requisite_folders(args.output) Utilities.save_dataframe(results, args.output) end = timer() logging.info("Finished in %s seconds", str(end - start))
def run(args): if os.path.exists(args.output): logging.info("Output already exists, either delete it or move it") return logging.info("Getting parquet genotypes") file_map = get_file_map(args) logging.info("Getting variants") gene_variants = get_gene_variant_list(args.model_db_folder, args.model_db_file_pattern) genes = list(gene_variants.gene.drop_duplicates()) Utilities.ensure_requisite_folders(args.output) logging.info("Processing") with gzip.open(args.output, "w") as f: f.write("GENE RSID1 RSID2 VALUE\n".encode()) for i, g in enumerate(gene_variants.gene.drop_duplicates()): logging.log(9, "Proccessing %i/%i:%s", i + 1, len(genes), g) w = gene_variants[gene_variants.gene == g] chr_ = w.varID.values[0].split("_")[0].split("chr")[1] if not n_.search(chr_): logging.log(9, "Unsupported chromosome: %s", chr_) continue dosage = file_map[int(chr_)] d = Parquet._read(dosage, columns=w.varID.values, skip_individuals=True) var_ids = list(d.keys()) if args.output_rsids: ids = [ x for x in pandas.DataFrame({ "varID": var_ids }).merge(w[["varID", "rsid"]], on="varID").rsid.values ] else: ids = var_ids c = numpy.cov([d[x] for x in var_ids]) c = matrices._flatten_matrix_data([(w.gene.values[0], ids, c)]) for entry in c: l = "{} {} {} {}\n".format(entry[0], entry[1], entry[2], entry[3]) f.write(l.encode()) logging.info("Finished building covariance.")
def run(args): logging.info("Loading annotation") annotation = pandas.read_table(args.gene_annotation, usecols=["chromosome", "gene_id"]) logging.info("Creating split map") split_map = get_split_map(annotation, args.splits, args.output_format) conversion = None if args.key_conversion == "INTRON": conversion = convert_to_intron Utilities.ensure_requisite_folders(args.output_format) logging.info("Processing") last_key=None last_file=None wrote_header=set() unmmapped=set() with gzip.open(args.input) as f: header = f.readline() for i,line in enumerate(f): comps = line.decode().split() gene_id = comps[0] if conversion: gene_id = conversion(gene_id) if not gene_id in split_map: if not gene_id in unmmapped: logging.log(9, "Unmapped gene %s", gene_id) unmmapped.add(gene_id) continue _split = split_map[gene_id] if last_key != _split[KEY]: if last_file: logging.log(9, "Closing %s", last_key) last_file.close() last_key = _split[KEY] logging.log(9, "Opening %s", last_key) last_file = gzip.open(_split[PATH], "a") if not last_key in wrote_header: last_file.write(header) wrote_header.add(last_key) last_file.write(line) logging.info("Finished processing")
def run(args): logging.info("Acquiring whitelist") #whitelist = get_gene_whitelist(args) logging.info("Processing...") Utilities.ensure_requisite_folders(args.output) with gzip.open(args.output, "w") as _o: _o.write("gene\tcluster\tchromosome\tstart\tend\n".encode()) with gzip.open(args.input) as _i: for i, line in enumerate(_i): if i == 0: continue comps = line.decode().strip().split() d = comps[3].split(":") o = "{}\t{}\t{}\t{}\t{}\n".format(d[4], d[3], d[0], d[1], d[2]).encode() _o.write(o) logging.info("Finished")
def run(args):
    if (not args.output and not args.output_blacklist) or (args.output and args.output_blacklist):
        logging.info("Provide exactly one output argument")
        return
    if args.output and os.path.exists(args.output):
        logging.info("Output path %s exists. Nope.", args.output)
        return
    if args.output_blacklist and os.path.exists(args.output_blacklist):
        logging.info("Output path for skipped variants %s exists. Nope.", args.output_blacklist)
        return

    start = timer()
    logging.info("Started parsing DB SNP file")

    if args.output:
        Utilities.ensure_requisite_folders(args.output)
        entries = Utilities.lineify(DBSnp.generate(args.input, args.fields, args.keep_zero_based, recode_observed=args.recode_observed))
        Utilities.write_iterable_to_file(entries, args.output, Utilities.to_line(args.fields))
    else:
        Utilities.ensure_requisite_folders(args.output_blacklist)
        entries = Utilities.lineify(DBSnp.generate_skips(args.input, args.fields, args.keep_zero_based, recode_observed=args.recode_observed))
        Utilities.write_iterable_to_file(entries, args.output_blacklist, Utilities.to_line(args.fields + ["reason"]))

    end = timer()
    logging.info("Finished parsing in %s seconds", str(end - start))
def run(args): start = timer() Utilities.ensure_requisite_folders(args.parquet_output) logging.info("Loading snp annotation") key_to_snp = load_annotation(args) logging.info("Processing eqtl") _skip = (lambda x: _skip_missing_in_key(x, key_to_snp)) if args.restrict_to_annotation else None streamer_ = DataFrameStreamer.data_frame_streamer(args.gtex_eqtl_file, sanitize=True, to_numeric=["maf", "pval_nominal", "slope", "slope_se"], sentinel_column="gene_id", additional_skip_row_check=_skip) with _data_sink(args) as sink: for i,d in enumerate(streamer_): if d.shape[0] == 0: logging.log(8, "Skipping %d", i) continue logging.log(8, "Processing %d/%s", i, d.gene_id[0]) p = _process(d, key_to_snp) sink.sink(p) end = timer() logging.info("Ran conversion in %s", str(end-start))
def run(args):
    r = re.compile(args.pattern)
    files = [x for x in os.listdir(args.folder) if r.search(x)]
    if args.sort_groups:
        files = sorted(files, key=lambda x: _key(x, r, args.sort_groups))

    output_firstline = True
    Utilities.ensure_requisite_folders(args.output)

    logging.info("Starting concatenation")
    with gzip.open(args.output, "w") as o:
        for file in files:
            path = os.path.join(args.folder, file)
            logging.log(9, "Opening %s", path)
            for i, line in Utilities.iterate_file(path):
                if i == 0:
                    # write the first line of the first file only, unless the
                    # inputs are headerless (then every first line is data)
                    if output_firstline:
                        o.write(line.encode())
                        if not args.headerless:
                            output_firstline = False
                    continue
                o.write(line.encode())

    logging.info("Finished")
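# A minimal sketch of the `_key` sort helper used above. Assumption:
# `sort_groups` lists regex group indices whose captured values should be
# compared numerically, so that e.g. chr2 sorts before chr10.
def _key(file_name, regex, sort_groups):
    m = regex.search(file_name)
    return tuple(int(m.group(g)) for g in sort_groups)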
def run(args): wp = args.output_prefix + "_weights.txt.gz" if os.path.exists(wp): logging.info("Weights output exists already, delete it or move it") return sp = args.output_prefix + "_summary.txt.gz" if os.path.exists(sp): logging.info("Summary output exists already, delete it or move it") return cp = args.output_prefix + "_covariance.txt.gz" if os.path.exists(wp): logging.info("covariance output exists already, delete it or move it") return r = args.output_prefix + "_run.txt.gz" if os.path.exists(wp): logging.info("run output exists already, delete it or move it") return logging.info("Starting") Utilities.ensure_requisite_folders(args.output_prefix) logging.info("Opening data") data = pq.ParquetFile(args.data) available_data = {x for x in data.metadata.schema.names} logging.info("Loading data annotation") data_annotation = StudyUtilities.load_gene_annotation( args.data_annotation, args.chromosome, args.sub_batches, args.sub_batch) data_annotation = data_annotation[data_annotation.gene_id.isin( available_data)] if args.gene_whitelist: logging.info("Applying gene whitelist") data_annotation = data_annotation[data_annotation.gene_id.isin( set(args.gene_whitelist))] logging.info("Kept %i entries", data_annotation.shape[0]) logging.info("Opening features annotation") if not args.chromosome: features_metadata = pq.read_table(args.features_annotation).to_pandas() else: features_metadata = pq.ParquetFile( args.features_annotation).read_row_group(args.chromosome - 1).to_pandas() if args.chromosome and args.sub_batches: logging.info("Trimming variants") features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation( features_metadata, data_annotation, args.window) if args.rsid_whitelist: logging.info("Filtering features annotation") whitelist = TextFileTools.load_list(args.rsid_whitelist) whitelist = set(whitelist) features_metadata = features_metadata[features_metadata.rsid.isin( whitelist)] if args.features_weights: logging.info("Loading weights") x_weights = get_weights(args.features_weights, {x for x in features_metadata.id}) logging.info( "Filtering features metadata to those available in weights") features_metadata = features_metadata[features_metadata.id.isin( x_weights.id)] logging.info("Kept %d entries", features_metadata.shape[0]) else: x_weights = None logging.info("Opening features") features = pq.ParquetFile(args.features) logging.info("Setting R seed") s = numpy.random.randint(1e8) set_seed(s) if args.run_tag: d = pandas.DataFrame({ "run": [args.run_tag], "cv_seed": [s] })[["run", "cv_seed"]] Utilities.save_dataframe(d, r) WEIGHTS_FIELDS = [ "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight" ] SUMMARY_FIELDS = [ "gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model", "test_R2_avg", "test_R2_sd", "cv_R2_avg", "cv_R2_sd", "in_sample_R2", "nested_cv_fisher_pval", "nested_cv_converged", "rho_avg", "rho_se", "rho_zscore", "pred.perf.R2", "pred.perf.pval", "pred.perf.qval" ] train = train_elastic_net_wrapper if args.mode == "elastic_net" else train_ols with gzip.open(wp, "w") as w: w.write(("\t".join(WEIGHTS_FIELDS) + "\n").encode()) with gzip.open(sp, "w") as s: s.write(("\t".join(SUMMARY_FIELDS) + "\n").encode()) with gzip.open(cp, "w") as c: c.write("GENE RSID1 RSID2 VALUE\n".encode()) for i, data_annotation_ in enumerate( data_annotation.itertuples()): if args.MAX_M and i >= args.MAX_M: logging.info("Early abort") break logging.log(9, "processing %i/%i:%s", i + 1, data_annotation.shape[0], data_annotation_.gene_id) if args.repeat: for j 
in range(0, args.repeat): logging.log(9, "%i-th reiteration", j) process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, j, args.nested_cv_folds) else: process(w, s, c, data, data_annotation_, features, features_metadata, x_weights, SUMMARY_FIELDS, train, nested_folds=args.nested_cv_folds) logging.info("Finished")
def run(args):
    Utilities.maybe_create_folder(args.intermediate_folder)
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    p_ = re.compile(args.data_name_pattern)
    f = [x for x in sorted(os.listdir(args.data_folder)) if p_.search(x)]
    tissue_names = [p_.search(x).group(1) for x in f]
    data = []
    for i in range(0, len(tissue_names)):
        logging.info("Loading %s", tissue_names[i])
        data.append((tissue_names[i], pq.ParquetFile(os.path.join(args.data_folder, f[i]))))
    data = collections.OrderedDict(data)
    available_data = {x for p in data.values() for x in p.metadata.schema.names}

    logging.info("Preparing output")
    WEIGHTS_FIELDS = ["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]
    SUMMARY_FIELDS = ["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model",
                      "rho_avg", "pred.perf.R2", "pred.perf.pval"]

    if args.skip_regression:
        weights, summaries, covariances = None, None, None
    else:
        weights, summaries, covariances = setup_output(args.output_prefix, tissue_names, WEIGHTS_FIELDS, SUMMARY_FIELDS)

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities._load_gene_annotation(args.data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(available_data)]
    if args.chromosome or (args.sub_batches and args.sub_batch):
        data_annotation = StudyUtilities._filter_gene_annotation(data_annotation, args.chromosome, args.sub_batches, args.sub_batch)
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(args.features_annotation).to_pandas()
    else:
        features_metadata = pq.ParquetFile(args.features_annotation).read_row_group(args.chromosome - 1).to_pandas()

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = StudyUtilities.trim_variant_metadata_on_gene_annotation(features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = TextFileTools.load_list(args.rsid_whitelist)
        whitelist = set(whitelist)
        features_metadata = features_metadata[features_metadata.rsid.isin(whitelist)]

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    seed = numpy.random.randint(1e8)
    if args.run_tag:
        d = pandas.DataFrame({"run": [args.run_tag], "cv_seed": [seed]})[["run", "cv_seed"]]
        for t in tissue_names:
            Utilities.save_dataframe(d, "{}_{}_runs.txt.gz".format(args.output_prefix, t))

    failed_run = False
    try:
        for i, data_annotation_ in enumerate(data_annotation.itertuples()):
            logging.log(9, "processing %i/%i:%s", i + 1, data_annotation.shape[0], data_annotation_.gene_id)

            logging.log(8, "loading data")
            d_ = {}
            for k, v in data.items():
                d_[k] = Parquet._read(v, [data_annotation_.gene_id], to_pandas=True)

            features_ = Genomics.entries_for_gene_annotation(data_annotation_, args.window, features_metadata)
            if features_.shape[0] == 0:
                logging.log(9, "No features available")
                continue

            features_data_ = Parquet._read(features, [x for x in features_.id.values], to_pandas=True)
            features_data_["id"] = range(1, features_data_.shape[0] + 1)
            features_data_ = features_data_[["individual", "id"] + [x for x in features_.id.values]]

            logging.log(8, "training")
            prepare_ctimp(args.script_path, seed, args.intermediate_folder, data_annotation_, features_, features_data_, d_)
            del features_data_
            del d_
            if args.skip_regression:
                continue

            subprocess.call(["bash", _execution_script(args.intermediate_folder, data_annotation_.gene_id)])

            w = pandas.read_table(_weights(args.intermediate_folder, data_annotation_.gene_id), sep=r"\s+")
            s = pandas.read_table(_summary(args.intermediate_folder, data_annotation_.gene_id), sep=r"\s+")
            for e_, entry in enumerate(s.itertuples()):
                entry_weights = w[["SNP", "REF.0.", "ALT.1.", entry.tissue]].rename(columns={"SNP": "varID", "REF.0.": "ref_allele", "ALT.1.": "eff_allele", entry.tissue: "weight"})
                entry_weights = entry_weights[entry_weights.weight != 0]
                entry_weights = entry_weights.assign(gene=data_annotation_.gene_id)
                entry_weights = entry_weights.merge(features_, left_on="varID", right_on="id", how="left")
                entry_weights = entry_weights[WEIGHTS_FIELDS]
                if args.output_rsids:
                    entry_weights.loc[entry_weights.rsid == "NA", "rsid"] = entry_weights.loc[entry_weights.rsid == "NA", "varID"]
                weights[entry.tissue].write(entry_weights.to_csv(sep="\t", index=False, header=False, na_rep="NA").encode())

                entry_summary = s[s.tissue == entry.tissue].rename(columns={"zscore_pval": "pred.perf.pval", "rho_avg_squared": "pred.perf.R2"})
                entry_summary = entry_summary.assign(gene=data_annotation_.gene_id, alpha=0.5, genename=data_annotation_.gene_name, gene_type=data_annotation_.gene_type, n_snps_in_window=features_.shape[0])
                entry_summary["n.snps.in.model"] = entry_weights.shape[0]
                # must repeat the field list because of a quirky pandas indexing issue
                entry_summary = entry_summary.drop(["R2", "n", "tissue"], axis=1)[["gene", "genename", "gene_type", "alpha", "n_snps_in_window", "n.snps.in.model", "rho_avg", "pred.perf.R2", "pred.perf.pval"]]
                summaries[entry.tissue].write(entry_summary.to_csv(sep="\t", index=False, header=False, na_rep="NA").encode())

                features_data_ = Parquet._read(features, [x for x in entry_weights.varID.values], to_pandas=True)
                var_ids = [x for x in entry_weights.varID.values]
                cov = numpy.cov([features_data_[k] for k in var_ids], ddof=1)
                ids = [x for x in entry_weights.rsid.values] if args.output_rsids else var_ids
                cov = matrices._flatten_matrix_data([(data_annotation_.gene_id, ids, cov)])
                for cov_ in cov:
                    l = "{} {} {} {}\n".format(cov_[0], cov_[1], cov_[2], cov_[3]).encode()
                    covariances[entry.tissue].write(l)

            if not args.keep_intermediate_folder:
                logging.info("Cleaning up")
                shutil.rmtree(_intermediate_folder(args.intermediate_folder, data_annotation_.gene_id))

            if args.MAX_M and i >= args.MAX_M:
                logging.info("Early abort")
                break
    except Exception as e:
        logging.info("Exception running model training:\n%s", traceback.format_exc())
        failed_run = True
    finally:
        pass
        # if not args.keep_intermediate_folder:
        #     shutil.rmtree(args.intermediate_folder)

    if not args.skip_regression:
        set_down(weights, summaries, covariances, tissue_names, failed_run)

    logging.info("Finished")
def run(args): Utilities.ensure_requisite_folders(args.output) logging.info("Loading db snp file") #db_snp_mapping = load_dbsnp_mapping(args.dbsnp_file) logging.info("processing") files = sorted(args.info_files, key=chr_key) r = [] variant_key = {} for p in files: with gzip.open(p) as f: logging.info("%s", p) for i, l in enumerate(f): if i == 0: continue # if i > 20000: # break comps = l.decode().strip().split() variant = comps[0] variant_comps = variant.split(":") chr = "chr" + variant_comps[0] pos = variant_comps[1] ref = variant_comps[2] alt = variant_comps[3] if "CN" in ref or "CN" in alt: continue freq = comps[3] variant_id = "{}_{}_{}_{}_b37".format(chr, pos, ref, alt) r.append((chr, pos, variant_id, ref, alt, freq)) k = "{}_{}".format(chr, pos) if not k in variant_key: variant_key[k] = [] variant_key[k].append(variant_id) r = pandas.DataFrame(data=r, columns=[ "chromosome", "position", "id", "allele_0", "allele_1", "allele_1_frequency" ]) variant_key = {} for t in r.itertuples(): k = "{}_{}".format(t.chromosome, t.position) if not k in variant_key: variant_key[k] = [] variant_key[k].append(t.id) logging.info("looking for rsids in ucsc dbsnp file") dbsnp_mapping = load_dbsnp_mapping(args.dbsnp_file, variant_key) rsids = [] for id in r.id: rsids.append(dbsnp_mapping[id] if id in dbsnp_mapping else "NA") r["rsid"] = rsids logging.info("Saving") Utilities.save_dataframe(r, args.output) logging.info("Done")
def run(args): logging.info("Processing...") Utilities.ensure_requisite_folders(args.output_prefix) spec = Utilities.file_logic(args.input_folder, args.input_pattern) with gzip.open(args.output_prefix + ".models.txt.gz", mode="w") as models: models.write("gene\tmodel\tn\tpp\tps\n".encode()) with gzip.open(args.output_prefix + ".models_variants.txt.gz", mode="w") as model_variants: model_variants.write("gene\tmodel\tvariant\n".encode()) with gzip.open(args.output_prefix + ".model_summary.txt.gz", mode="w") as model_summary: model_summary.write( "gene\tpes\tpes_se\tlog_nc\tlog10_nc\n".encode()) with gzip.open(args.output_prefix + ".variants_pip.txt.gz", mode="w") as variant_pip: variant_pip.write( "gene\trank\tvariant_id\tpip\tlog10_abf\tcluster_id\n". encode()) with gzip.open(args.output_prefix + ".clusters.txt.gz", mode="w") as clusters: clusters.write( "gene\tcluster\tn_snps\tpip\taverage_r2\n".encode( )) with gzip.open(args.output_prefix + ".cluster_correlations.txt.gz", mode="w") as cluster_correlations: cluster_correlations.write( "gene\tid1\tid2\tvalue\n".encode()) for i, t in enumerate(spec.itertuples()): logging.log(9, "Processing %s", t.name) written = set() with open(t.path) as dap: p, pse, lognc, log10nc = None, None, None, None for l in dap: s = model_re.search(l) if s: ml = parse_model_line(t.name, s) models.write(ml.encode()) vl = parse_model_line_for_variant( t.name, s) if vl: for vl_ in vl: model_variants.write( vl_.encode()) continue s = model_expected_size_re.search(l) if s: p, pse = parse_expected_size(s) continue s = lognc_re.search(l) if s: lognc, log10nc = parse_log_10_nc(s) model_summary.write( "{}\t{}\t{}\t{}\t{}\n".format( t.name, p, pse, lognc, log10nc).encode()) continue s = variant_re.search(l) if s: rank, id, pip, log10_abvf, cluster_id = parse_variant_line( s) variant_pip.write( "{}\t{}\t{}\t{}\t{}\t{}\n". format(t.name, rank, id, pip, log10_abvf, cluster_id).encode()) continue s = cluster_re.search(l) if s: id, n, pip, r2 = parse_cluster_line( s) clusters.write( "{}\t{}\t{}\t{}\t{}\n".format( t.name, id, n, pip, r2).encode()) _id1 = int(id) comps = s.group( "correlation").strip().split() for _id2 in range( 1, len(comps) + 1): if (_id1, _id2) in written or ( _id2, _id1) in written: continue comp = comps[_id2 - 1] cluster_correlations.write( "{}\t{}\t{}\t{}\n".format( t.name, _id1, _id2, comp).encode()) written.add((_id1, _id2)) logging.info("Finished")
def run(args):
    d = duplicated_entries(args.input_folder)
    Utilities.ensure_requisite_folders(args.output)
    Utilities.save_dataframe(d, args.output, sep=",", quoting=csv.QUOTE_NONE)
def run(args): if os.path.exists(args.output): logging.info("Output already exists, nope.") return Utilities.ensure_requisite_folders(args.output) Utilities.ensure_requisite_folders(args.discard) if args.liftover: logging.info("Acquiring liftover") l = pyliftover.LiftOver(args.liftover) else: logging.info("Will not perform lift over") l = None logging.info("Loading snp reference metadata") snp_reference_metadata = pandas.read_table(args.snp_reference_metadata) reference = {} for t in snp_reference_metadata.itertuples(): k = "chr{}_{}".format(t.chromosome, t.position) if k in reference: raise RuntimeError("coordinate is already present") reference[k] = (t.id, t.rsid) dbsnp_format = {x: i for i, x in enumerate(DBSnp.DBSNP._fields)} complement_translation = "CGTA".maketrans({"C": "G", "G": "C", "T":"A", "A": "T"}) logging.info("Processing db snp file") if args.discard: discard = gzip.open(args.discard, "w") discard.write(l_(["rsid", "chromosome", "position", "a0", "a1", "strand", "type", "panel_variant_id", "panel_variant_rsid", "panel_variant_a0", "panel_variant_a1", "swap", "strand_reversal"])) allele_re = re.compile("chr\d+_\d+_(.*)_(.*)_b38") with gzip.open(args.output, "w") as result: result.write(l_(["rsid", "chromosome", "position", "a0", "a1", "strand", "type", "panel_variant_id", "panel_variant_rsid", "panel_variant_a0", "panel_variant_a1", "swap", "strand_reversal"])) with gzip.open(args.db_snp_file) as db_snp: db_snp.readline() for i,line in enumerate(db_snp): comps = line.decode().strip().split("\t") obs_alleles = comps[9].split("/") if len(obs_alleles) < 2: continue chr = comps[1] start_0 = comps[2] _new_chromosome, _new_position = gwas_parsing._lift(l, chr, start_0) if l else (chr, int(start_0)) if _new_chromosome == "NA" or _new_position == "NA": continue k = "{}_{}".format(_new_chromosome, _new_position+1) if not k in reference: continue rsid = comps[4] strand = comps[6] ref_allele = comps[7] var_type = comps[11] alt_alleles_ = [x for x in obs_alleles if x != ref_allele] alt_alleles = set(alt_alleles_) panel_variant_id, panel_variant_rsid = reference[k] panel_variant_rsid = panel_variant_rsid if type(panel_variant_rsid) == str else "NA" panel_alleles = allele_re.search(panel_variant_id) panel_ref_allele = panel_alleles.group(1) panel_alt_allele = panel_alleles.group(2) strand_reversed_panel_ref_allele = panel_ref_allele.translate(complement_translation) strand_reversed_panel_alt_allele = panel_alt_allele.translate(complement_translation) # if args.reverse_swap: # strand_reversed_panel_ref_allele = strand_reversed_panel_ref_allele[::-1] # strand_reversed_panel_alt_allele = strand_reversed_panel_alt_allele[::-1] swap, strand_reversal, selected_ref_allele, selected_alt_allele = None, None, ref_allele, alt_alleles_[0] if len(panel_ref_allele) == 1 and len(panel_alt_allele) == 1: #snp if panel_ref_allele == ref_allele and panel_alt_allele in alt_alleles: swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, 1, panel_ref_allele, panel_alt_allele elif panel_ref_allele in alt_alleles and panel_alt_allele == ref_allele: swap, strand_reversal, selected_ref_allele, selected_alt_allele = -1, 1, panel_alt_allele, panel_ref_allele elif strand_reversed_panel_ref_allele == ref_allele and strand_reversed_panel_alt_allele in alt_alleles: swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, -1, strand_reversed_panel_ref_allele, strand_reversed_panel_alt_allele elif strand_reversed_panel_ref_allele in alt_alleles and strand_reversed_panel_alt_allele == ref_allele: 
swap, strand_reversal, selected_ref_allele, selected_alt_allele = -1, -1, strand_reversed_panel_alt_allele, strand_reversed_panel_ref_allele elif len(panel_ref_allele) > 1 and len(panel_alt_allele) == 1 and ref_allele != "-": #deletion deleted = panel_ref_allele[1:] strand_reversed_deleted = strand_reversed_panel_ref_allele[1:] # if args.reverse_swap: # strand_reversed_deleted = strand_reversed_panel_ref_allele[:-1] for si_, allele_ in enumerate(alt_alleles): if allele_ == deleted: swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, 1, allele_, "-" if allele_ == strand_reversed_deleted: swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, -1, allele_, "-" elif len(panel_ref_allele) == 1 and len(panel_alt_allele) > 1 and ref_allele == "-": inserted = panel_alt_allele[1:] strand_reversed_inserted = strand_reversed_panel_alt_allele[1:]#[:-1] # if args.reverse_swap: # strand_reversed_inserted = strand_reversed_panel_alt_allele[:-1] for si_, allele_ in enumerate(alt_alleles): if allele_ == inserted: swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, 1, "-", allele_ if allele_ == strand_reversed_inserted: swap, strand_reversal, selected_ref_allele, selected_alt_allele = 1, -1, "-", allele_ else: pass ol = l_([rsid, chr, str(int(start_0) + 1), selected_ref_allele, selected_alt_allele, strand, var_type, panel_variant_id, panel_variant_rsid, panel_ref_allele, panel_alt_allele, swap, strand_reversal]) if swap is not None and strand is not None and selected_ref_allele is not None and selected_alt_allele is not None: result.write(ol) else: discard.write(ol) discard.close() logging.info("Done")
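# A minimal sketch of the `l_` line builder used above. Assumption: it
# tab-joins the fields (rendering None as NA) and encodes to bytes for the
# gzip sinks; not confirmed by the source.
def l_(comps):
    return ("\t".join("NA" if x is None else str(x) for x in comps) + "\n").encode()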