def run(args):
    """Collect genotype variant metadata, optionally filter it, and save it.

    Reads from ``args``:
      * ``output``: destination path; if it already exists nothing is done.
      * ``filter``: iterable of sequences; the first item of each is the
        filter name, the remaining items are its arguments. Recognised
        names here are ``"MAF"`` (first argument parsed as a float
        threshold) and ``"TOP_CHR_POS_BY_FREQ"``.
      * ``annotation``: its first element is passed to
        ``GTExMisc.load_gtex_variant_to_rsid``.
      * ``genotype``: passed to ``ModelTraining.dosage_generator``.

    Returns:
        None. The result is written through ``Utilities.save_dataframe``.
    """
    # Guard clause: never overwrite an existing output.
    if os.path.exists(args.output):
        logging.info("Output exists. Nope")
        return

    # Map each filter name to the list of arguments that followed it.
    filters = {}
    for spec in args.filter:
        filters[spec[0]] = spec[1:]

    maf_filter = None
    if "MAF" in filters:
        maf_filter = float(filters["MAF"][0])

    logging.info("Loading GTEX variant map")
    gtex_snp_key = GTExMisc.load_gtex_variant_to_rsid(args.annotation[0])

    logging.info("Processing genotype")
    variants = ModelTraining.dosage_generator(
        args.genotype,
        gtex_snp_key,
        dosage_conversion=ModelTraining._mean,
        do_none=True,
    )
    kept = []
    for mean, metadata, _ in variants:
        if maf_filter:
            # Fold the value onto the minor side before comparing.
            # NOTE(review): presumably `mean` is a mean dosage in [0, 2],
            # making this a minor allele frequency -- confirm upstream.
            if mean < 1:
                frequency = mean / 2
            else:
                frequency = 1 - mean / 2
            if frequency < maf_filter:
                continue
        kept.append(metadata)

    column_names = [entry[1] for entry in Genotype.MetadataTFE.order]
    table = Utilities.to_dataframe(kept, column_names)

    if "TOP_CHR_POS_BY_FREQ" in filters:
        logging.info("Simplifying multi-allelic variants")
        table = Genotype._monoallelic_by_frequency(table)

    logging.info("Saving...")
    Utilities.save_dataframe(table, args.output)
    logging.info("Finished")
def _process(d, key_to_snp, how="left"):
    """Attach rsids to an association table and normalise its columns.

    Args:
        d: table of associations (used as a pandas DataFrame: ``merge``,
            ``rename`` and ``assign`` are called on it). It must have a
            ``variant_id`` column and, after renaming, every column named in
            ``GTEx.GTExAllAssociations._fields``.
        key_to_snp: mapping from variant id to rsid; ids missing from it get
            no entry in the lookup table.
        how: merge strategy forwarded to ``d.merge`` (default ``"left"``).

    Returns:
        The table restricted to ``GTEx.GTExAllAssociations._fields`` (in
        that order), with ``maf``, ``beta`` and ``se`` cast to float32.
    """
    # Build the variant -> rsid lookup with ONE row per distinct variant id.
    # Iterating d.variant_id directly would repeat a variant once per row it
    # appears in, and merging on a key duplicated on both sides multiplies
    # rows (n occurrences -> n*n rows). dict.fromkeys de-duplicates while
    # keeping first-seen order.
    pairs = [
        (variant, key_to_snp[variant])
        for variant in dict.fromkeys(d.variant_id)
        if variant in key_to_snp
    ]
    rsids = Utilities.to_dataframe(pairs, ["variant_id", "rsid"])
    d = d.merge(rsids, on="variant_id", how=how)

    # Rename the input columns to the names used downstream.
    d = d.rename(
        columns={
            "gene_id": "gene",
            "pval_nominal": "pvalue",
            "slope": "beta",
            "slope_se": "se",
        }
    )

    # Keep only the expected fields, in their declared order.
    d = d[list(GTEx.GTExAllAssociations._fields)]

    # Store the numeric columns as single precision.
    d = d.assign(
        maf=d.maf.astype(numpy.float32),
        beta=d.beta.astype(numpy.float32),
        se=d.se.astype(numpy.float32),
    )
    return d
def run(args):
    """Count variants for every region listed in a file and save the counts.

    Reads from ``args``:
      * ``parquet_genotype_metadata``: path opened with ``pq.ParquetFile``.
      * ``regions``: file iterated with ``Utilities.iterate_file``; its
        first line is skipped, and the first three whitespace-separated
        fields of each later line are used as chromosome, start and end.
      * ``output``: destination passed to ``Utilities.save_dataframe``.

    ``args`` itself is also forwarded to ``count_variants``.

    Returns:
        None. One row per region is saved, with the columns
        ``chromosome``, ``start``, ``end`` and ``count``.
    """
    logging.info("Starting process")
    metadata_file = pq.ParquetFile(args.parquet_genotype_metadata)

    # State threaded through successive count_variants calls; it hands back
    # updated values that are passed in again for the next region.
    cached_metadata = None
    last_chromosome = None

    rows = []
    for index, line in Utilities.iterate_file(args.regions):
        if index == 0:
            # First line is skipped (presumably a header -- confirm).
            continue

        fields = line.strip().split()
        chromosome = fields[0]
        start = fields[1]
        end = fields[2]

        count, cached_metadata, last_chromosome = count_variants(
            chromosome,
            start,
            end,
            metadata_file,
            cached_metadata,
            last_chromosome,
            args,
        )
        rows.append((chromosome, start, end, count))

    result = Utilities.to_dataframe(rows, ["chromosome", "start", "end", "count"])
    Utilities.save_dataframe(result, args.output)
    logging.info("Finished process")