def get_variant_key(args):
    """Load the variant lookup described by ``args.snp_annotation_file``.

    ``args.snp_annotation_file`` is a sequence with one or two entries:

    * one entry: a path, loaded with key column ``variant_id`` and value
      column ``rs_id_dbSNP150_GRCh38p7``, keeping only keys accepted by
      ``GenotypeUtilities.is_biallelic_variant``;
    * two entries whose second is the literal ``"METADATA"``: a path, loaded
      with key column ``id`` and value column ``rsid``.

    In both cases a value of ``"."`` is converted to ``numpy.nan``.

    Returns:
        Whatever ``KeyedDataSource.load_data`` returns for the chosen layout.

    Raises:
        RuntimeError: if the argument matches neither layout, or if the
            loaded result is falsy (e.g. empty).
    """

    def _dot_to_nan(value):
        # "." marks a missing value in the annotation
        return numpy.nan if value == "." else value

    annotation = args.snp_annotation_file
    n_entries = len(annotation)

    if n_entries == 1:
        loaded = KeyedDataSource.load_data(
            annotation[0],
            "variant_id",
            "rs_id_dbSNP150_GRCh38p7",
            value_conversion=_dot_to_nan,
            key_filter=GenotypeUtilities.is_biallelic_variant,
        )
    elif n_entries == 2 and annotation[1] == "METADATA":
        loaded = KeyedDataSource.load_data(
            annotation[0], "id", "rsid", value_conversion=_dot_to_nan
        )
    else:
        loaded = None

    if not loaded:
        raise RuntimeError("Need right info to process snp metadata")
    return loaded
def run(args):
    """Save genotypes as parquet files, then process each expression phenotype.

    Steps, in order:

    1. make sure the folders needed for ``args.output_prefix`` exist;
    2. load the SNP annotation (key ``varID``, value ``rsid_dbSNP150``,
       skipping entries per ``KeyedDataSource.skip_na``);
    3. load the genotype from ``args.input_genotype_folder`` using
       ``args.input_genotype_file_pattern`` and the annotation;
    4. write ``<prefix>.variants.parquet`` and
       ``<prefix>.variants_metadata.parquet``;
    5. call ``process_phenotype`` for every file listed by
       ``Utilities.file_logic`` for the phenotype folder and pattern.

    The elapsed time is logged at the end.
    """
    t0 = timer()
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Loading SNP annotation")
    variant_to_rsid = KeyedDataSource.load_data(
        args.snp_annotation_file,
        "varID",
        "rsid_dbSNP150",
        should_skip=KeyedDataSource.skip_na,
    )

    logging.info("Loading Genotype")
    genotype, individual_ids = ModelTraining.load_genotype_folder(
        args.input_genotype_folder,
        args.input_genotype_file_pattern,
        variant_to_rsid,
    )

    logging.info("Saving Genotype")
    Parquet.save_variants(
        args.output_prefix + ".variants.parquet", genotype, individual_ids
    )
    Parquet.save_metadata(
        args.output_prefix + ".variants_metadata.parquet", genotype
    )

    logging.info("Processing Expression Phenotype")
    phenotype_files = Utilities.file_logic(
        args.input_phenotype_folder, args.input_phenotype_expression_pattern
    )
    for entry in phenotype_files.itertuples():
        logging.info("Phenotype: %s", entry.name)
        process_phenotype(entry.path, entry.name, args.output_prefix)

    elapsed = timer() - t0
    logging.info("Finished in %s", str(elapsed))
def run(args):
    """Convert a gzipped dosage file into genotype and annotation tables.

    Reads ``args.genotype`` line by line. Each line is split on whitespace;
    fields 0, 2, 3, 4 and 5 are read as chromosome, position, reference
    allele, alternate allele and allele frequency, and fields 6 onward as
    per-sample dosages. A variant id ``chr{chrom}_{pos}_{ref}_{alt}_b38`` is
    looked up in the SNP reference loaded from ``args.snp_reference_file``:

    * if found, the line is written as-is under that id;
    * otherwise, for single-character alleles only, the swapped id
      (``alt``/``ref`` exchanged) is looked up; if found, the frequency is
      replaced by ``1 - af`` and every dosage by ``2 - dosage`` before
      writing;
    * otherwise the line is dropped.

    Output goes to ``<args.output_prefix>_genotype.txt.gz`` and
    ``<args.output_prefix>_annotation.txt.gz``. If either already exists the
    function logs a message and returns without doing anything.
    """
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Loading snp reference")
    key = KeyedDataSource.load_data(
        args.snp_reference_file,
        "variant_id",
        "rs_id_dbSNP150_GRCh38p7",
        value_conversion=KeyedDataSource.dot_to_na,
    )

    logging.info("Loading samples")
    samples = TextFileTools.load_list(args.samples)
    # one slot for the variant id plus one per sample
    genotype_format_string = "\t".join(["{}"] * (len(samples) + 1)) + "\n"

    og = args.output_prefix + "_genotype.txt.gz"
    oa = args.output_prefix + "_annotation.txt.gz"
    if os.path.exists(og) or os.path.exists(oa):
        logging.info("Output exists. Nope.")
        return

    logging.info("Processing")
    with gzip.open(args.genotype) as geno, \
            gzip.open(og, "w") as _og, \
            gzip.open(oa, "w") as _oa:
        _og.write(_to_gl(["varID"] + samples, genotype_format_string))
        _oa.write(
            _to_al([
                "chromosome", "position", "id", "allele_0", "allele_1",
                "allele_1_frequency", "rsid"
            ]))

        for line in geno:
            comps = line.decode().strip().split()

            chrom = "chr" + comps[0]
            pos = comps[2]
            ref = comps[3]
            alt = comps[4]
            af = comps[5]
            dosage = comps[6:]

            var_id = "{}_{}_{}_{}_b38".format(chrom, pos, ref, alt)
            if var_id in key:
                rsid = key[var_id]
                _og.write(_to_gl([var_id] + dosage, genotype_format_string))
                _oa.write(_to_al([chrom, pos, var_id, ref, alt, af, rsid]))
                # Must skip the swapped-allele check: the original used the
                # bare name `next` here, which is a no-op, so a variant could
                # be emitted a second time under its flipped id.
                continue

            # Allele flipping is only attempted for single-character alleles.
            if len(ref) != 1 or len(alt) != 1:
                continue

            var_id = "{}_{}_{}_{}_b38".format(chrom, pos, alt, ref)
            if var_id in key:
                rsid = key[var_id]
                # Re-express frequency and dosages relative to the swapped
                # effect allele.
                af = str(1 - float(af))
                dosage = [str(2 - int(x)) for x in dosage]
                _og.write(_to_gl([var_id] + dosage, genotype_format_string))
                _oa.write(_to_al([chrom, pos, var_id, alt, ref, af, rsid]))

    logging.info("Finished conversion")
def run(args):
    """Build a sqlite model database from a streamed gene/variant weight table.

    The input (``args.input``) is streamed one ``gene_id`` group at a time
    with columns ``tissue_name, gene_id, variant_id, weight, beta, se``. For
    each gene:

    * rows with ``weight == 0`` are dropped;
    * if ``args.snp_zscore_threshold`` is truthy, rows with
      ``|beta / se|`` not above the threshold are dropped;
    * genes left with no rows are skipped;
    * remaining rows are appended to the ``weights`` table with columns
      ``gene, rsid, varID, ref_allele, eff_allele, weight``. Alleles are
      taken from the ``_``-separated variant id (field 2 is the reference
      allele, field 3 the effect allele); a ``"."`` rsid is replaced by the
      variant id.

    One row per kept gene is written to the ``extra`` table, with the
    performance columns set to NaN. Indices are then created via
    ``Models.model_indexes``.

    If ``args.output`` already exists, a message is logged and nothing is
    done.

    Raises:
        KeyError: if a gene or variant is absent from the loaded annotations
            (plain ``[]`` lookups are used).
    """
    if os.path.exists(args.output):
        logging.info("Output already exists, either delete it or move it")
        return

    logging.info("Loading snp names")
    snps = KeyedDataSource.load_data(
        args.snp_annotation, "variant_id", args.rsid_column)

    logging.info("Loading gene annotation")
    genes, gene_types = _gene_annotation(args.gene_annotation)

    # sqlite3's connection context manager only commits or rolls back; it
    # does not close the connection, so close it explicitly.
    conn = sqlite3.connect(args.output)
    try:
        with conn:
            logging.info("Processing")
            streamer = DataFrameStreamer.data_frame_streamer(
                args.input,
                header=["tissue_name", "gene_id", "variant_id", "weight",
                        "beta", "se"],
                to_numeric=["weight", "beta", "se"],
                sentinel_column="gene_id")

            extra = []
            for i, d in enumerate(streamer):
                g_ = d.gene_id.values[0]
                logging.log(9, "processing %i:%s", i + 1, g_)

                d = d.loc[d.weight != 0]
                if args.snp_zscore_threshold:
                    d = d.assign(zscore=numpy.abs(d.beta / d.se))
                    d = d.loc[d.zscore > args.snp_zscore_threshold]

                if d.shape[0] == 0:
                    logging.log(9, "no good snps left")
                    continue

                extra.append((g_, genes[g_], gene_types[g_], d.shape[0],
                              numpy.nan, numpy.nan, numpy.nan))

                d = d[["gene_id", "variant_id", "weight"]].rename(
                    columns={"gene_id": "gene", "variant_id": "varID"})

                effect, non_effect, rsid = [], [], []
                for t in d.itertuples():
                    fields = t.varID.split("_")
                    effect.append(fields[3])
                    non_effect.append(fields[2])
                    name = snps[t.varID]
                    # fall back to the variant id when no rsid is known
                    rsid.append(name if name != "." else t.varID)

                d = d.assign(ref_allele=non_effect, eff_allele=effect,
                             rsid=rsid)
                d = d[["gene", "rsid", "varID", "ref_allele", "eff_allele",
                       "weight"]]
                d.to_sql("weights", conn, index=False, if_exists="append")

            extra = pandas.DataFrame(
                extra,
                columns=["gene", "genename", "gene_type", "n.snps.in.model",
                         "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"])
            extra.to_sql("extra", conn, index=False)

            logging.info("Creating indices")
            Models.model_indexes(conn)
    finally:
        conn.close()

    logging.info("Finished building model.")
def run(args):
    """Build a model database from a table of gene/variant weights.

    Inputs read from ``args``:

    * ``output``: path of the database to create; if it already exists a
      message is logged and the function returns.
    * ``variant_annotation`` / ``rsid_column``: lookup from ``variant_id`` to
      the chosen rsid column.
    * ``data_annotation``: ``[path]`` or ``[path, feature_type]``. Rows of
      the table at ``path`` are kept when ``feature_type`` equals the given
      value (``"gene"`` when omitted).
    * ``model_input``: table with columns ``gene_id, gene_name, variant,
      weight``.
    * ``model_filter``: optional; when its second item is ``"PIP"``, the
      input is restricted to the gene/variant pairs returned by
      ``Miscellaneous.dapg_signals(model_filter[0], float(model_filter[2]),
      variants)``.

    The ``weights`` table gets columns ``gene, rsid, varID, ref_allele,
    eff_allele, weight``; the ``extra`` table gets one row per gene with its
    variant count and empty performance columns.

    Raises:
        RuntimeError: if ``args.data_annotation`` has neither 1 nor 2 items.
        KeyError: if a variant in the input is missing from the variant
            annotation (plain ``[]`` lookup is used).
    """
    if os.path.exists(args.output):
        logging.info("Output exists. Nope.")
        return

    Utilities.ensure_requisite_folders(args.output)

    logging.info("Loading variant annotation")
    variants = KeyedDataSource.load_data(
        args.variant_annotation, "variant_id", args.rsid_column)

    logging.info("Loading data annotation")
    if len(args.data_annotation) == 1:
        feature_type = "gene"
    elif len(args.data_annotation) == 2:
        feature_type = args.data_annotation[1]
    else:
        raise RuntimeError("Unsupported annotation length")
    data_annotation = pandas.read_table(args.data_annotation[0])
    data_annotation = data_annotation[
        ["gene_id", "gene_name", "feature_type", "gene_type"]
    ][data_annotation.feature_type == feature_type].drop_duplicates()

    logging.info("Loading model_input")
    data = pandas.read_table(
        args.model_input,
        usecols=["gene_id", "gene_name", "variant", "weight"])

    logging.info("Processing")
    if args.model_filter and args.model_filter[1] == "PIP":
        w = Miscellaneous.dapg_signals(
            args.model_filter[0], float(args.model_filter[2]), variants)
        w = w.rename(columns={"gene": "gene_id", "variant_id": "variant"})
        data = data.merge(w[["gene_id", "variant"]], on=["gene_id", "variant"])

    v = pandas.DataFrame(
        [(k, variants[k]) for k in data.variant.drop_duplicates()],
        columns=["variant", "rsid"])
    # fall back to the variant id when no rsid is known
    missing_rsid = v.rsid == "."
    v.loc[missing_rsid, "rsid"] = v.loc[missing_rsid, "variant"]

    weights = data.merge(v, on="variant")
    # A callable replacement requires regex mode. pandas 2.x defaults
    # `regex` to False and then rejects the callable, so request it
    # explicitly.
    variant_pattern = r"(.*)_(.*)_(.*)_(.*)_b38"
    weights = weights.assign(
        ref_allele=weights.variant.str.replace(
            variant_pattern, lambda m: m.group(3), regex=True),
        eff_allele=weights.variant.str.replace(
            variant_pattern, lambda m: m.group(4), regex=True))
    weights = weights.rename(
        columns={"variant": "varID", "gene_id": "gene"}
    )[["gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"]]

    extra = (data.groupby("gene_id").size()
             .to_frame("n.snps.in.model").reset_index())
    extra = extra.merge(
        data_annotation[["gene_id", "gene_name", "gene_type"]], on="gene_id")
    extra["pred.perf.pval"] = None
    extra["pred.perf.qval"] = None
    extra["pred.perf.R2"] = None
    extra = extra[
        ["gene_id", "gene_name", "gene_type", "n.snps.in.model",
         "pred.perf.R2", "pred.perf.pval", "pred.perf.qval"]
    ].rename(columns={"gene_id": "gene", "gene_name": "genename"})

    logging.info("Saving db")
    Models.create_model_db(args.output, extra, weights)

    logging.info("Done")
def run(args):
    """Add a ``gene_name`` column to a table by looking up its ``gene`` column.

    The input is read with ``pandas.read_csv`` when ``".csv"`` occurs in
    ``args.input`` and with ``pandas.read_table`` otherwise. The output is
    comma-separated when ``".csv"`` occurs in ``args.output`` and
    tab-separated otherwise. Names come from ``args.gene_table`` (key
    ``gene_id``, value ``gene_name``).

    Raises:
        KeyError: if a gene in the input is missing from the gene table
            (plain ``[]`` lookup is used).
    """
    if ".csv" in args.input:
        reader = pandas.read_csv
    else:
        reader = pandas.read_table

    if ".csv" in args.output:
        separator = ","
    else:
        separator = "\t"

    logging.info("Loading gene table")
    id_to_name = KeyedDataSource.load_data(
        args.gene_table, "gene_id", "gene_name")

    logging.info("Loading input")
    table = reader(args.input)
    table["gene_name"] = [id_to_name[row.gene] for row in table.itertuples()]

    logging.info("saving")
    Utilities.save_dataframe(table, args.output, sep=separator)

    logging.info("Done")