def __enter__(self):
    """Acquire the resources needed by this context and return it.

    Loads the target regions (optionally restricted to one sub-batch),
    opens the variant metadata parquet file, prepares the output
    destination and builds the genotype file map.

    Returns:
        This object, with ``regions``, ``vmf``, ``file_map`` (and ``of``
        when a single text output is requested) populated.

    Raises:
        RuntimeError: if the text output already exists, if neither
            ``text_output`` nor ``text_output_folder`` is given, or if the
            genotype folder/pattern pair is incomplete.
    """
    # The configuration is read from the instance only. Previously all but
    # the first access went through a bare `args`, which resolves only if a
    # module-level variable of that name happens to exist.
    args = self.args

    logging.info("initializing resources")

    logging.info("Loading regions")
    regions = load_regions(args.region_file, args.chromosome)
    # `is not None` so that sub-batch index 0 is honoured.
    if args.sub_batches and args.sub_batch is not None:
        logging.log(9, "Selecting target regions from sub-batches")
        regions = PandasHelpers.sub_batch(regions, args.sub_batches,
                                          args.sub_batch)
    self.regions = regions

    logging.info("Opening variants metadata")
    self.vmf = pq.ParquetFile(args.parquet_genotype_metadata)

    logging.info("Creating destination")
    if args.text_output:
        # Refuse to overwrite a previous result.
        if os.path.exists(args.text_output):
            raise RuntimeError("Output exists. Nope.")
        Utilities.ensure_requisite_folders(args.text_output)
        self.of = TextFileTools.TextDataSink(
            args.text_output, [("region", "id1", "id2", "value")])
        self.of.initialize()
    elif args.text_output_folder:
        Utilities.maybe_create_folder(args.text_output_folder)
    else:
        raise RuntimeError("Unrecognized output specification")

    if args.parquet_genotype_folder and args.parquet_genotype_pattern:
        self.file_map = get_file_map(args)
    else:
        raise RuntimeError("Unrecognized genotype specification")

    return self
def run(args):
    """Collapse matching input sub-folders into one output folder per name.

    Every entry of ``args.input_folder`` whose name matches the regular
    expression ``args.rule`` (and is not listed in ``args.exclude``) is
    grouped by the first capture group of the match. The files of all
    folders in a group are then copied (or moved, when ``args.move`` is
    set) into ``<args.output_folder>/<group name>``.

    Unless ``args.reentrant`` is set, nothing is done when
    ``args.output_folder`` already exists.

    Args:
        args: namespace providing ``reentrant``, ``output_folder``,
            ``input_folder``, ``rule``, ``exclude`` and ``move``.
    """
    if not args.reentrant and os.path.exists(args.output_folder):
        logging.info("Output path exists. Nope.")
        return
    # NOTE(review): the original formatting was lost, so it is unclear
    # whether this call was nested under the non-reentrant check. It is
    # called unconditionally here, which assumes maybe_create_folder is a
    # no-op on an existing folder -- confirm.
    Utilities.maybe_create_folder(args.output_folder)

    logging.info("Checking input folder")
    rule = re.compile(args.rule)
    # Built once instead of once per candidate folder.
    excluded = set(args.exclude) if args.exclude else set()

    # Group name -> input folder paths, in sorted folder order.
    names = {}
    for folder in sorted(os.listdir(args.input_folder)):
        match = rule.search(folder)
        if not match or folder in excluded:
            continue
        names.setdefault(match.group(1), []).append(
            os.path.join(args.input_folder, folder))

    transfer = shutil.move if args.move else shutil.copy
    for name in sorted(names):
        logging.info("Processing %s", name)
        output_folder = os.path.join(args.output_folder, name)
        Utilities.maybe_create_folder(output_folder)
        for input_folder in names[name]:
            logging.log(8, "Processing %s", input_folder)
            for file_name in os.listdir(input_folder):
                transfer(os.path.join(input_folder, file_name),
                         os.path.join(output_folder, file_name))

    logging.info("Finished collapse")
def run(args):
    """Prepare and (optionally) run CTIMP model training, gene by gene.

    Steps:
      1. Open one parquet data file per tissue found in
         ``args.data_folder`` (tissue name is capture group 1 of
         ``args.data_name_pattern``).
      2. Load the gene annotation, keep genes present in the data, and
         optionally restrict by chromosome / sub-batch.
      3. Load the features (variant) annotation, optionally trimmed to the
         selected genes and to an rsid whitelist.
      4. For each gene: write the training inputs via ``prepare_ctimp``;
         unless ``args.skip_regression`` is set, run the generated bash
         script, read its weights/summary tables and append per-tissue
         weights, summaries and covariances to the outputs.
      5. Close the outputs through ``set_down``, flagging whether any
         exception interrupted the loop.

    Args:
        args: namespace providing ``intermediate_folder``,
            ``output_prefix``, ``data_name_pattern``, ``data_folder``,
            ``skip_regression``, ``data_annotation``, ``chromosome``,
            ``sub_batches``, ``sub_batch``, ``features_annotation``,
            ``window``, ``rsid_whitelist``, ``features``, ``run_tag``,
            ``script_path``, ``output_rsids``,
            ``keep_intermediate_folder`` and ``MAX_M``.
    """
    Utilities.maybe_create_folder(args.intermediate_folder)
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Opening data")
    name_pattern = re.compile(args.data_name_pattern)
    tissue_names = []
    data = collections.OrderedDict()
    for file_name in sorted(os.listdir(args.data_folder)):
        match = name_pattern.search(file_name)
        if not match:
            continue
        tissue_names.append(match.group(1))
    # Logging/opening is kept in a second pass so that all names are
    # extracted (and any pattern error raised) before files are opened.
    matching_files = [
        x for x in sorted(os.listdir(args.data_folder))
        if name_pattern.search(x)
    ]
    for tissue_name, file_name in zip(tissue_names, matching_files):
        logging.info("Loading %s", tissue_name)
        data[tissue_name] = pq.ParquetFile(
            os.path.join(args.data_folder, file_name))
    # Column names across all tissue files; used to keep annotated genes
    # that actually have data.
    available_data = {
        x
        for p in data.values() for x in p.metadata.schema.names
    }

    logging.info("Preparing output")
    WEIGHTS_FIELDS = [
        "gene", "rsid", "varID", "ref_allele", "eff_allele", "weight"
    ]
    SUMMARY_FIELDS = [
        "gene", "genename", "gene_type", "alpha", "n_snps_in_window",
        "n.snps.in.model", "rho_avg", "pred.perf.R2", "pred.perf.pval"
    ]

    if args.skip_regression:
        weights, summaries, covariances = None, None, None
    else:
        weights, summaries, covariances = setup_output(
            args.output_prefix, tissue_names, WEIGHTS_FIELDS, SUMMARY_FIELDS)

    logging.info("Loading data annotation")
    data_annotation = StudyUtilities._load_gene_annotation(
        args.data_annotation)
    data_annotation = data_annotation[data_annotation.gene_id.isin(
        available_data)]
    # `is not None` rather than truthiness so that sub-batch index 0 is
    # not silently ignored when no chromosome is given.
    if args.chromosome or (args.sub_batches
                           and args.sub_batch is not None):
        data_annotation = StudyUtilities._filter_gene_annotation(
            data_annotation, args.chromosome, args.sub_batches,
            args.sub_batch)
    logging.info("Kept %i entries", data_annotation.shape[0])

    logging.info("Opening features annotation")
    if not args.chromosome:
        features_metadata = pq.read_table(
            args.features_annotation).to_pandas()
    else:
        # Reads only row group (chromosome - 1) of the annotation file.
        features_metadata = pq.ParquetFile(
            args.features_annotation).read_row_group(args.chromosome -
                                                     1).to_pandas()

    if args.chromosome and args.sub_batches:
        logging.info("Trimming variants")
        features_metadata = \
            StudyUtilities.trim_variant_metadata_on_gene_annotation(
                features_metadata, data_annotation, args.window)

    if args.rsid_whitelist:
        logging.info("Filtering features annotation")
        whitelist = set(TextFileTools.load_list(args.rsid_whitelist))
        features_metadata = features_metadata[features_metadata.rsid.isin(
            whitelist)]

    logging.info("Opening features")
    features = pq.ParquetFile(args.features)

    logging.info("Setting R seed")
    # randint expects an integer bound; 1e8 is a float literal.
    seed = numpy.random.randint(int(1e8))

    if args.run_tag:
        # Record the seed used for this run, one file per tissue.
        d = pandas.DataFrame({
            "run": [args.run_tag],
            "cv_seed": [seed]
        })[["run", "cv_seed"]]
        for t in tissue_names:
            Utilities.save_dataframe(
                d, "{}_{}_runs.txt.gz".format(args.output_prefix, t))

    failed_run = False
    try:
        for i, data_annotation_ in enumerate(data_annotation.itertuples()):
            gene_id = data_annotation_.gene_id
            logging.log(9, "processing %i/%i:%s", i + 1,
                        data_annotation.shape[0], gene_id)

            logging.log(8, "loading data")
            d_ = {}
            for k, v in data.items():
                d_[k] = Parquet._read(v, [gene_id], to_pandas=True)

            features_ = Genomics.entries_for_gene_annotation(
                data_annotation_, args.window, features_metadata)
            if features_.shape[0] == 0:
                logging.log(9, "No features available")
                continue

            feature_ids = list(features_.id.values)
            features_data_ = Parquet._read(features,
                                           feature_ids,
                                           to_pandas=True)
            features_data_["id"] = range(1, features_data_.shape[0] + 1)
            features_data_ = features_data_[["individual", "id"] +
                                            feature_ids]

            logging.log(8, "training")
            prepare_ctimp(args.script_path, seed, args.intermediate_folder,
                          data_annotation_, features_, features_data_, d_)
            # Release the per-gene frames before the external run.
            del features_data_
            del d_

            if args.skip_regression:
                continue

            subprocess.call([
                "bash",
                _execution_script(args.intermediate_folder, gene_id)
            ])

            # Raw strings: "\s" is not a valid escape in a normal literal.
            w = pandas.read_table(_weights(args.intermediate_folder,
                                           gene_id),
                                  sep=r"\s+")
            s = pandas.read_table(_summary(args.intermediate_folder,
                                           gene_id),
                                  sep=r"\s+")

            for entry in s.itertuples():
                # --- weights for this tissue ---
                entry_weights = w[["SNP", "REF.0.", "ALT.1.",
                                   entry.tissue]].rename(
                                       columns={
                                           "SNP": "varID",
                                           "REF.0.": "ref_allele",
                                           "ALT.1.": "eff_allele",
                                           entry.tissue: "weight"
                                       })
                entry_weights = entry_weights[entry_weights.weight != 0]
                entry_weights = entry_weights.assign(gene=gene_id)
                entry_weights = entry_weights.merge(features_,
                                                    left_on="varID",
                                                    right_on="id",
                                                    how="left")
                entry_weights = entry_weights[WEIGHTS_FIELDS]
                if args.output_rsids:
                    # Fall back to the variant id where rsid is "NA".
                    missing_rsid = entry_weights.rsid == "NA"
                    entry_weights.loc[missing_rsid,
                                      "rsid"] = entry_weights.loc[
                                          missing_rsid, "varID"]
                weights[entry.tissue].write(
                    entry_weights.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())

                # --- summary for this tissue ---
                entry_summary = s[s.tissue == entry.tissue].rename(
                    columns={
                        "zscore_pval": "pred.perf.pval",
                        "rho_avg_squared": "pred.perf.R2"
                    })
                entry_summary = entry_summary.assign(
                    gene=gene_id,
                    alpha=0.5,
                    genename=data_annotation_.gene_name,
                    gene_type=data_annotation_.gene_type,
                    n_snps_in_window=features_.shape[0])
                entry_summary["n.snps.in.model"] = entry_weights.shape[0]
                # Must repeat the strings (rather than reuse
                # SUMMARY_FIELDS) because of a weird pandas indexing issue.
                entry_summary = entry_summary.drop(
                    ["R2", "n", "tissue"], axis=1)[[
                        "gene", "genename", "gene_type", "alpha",
                        "n_snps_in_window", "n.snps.in.model", "rho_avg",
                        "pred.perf.R2", "pred.perf.pval"
                    ]]
                summaries[entry.tissue].write(
                    entry_summary.to_csv(sep="\t",
                                         index=False,
                                         header=False,
                                         na_rep="NA").encode())

                # --- covariance of the variants kept in the model ---
                var_ids = list(entry_weights.varID.values)
                features_data_ = Parquet._read(features,
                                               var_ids,
                                               to_pandas=True)
                cov = numpy.cov([features_data_[k] for k in var_ids],
                                ddof=1)
                ids = list(entry_weights.rsid.values
                           ) if args.output_rsids else var_ids
                cov = matrices._flatten_matrix_data([(gene_id, ids, cov)])
                for cov_ in cov:
                    line = "{} {} {} {}\n".format(cov_[0], cov_[1], cov_[2],
                                                  cov_[3]).encode()
                    covariances[entry.tissue].write(line)

            if not args.keep_intermediate_folder:
                logging.info("Cleaning up")
                shutil.rmtree(
                    _intermediate_folder(args.intermediate_folder, gene_id))

            if args.MAX_M and i >= args.MAX_M:
                logging.info("Early abort")
                break
    except Exception:
        # Top-level boundary: record the failure so set_down can react,
        # instead of aborting with outputs left open.
        logging.info("Exception running model training:\n%s",
                     traceback.format_exc())
        failed_run = True
    # The top-level intermediate folder itself is not removed here; only
    # the per-gene folders are cleaned up inside the loop.

    if not args.skip_regression:
        set_down(weights, summaries, covariances, tissue_names, failed_run)

    logging.info("Finished")