@classmethod
def load(cls, file_path):
    """Load the dataset from an hdf5 file"""
    with HDF5Reader(file_path) as obj:
        data = obj.load_all()
        attrs = OrderedDict(obj.f.attrs)
        return cls(data, attrs)
def test_HDF5BatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    batch = prepare_batch(dl_batch, pred_batch_array)

    # write the same batch twice so the file contains two concatenated copies
    writer = HDF5BatchWriter(tmpfile, chunk_size=4)
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()

    with HDF5Reader(tmpfile) as f:
        # batch_iter yields the stored arrays in chunks of the requested size
        assert np.all(list(f.batch_iter(2))[0]['metadata']['gene_id'] ==
                      dl_batch['metadata']['gene_id'][:2])
        # load_all returns the full (doubled) content as a nested dict
        out = f.load_all()
        assert np.all(out['metadata']['gene_id'] ==
                      np.concatenate([dl_batch['metadata']['gene_id'],
                                      dl_batch['metadata']['gene_id']]))
        assert np.all(out['metadata']['ranges']["chr"] ==
                      np.concatenate([dl_batch['metadata']['ranges']['chr'],
                                      dl_batch['metadata']['ranges']['chr']]))
        assert np.all(out['metadata']['ranges']["start"] ==
                      np.concatenate([dl_batch['metadata']['ranges']['start'],
                                      dl_batch['metadata']['ranges']['start']]))
        assert np.all(out['preds'][:3] == pred_batch_array)
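# A minimal, self-contained sketch of the writer/reader round trip that the test
# above exercises, outside of pytest. The nested-dict batch layout and the
# batch_write()/close()/load_all() calls are taken from that test; the concrete
# array contents and the /tmp output path are made-up placeholders.
import numpy as np

batch = {
    "preds": np.random.rand(3, 10),
    "metadata": {"gene_id": np.array([b"g1", b"g2", b"g3"])},
}

writer = HDF5BatchWriter("/tmp/example_out.h5", chunk_size=4)
writer.batch_write(batch)
writer.close()

with HDF5Reader("/tmp/example_out.h5") as r:
    out = r.load_all()  # nested dict mirroring what was written
    assert out["preds"].shape == (3, 10)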
def __init__(self, fpath):
    self.fpath = fpath
    self.f = HDF5Reader(self.fpath)
    self.f.open()
    # example ranges; loaded lazily when needed
    self.ranges = None
def get_eval_predictions(tf, model, filter_dnase=False):
    """Get the true labels and model predictions for a TF/model pair"""
    with HDF5Reader(os.path.join(eval_dir, tf, model + ".h5")) as r:
        y_pred = r.f['/preds'][:]

    labels_bed_file = os.path.join(root_dir, get_dl_kwargs(tf)['intervals_file'])
    df_unfiltered = pd.read_csv(labels_bed_file, sep="\t", header=None)
    df_unfiltered.columns = ['chr', 'start', 'end', 'y_true']

    if filter_dnase:
        # Keep only the label regions that overlap a relaxed DNase peak by >= 50%
        dnase_peaks = '{ddir}/raw/tfbinding/eval/tf-DREAM/DNASE.{ctype}.relaxed.narrowPeak.gz'.format(
            ddir=ddir, ctype=TF2CT[tf])
        filtered_bed = BedTool(labels_bed_file).intersect(BedTool(dnase_peaks),
                                                          u=True, wa=True, f=.5)
        df_filtered = pd.read_csv(filtered_bed.fn, sep="\t", header=None)
        df_filtered.columns = ['chr', 'start', 'end', 'y_true']
        df_filtered['filtered'] = True
        # regions without a DNase overlap get NaN after the left merge and hence False
        keep = df_unfiltered.merge(df_filtered, how='left',
                                   on=list(df_unfiltered.columns)).filtered == True
        return df_unfiltered.y_true.values[keep], y_pred[keep]
    else:
        return df_unfiltered.y_true.values, y_pred[:]
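# Hedged usage sketch: the (y_true, y_pred) pair returned above is shaped for
# standard binary-classification metrics. Feeding it into sklearn, and the
# "CTCF" / "my-model" names below, are illustrative assumptions; the function
# itself also relies on module-level globals (eval_dir, root_dir, ddir, TF2CT,
# get_dl_kwargs) defined elsewhere in this script.
from sklearn.metrics import average_precision_score, roc_auc_score

y_true, y_pred = get_eval_predictions("CTCF", "my-model", filter_dnase=True)
print("auPRC:", average_precision_score(y_true, y_pred))
print("auROC:", roc_auc_score(y_true, y_pred))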
def __init__(self, file_path, include_samples=None, default_imp_score='weighted'):
    self.file_path = file_path
    self.f = HDF5Reader(self.file_path)
    self.f.open()
    # use the hdf5 file handle directly
    self.data = self.f.f
    self.include_samples = include_samples
    self._hyp_contrib_cache = dict()
    self.default_imp_score = default_imp_score
def modisco_instances_to_bed(modisco_h5, instances_parq, imp_score_h5,
                             output_dir, trim_frac=0.08):
    """Export modisco pattern instances as BED files: a single
    scored_regions.bed listing all scored example regions plus one gzipped
    BED file per pattern."""
    from basepair.modisco.pattern_instances import load_instances
    add_file_logging(output_dir, logger, 'modisco-instances-to-bed')

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    mr = ModiscoResult(modisco_h5)
    mr.open()

    print("load task_id")
    d = HDF5Reader(imp_score_h5)
    d.open()
    if 'hyp_imp' not in d.f.keys():
        # backcompatibility with files that still use the old 'grads' key
        d['hyp_imp'] = d['grads']

    id_hash = pd.DataFrame({
        "peak_id": d.f['/metadata/interval_from_task'][:],
        "example_idx": np.arange(d.f['/metadata/interval_from_task'].shape[0])
    })

    # load the instances data frame
    print("load all instances")
    df = load_instances(instances_parq, motifs=None, dedup=True)
    df = df.merge(id_hash, on="example_idx")  # append peak_id

    patterns = df.pattern.unique().tolist()
    pattern_pssms = {pattern: mr.get_pssm(*pattern.split("/"))
                     for pattern in patterns}
    append_pattern_loc(df, pattern_pssms, trim_frac=trim_frac)

    # write out the results
    example_cols = ['example_chr', 'example_start', 'example_end',
                    'example_id', 'peak_id']
    df_examples = df[example_cols].drop_duplicates().sort_values(
        ["example_chr", "example_start"])
    df_examples.to_csv(output_dir / "scored_regions.bed",
                       sep='\t', header=False, index=False)

    # pattern coordinates relative to the example start -> genomic coordinates
    df["pattern_start_rel"] = df.pattern_start + df.example_start
    df["pattern_end_rel"] = df.pattern_end + df.example_start
    df["strand"] = df.revcomp.astype(bool).map({True: "-", False: "+"})

    # TODO - update this?
    pattern_cols = ['example_chr', 'pattern_start_rel', 'pattern_end_rel',
                    'example_id', 'percnormed_score', 'strand',
                    'peak_id', 'seqlet_score']

    (output_dir / "README").write_text(
        "scored_regions.bed columns: " + ", ".join(example_cols) + "\n" +
        "metacluster_<>/pattern_<>.bed columns: " + ", ".join(pattern_cols))

    df_pattern = df[pattern_cols]
    for pattern in df.pattern.unique():
        out_path = output_dir / (pattern + ".bed.gz")
        out_path.parent.mkdir(parents=True, exist_ok=True)
        dfp = df_pattern[df.pattern == pattern].drop_duplicates().sort_values(
            ["example_chr", "pattern_start_rel"])
        dfp.to_csv(out_path, compression='gzip', sep='\t',
                   header=False, index=False)
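# Example invocation (a sketch: the paths below are placeholders and assume the
# modisco results, the parquet instance table and the importance-score hdf5
# file were produced by the earlier steps of the pipeline):
modisco_instances_to_bed(
    modisco_h5="modisco.h5",
    instances_parq="instances.parq",
    imp_score_h5="imp_scores.h5",
    output_dir="modisco_bed/",
    trim_frac=0.08,
)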