Example #1
    @classmethod
    def load(cls, file_path):
        """Load the dataset from an HDF5 file."""
        with HDF5Reader(file_path) as obj:
            data = obj.load_all()              # nested dict of numpy arrays
            attrs = OrderedDict(obj.f.attrs)   # top-level file attributes
        return cls(data, attrs)
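HDF5Reader.load_all() presumably performs a recursive read of every dataset in the file. For readers without that helper at hand, a rough stand-in written against plain h5py (a sketch of the idea, not the actual implementation behind HDF5Reader):

from collections import OrderedDict

import h5py


def load_h5(file_path):
    """Recursively read every dataset plus the root attributes of an HDF5 file."""
    def walk(group):
        out = OrderedDict()
        for key, value in group.items():
            # groups become nested dicts, datasets become numpy arrays
            out[key] = walk(value) if isinstance(value, h5py.Group) else value[:]
        return out

    with h5py.File(file_path, "r") as f:
        return walk(f), OrderedDict(f.attrs)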
Example #2
def test_HDF5BatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = HDF5BatchWriter(tmpfile, chunk_size=4)

    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    with HDF5Reader(tmpfile) as f:
        # the first batch of size 2 matches the first two input examples
        first_batch = list(f.batch_iter(2))[0]
        assert np.all(first_batch['metadata']['gene_id'] ==
                      dl_batch['metadata']['gene_id'][:2])

        # the whole file holds both written copies of the batch, concatenated
        out = f.load_all()
        gene_id = dl_batch['metadata']['gene_id']
        ranges = dl_batch['metadata']['ranges']
        assert np.all(out['metadata']['gene_id'] ==
                      np.concatenate([gene_id, gene_id]))
        assert np.all(out['metadata']['ranges']['chr'] ==
                      np.concatenate([ranges['chr'], ranges['chr']]))
        assert np.all(out['metadata']['ranges']['start'] ==
                      np.concatenate([ranges['start'], ranges['start']]))
        assert np.all(out['preds'][:3] == pred_batch_array)
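Outside of the pytest fixtures, the same write-twice/read-back round trip can be sketched as below. The nested-dict batch layout is inferred from the keys the test touches, the import paths are a guess (the test does not show them), and the exact dtypes the writer accepts have not been verified:

import numpy as np

# assumed import paths for the classes used in the test above
from kipoi.writers import HDF5BatchWriter
from kipoi.readers import HDF5Reader

# toy batch shaped like the one the test asserts on
batch = {
    "preds": np.random.rand(4, 2),
    "metadata": {
        "gene_id": np.array([b"g1", b"g2", b"g3", b"g4"]),
        "ranges": {
            "chr": np.array([b"chr1"] * 4),
            "start": np.arange(4),
            "end": np.arange(4) + 100,
        },
    },
}

writer = HDF5BatchWriter("out.h5", chunk_size=4)
writer.batch_write(batch)   # first 4 examples
writer.batch_write(batch)   # 4 more appended
writer.close()

with HDF5Reader("out.h5") as f:
    out = f.load_all()
    assert out["preds"].shape[0] == 8                 # two copies of the batch
    assert out["metadata"]["gene_id"].shape[0] == 8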
Example #3
    def __init__(self, fpath):
        self.fpath = fpath
        self.f = HDF5Reader(self.fpath)
        self.f.open()

        # example ranges; loaded lazily when first needed
        self.ranges = None
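Because the reader is opened explicitly rather than in a with block, the surrounding class presumably pairs this constructor with a close() and a lazy loader for self.ranges. A sketch of what that could look like; the class name, the get_ranges method, the '/metadata/ranges' group path (borrowed from the other examples in this listing), and the reader's close() method are all assumptions:

from kipoi.readers import HDF5Reader  # assumed import path


class ScoreFile:  # hypothetical class name
    def __init__(self, fpath):
        self.fpath = fpath
        self.f = HDF5Reader(self.fpath)
        self.f.open()
        # example ranges; loaded lazily on first access
        self.ranges = None

    def get_ranges(self):
        """Read and memoize the example ranges (assumed HDF5 layout)."""
        if self.ranges is None:
            self.ranges = {k: v[:]
                           for k, v in self.f.f['/metadata/ranges'].items()}
        return self.ranges

    def close(self):
        # assuming the reader exposes close() to match open()
        self.f.close()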
Example #4
def get_eval_predictions(tf, model, filter_dnase=False):
    """Get the predictions"""
    with HDF5Reader(os.path.join(eval_dir, tf, model + ".h5")) as r:
        y_pred = r.f['/preds'][:]

    labels_bed_file = os.path.join(root_dir,
                                   get_dl_kwargs(tf)['intervals_file'])
    df_unfiltered = pd.read_csv(labels_bed_file, sep="\t", header=None)
    df_unfiltered.columns = ['chr', 'start', 'end', 'y_true']
    if filter_dnase:
        # keep only intervals that overlap a relaxed DNase peak in the
        # matching cell type (at least 50% of the interval must be covered)
        dnase_peaks = '{ddir}/raw/tfbinding/eval/tf-DREAM/DNASE.{ctype}.relaxed.narrowPeak.gz'.format(
            ddir=ddir, ctype=TF2CT[tf])
        filtered_bed = BedTool(labels_bed_file).intersect(BedTool(dnase_peaks),
                                                          u=True,
                                                          wa=True,
                                                          f=0.5)
        df_filtered = pd.read_csv(filtered_bed.fn, sep="\t", header=None)
        df_filtered.columns = ['chr', 'start', 'end', 'y_true']
        df_filtered['filtered'] = True
        # intervals present in the DNase-filtered set get filtered=True from
        # the left merge; non-overlapping intervals are NaN and thus dropped
        keep = df_unfiltered.merge(df_filtered, how='left',
                                   on=list(df_unfiltered.columns)).filtered == True
        return df_unfiltered.y_true.values[keep], y_pred[keep]
    else:
        return df_unfiltered.y_true.values, y_pred[:]
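A hedged usage sketch of how the returned arrays might feed an evaluation metric; the TF and model names are placeholders and auPRC is just one plausible choice:

from sklearn.metrics import average_precision_score

# 'CTCF' and 'seq_model' are placeholder identifiers
y_true, y_pred = get_eval_predictions('CTCF', 'seq_model', filter_dnase=True)
# ravel() in case the predictions are stored as an (n, 1) column
print("auPRC:", average_precision_score(y_true, y_pred.ravel()))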
Example #5
    def __init__(self, file_path,
                 include_samples=None,
                 default_imp_score='weighted'):
        self.file_path = file_path
        self.f = HDF5Reader(self.file_path)
        self.f.open()

        # keep a direct handle to the underlying h5py file object
        self.data = self.f.f

        self.include_samples = include_samples

        self._hyp_contrib_cache = dict()
        self.default_imp_score = default_imp_score
Example #6
def modisco_instances_to_bed(modisco_h5,
                             instances_parq,
                             imp_score_h5,
                             output_dir,
                             trim_frac=0.08):
    from basepair.modisco.pattern_instances import load_instances

    add_file_logging(output_dir, logger, 'modisco-instances-to-bed')
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    mr = ModiscoResult(modisco_h5)
    mr.open()

    print("load task_id")
    d = HDF5Reader(imp_score_h5)
    d.open()
    if 'hyp_imp' not in d.f.keys():
        # backwards compatibility: older files store the scores under 'grads'
        d.f['hyp_imp'] = d.f['grads']

    interval_from_task = d.f['/metadata/interval_from_task']
    id_hash = pd.DataFrame({
        "peak_id": interval_from_task[:],
        "example_idx": np.arange(interval_from_task.shape[0])
    })

    # load the instances data frame
    print("load all instances")
    df = load_instances(instances_parq, motifs=None, dedup=True)
    df = df.merge(id_hash, on="example_idx")  # append peak_id

    patterns = df.pattern.unique().tolist()
    pattern_pssms = {
        pattern: mr.get_pssm(*pattern.split("/"))
        for pattern in patterns
    }
    append_pattern_loc(df, pattern_pssms, trim_frac=trim_frac)

    # write out the results
    example_cols = [
        'example_chr', 'example_start', 'example_end', 'example_id', 'peak_id'
    ]
    df_examples = df[example_cols].drop_duplicates().sort_values(
        ["example_chr", "example_start"])
    df_examples.to_csv(output_dir / "scored_regions.bed",
                       sep='\t',
                       header=False,
                       index=False)

    df["pattern_start_rel"] = df.pattern_start + df.example_start
    df["pattern_end_rel"] = df.pattern_end + df.example_start
    df["strand"] = df.revcomp.astype(bool).map({True: "-", False: "+"})

    # TODO - update this - ?
    pattern_cols = [
        'example_chr', 'pattern_start_rel', 'pattern_end_rel', 'example_id',
        'percnormed_score', 'strand', 'peak_id', 'seqlet_score'
    ]

    (output_dir / "README").write_text(
        "scored_regions.bed columns: " + ", ".join(example_cols) + "\n" +
        "metacluster_<>/pattern_<>.bed columns: " + ", ".join(pattern_cols))
    df_pattern = df[pattern_cols]
    for pattern in df.pattern.unique():
        out_path = output_dir / (pattern + ".bed.gz")
        out_path.parent.mkdir(parents=True, exist_ok=True)
        dfp = df_pattern[df.pattern == pattern].drop_duplicates().sort_values(
            ["example_chr", "pattern_start_rel"])
        dfp.to_csv(out_path,
                   compression='gzip',
                   sep='\t',
                   header=False,
                   index=False)
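Finally, a hedged sketch of how this function might be invoked; all file paths are placeholders for the outputs of earlier modisco / importance-scoring steps:

# placeholder paths; the real files come from earlier pipeline stages
modisco_instances_to_bed(
    modisco_h5="modisco.h5",
    instances_parq="pattern_instances.parq",
    imp_score_h5="imp_scores.h5",
    output_dir="modisco_instances_bed",
    trim_frac=0.08,  # same trimming fraction as the default above
)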