Python HDF5BatchWriter Examples

Programming Language: Python

Namespace/Package Name: kipoi.writers

Class/Type: HDF5BatchWriter

Examples at hotexamples.com: 8

Python HDF5BatchWriter - 8 examples found. These are the top rated real world Python examples of kipoi.writers.HDF5BatchWriter extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

HDF5BatchWriter(7)

dump(1)

Frequently Used Methods

HDF5BatchWriter (7)

dump (1)

Example #1

Show file

def test_HDF5BatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = HDF5BatchWriter(tmpfile, chunk_size=4)

    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    with HDF5Reader(tmpfile) as f:
        assert np.all(
            list(f.batch_iter(2))[0]['metadata']['gene_id'] ==
            dl_batch['metadata']['gene_id'][:2])
        out = f.load_all()
        assert np.all(out['metadata']['gene_id'] == np.concatenate([
            dl_batch['metadata']['gene_id'], dl_batch['metadata']['gene_id']
        ]))
        assert np.all(out['metadata']['ranges']["chr"] == np.concatenate([
            dl_batch['metadata']['ranges']['chr'], dl_batch['metadata']
            ['ranges']['chr']
        ]))
        assert np.all(out['metadata']['ranges']["start"] == np.concatenate([
            dl_batch['metadata']['ranges']['start'], dl_batch['metadata']
            ['ranges']['start']
        ]))
        assert np.all(out['preds'][:3] == pred_batch_array)

Example #2

Show file

File: data.py Project: mmtrebuchet/bpnet-manuscript

 def save(self, file_path, **kwargs):
     """Save the dataset to an hdf5 file
     """
     obj = HDF5BatchWriter(file_path=file_path, **kwargs)
     obj.batch_write(self.data)
     # Store the attrs
     for k,v in self.attrs.items():
         obj.f.attrs[k] = v
     obj.close()

Example #3

Show file

def test_MultipleBatchWriter(dl_batch, pred_batch_array, tmpdir):
    tmpdir = tmpdir.mkdir("example")
    h5_tmpfile = str(tmpdir.join("out.h5"))
    tsv_tmpfile = str(tmpdir.join("out.tsv"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = MultipleBatchWriter(
        [TsvBatchWriter(tsv_tmpfile),
         HDF5BatchWriter(h5_tmpfile)])
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    assert os.path.exists(h5_tmpfile)
    assert os.path.exists(tsv_tmpfile)
    df = pd.read_csv(tsv_tmpfile, sep="\t")
    assert set(list(df.columns)) == {
        'metadata/ranges/id', 'metadata/ranges/strand', 'metadata/ranges/chr',
        'metadata/ranges/start', 'metadata/ranges/end', 'metadata/gene_id',
        'preds/0', 'preds/1', 'preds/2'
    }
    assert list(df['metadata/ranges/id']) == [0, 1, 2, 0, 1, 2]

Example #4

Show file

File: contrib.py Project: liesaweigert/bpnet

def bpnet_contrib(
        model_dir,
        output_file,
        method="grad",
        dataspec=None,
        regions=None,
        fasta_file=None,  # alternative to dataspec
        shuffle_seq=False,
        shuffle_regions=False,
        max_regions=None,
        # reference='zeroes', # Currently the only option
        # peak_width=1000,  # automatically inferred from 'config.gin.json'
        # seq_width=None,
        contrib_wildcard='*/profile/wn,*/counts/pre-act',  # specifies which contrib. scores to compute
        batch_size=512,
        gpu=0,
        memfrac_gpu=0.45,
        num_workers=10,
        storage_chunk_size=512,
        exclude_chr='',
        include_chr='',
        overwrite=False,
        skip_bias=False):
    """Run contribution scores for a BPNet model
    """
    from bpnet.extractors import _chrom_sizes
    add_file_logging(os.path.dirname(output_file), logger, 'bpnet-contrib')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(
                f"File exists {output_file}. Use overwrite=True to overwrite it"
            )

    config = read_json(os.path.join(model_dir, 'config.gin.json'))
    seq_width = config['seq_width']
    peak_width = config['seq_width']

    # NOTE - seq_width has to be the same for the input and the target
    #
    # infer from the command line
    # if seq_width is None:
    #     logger.info("Using seq_width = peak_width")
    #     seq_width = peak_width

    # # make sure these are int's
    # seq_width = int(seq_width)
    # peak_width = int(peak_width)

    # Split
    contrib_wildcards = contrib_wildcard.split(",")

    # Allow chr inclusion / exclusion
    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = None
    if include_chr:
        include_chr = include_chr.split(",")
    else:
        include_chr = None

    logger.info("Loading the config files")
    model_dir = Path(model_dir)

    logger.info("Creating the dataset")
    from bpnet.datasets import StrandedProfile, SeqClassification
    if fasta_file is not None:
        if regions is None:
            raise ValueError(
                "fasta_file specified. Expecting regions to be specified as well"
            )
        dl_valid = SeqClassification(
            fasta_file=fasta_file,
            intervals_file=regions,
            incl_chromosomes=include_chr,
            excl_chromosomes=exclude_chr,
            auto_resize_len=seq_width,
        )
        chrom_sizes = _chrom_sizes(fasta_file)
    else:
        if dataspec is None:
            logger.info("Using dataspec used to train the model")
            # Specify dataspec
            dataspec = model_dir / "dataspec.yml"

        ds = DataSpec.load(dataspec)
        dl_valid = StrandedProfile(ds,
                                   incl_chromosomes=include_chr,
                                   excl_chromosomes=exclude_chr,
                                   intervals_file=regions,
                                   peak_width=peak_width,
                                   shuffle=False,
                                   seq_width=seq_width)
        chrom_sizes = _chrom_sizes(ds.fasta_file)

    # Setup contribution score trimming (not required currently)
    if seq_width > peak_width:
        # Trim
        # make sure we can nicely trim the peak
        logger.info("Trimming the output")
        assert (seq_width - peak_width) % 2 == 0
        trim_start = (seq_width - peak_width) // 2
        trim_end = seq_width - trim_start
        assert trim_end - trim_start == peak_width
    elif seq_width == peak_width:
        trim_start = 0
        trim_end = peak_width
    else:
        raise ValueError("seq_width < peak_width")

    seqmodel = SeqModel.from_mdir(model_dir)

    # get all possible interpretation names
    # make sure they match the specified glob
    intp_names = [
        name for name, _ in seqmodel.get_intp_tensors(preact_only=False)
        if fnmatch_any(name, contrib_wildcards)
    ]
    logger.info(f"Using the following interpretation targets:")
    for n in intp_names:
        print(n)

    if max_regions is not None:
        if len(dl_valid) > max_regions:
            logging.info(
                f"Using {max_regions} regions instead of the original {len(dl_valid)}"
            )
        else:
            logging.info(
                f"--max-regions={max_regions} is larger than the dataset size: {len(dl_valid)}. "
                "Using the dataset size for max-regions")
            max_regions = len(dl_valid)
    else:
        max_regions = len(dl_valid)

    max_batches = np.ceil(max_regions / batch_size)

    writer = HDF5BatchWriter(output_file, chunk_size=storage_chunk_size)
    for i, batch in enumerate(
            tqdm(dl_valid.batch_iter(batch_size=batch_size,
                                     shuffle=shuffle_regions,
                                     num_workers=num_workers),
                 total=max_batches)):
        # store the original batch containing 'inputs' and 'targets'
        if skip_bias:
            batch['inputs'] = {
                'seq': batch['inputs']['seq']
            }  # ignore all other inputs

        if max_batches > 0:
            if i > max_batches:
                break

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            batch['inputs']['seq'] = onehot_dinucl_shuffle(
                batch['inputs']['seq'])

        for name in intp_names:
            hyp_contrib = seqmodel.contrib_score(
                batch['inputs']['seq'],
                name=name,
                method=method,
                batch_size=None)  # don't second-batch

            # put contribution scores to the dictionary
            # also trim the contribution scores appropriately so that
            # the output will always be w.r.t. the peak center
            batch[f"/hyp_contrib/{name}"] = hyp_contrib[:, trim_start:trim_end]

        # trim the sequence as well
        # Trim the sequence
        batch['inputs']['seq'] = batch['inputs']['seq'][:, trim_start:trim_end]

        # ? maybe it would it be better to have an explicit ContribFileWriter.
        # that way the written schema would be fixed
        writer.batch_write(batch)

    # add chromosome sizes
    writer.f.attrs['chrom_sizes'] = json.dumps(chrom_sizes)
    writer.close()
    logger.info(f"Done. Contribution score file was saved to: {output_file}")

Example #5

Show file

    model = get_model("DeepSEA/variantEffects")
    dl_kwargs = {'fasta_file': fasta_file, 'num_chr_fasta': True}
    output_dir = Path(args.output_dir)
    output_name = os.path.basename(args.vcf).split('.vcf')[0]

    if args.writer == "zarr":
        from kipoi.writers import ZarrBatchWriter, AsyncBatchWriter
        td = output_name + ".zarr"
        writer = SyncBatchWriter(
            AsyncBatchWriter(
                ZarrBatchWriter(str(output_dir / td), chunk_size=1024)))
    elif args.writer == "lmdb":
        td = output_name + ".lmdb"
        writer = SyncBatchWriter(
            AsyncSyncPredictionsWriter(
                LmdbBatchWriter(str(output_dir / td), "DeepSea_veff",
                                274578419865)))
    elif args.writer == "hdf5":
        td = output_name + ".hdf5"
        from kipoi.writers import HDF5BatchWriter
        writer = SyncBatchWriter(HDF5BatchWriter(str(output_dir / td)))

    print("Start predictions..")
    sp.score_variants(model=model,
                      input_vcf=args.vcf,
                      batch_size=16,
                      num_workers=10,
                      dl_args=dl_kwargs,
                      output_writers=writer)

Example #6

Show file

File: imp_score.py Project: kundajelab/bpnet-manuscript

def imp_score(model_dir,
              output_file,
              method="grad",
              split='all',
              batch_size=512,
              num_workers=10,
              h5_chunk_size=512,
              max_batches=-1,
              shuffle_seq=False,
              memfrac=0.45,
              exclude_chr='',
              overwrite=False,
              gpu=None):
    """Run importance scores for a BPNet model

    Args:
      model_dir: path to the model directory
      output_file: output file path (HDF5 format)
      method: which importance scoring method to use ('grad', 'deeplift' or 'ism')
      split: for which dataset split to compute the importance scores
      h5_chunk_size: hdf5 chunk size.
      exclude_chr: comma-separated list of chromosomes to exclude
      overwrite: if True, overwrite the output directory
      gpu (int): which GPU to use locally. If None, GPU is not used
    """
    add_file_logging(os.path.dirname(output_file), logger, 'modisco-score')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")

    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = []
    # load the config files
    logger.info("Loading the config files")
    model_dir = Path(model_dir)
    hp = HParams.load(model_dir / "hparams.yaml")
    ds = DataSpec.load(model_dir / "dataspec.yaml")
    tasks = list(ds.task_specs)
    # validate that the correct dataset was used
    if hp.data.name != 'get_StrandedProfile_datasets':
        logger.warn("hp.data.name != 'get_StrandedProfile_datasets'")

    if split == 'valid':
        assert len(exclude_chr) == 0
        incl_chromosomes = hp.data.kwargs['valid_chr']
        excl_chromosomes = None
    elif split == 'test':
        assert len(exclude_chr) == 0
        incl_chromosomes = hp.data.kwargs['test_chr']
        excl_chromosomes = None
    elif split == 'train':
        assert len(exclude_chr) == 0
        incl_chromosomes = None
        excl_chromosomes = hp.data.kwargs['valid_chr'] + hp.data.kwargs['test_chr'] + hp.data.kwargs.get('exclude_chr', [])
    elif split == 'all':
        incl_chromosomes = None
        excl_chromosomes = hp.data.kwargs.get('exclude_chr', []) + exclude_chr
        logger.info("Excluding chromosomes: {excl_chromosomes}")
    else:
        raise ValueError("split needs to be from {train,valid,test,all}")

    logger.info("Creating the dataset")
    from basepair.datasets import StrandedProfile
    seq_len = hp.data.kwargs['peak_width']
    dl_valid = StrandedProfile(ds,
                               incl_chromosomes=incl_chromosomes,
                               excl_chromosomes=excl_chromosomes,
                               peak_width=seq_len,
                               shuffle=False,
                               target_transformer=None)

    bpnet = BPNet.from_mdir(model_dir)

    writer = HDF5BatchWriter(output_file, chunk_size=h5_chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size, num_workers=num_workers))):
        if max_batches > 0:
            logging.info(f"max_batches: {max_batches} exceeded. Stopping the computation")
            if i > max_batches:
                break
        # append the bias model predictions
        # (batch['inputs'], batch['targets']) = bm((batch['inputs'], batch['targets']))

        # store the original batch containing 'inputs' and 'targets'
        wdict = batch

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            if 'seq' in batch['inputs']:
                batch['inputs']['seq'] = onehot_dinucl_shuffle(batch['inputs']['seq'])
            else:
                batch['inputs'] = onehot_dinucl_shuffle(batch['inputs'])

        # loop through all tasks, pred_summary and strands
        for task_i, task in enumerate(tasks):
            for pred_summary in ['count', 'weighted']:
                # figure out the number of channels
                nstrands = batch['targets'][f'profile/{task}'].shape[-1]
                strand_hash = ["pos", "neg"]

                for strand_i in range(nstrands):
                    hyp_imp = bpnet.imp_score(batch['inputs'],
                                              task=task,
                                              strand=strand_hash[strand_i],
                                              method=method,
                                              pred_summary=pred_summary,
                                              batch_size=None)  # don't second-batch
                    # put importance scores to the dictionary
                    wdict[f"/hyp_imp/{task}/{pred_summary}/{strand_i}"] = hyp_imp
        writer.batch_write(wdict)
    writer.close()

Example #7

Show file

File: imp_score.py Project: kundajelab/bpnet-manuscript

def imp_score_seqmodel(model_dir,
                       output_file,
                       dataspec=None,
                       peak_width=1000,
                       seq_width=None,
                       intp_pattern='*',  # specifies which imp. scores to compute
                       # skip_trim=False,  # skip trimming the output
                       method="deeplift",
                       batch_size=512,
                       max_batches=-1,
                       shuffle_seq=False,
                       memfrac=0.45,
                       num_workers=10,
                       h5_chunk_size=512,
                       exclude_chr='',
                       include_chr='',
                       overwrite=False,
                       skip_bias=False,
                       gpu=None):
    """Run importance scores for a BPNet model

    Args:
      model_dir: path to the model directory
      output_file: output file path (HDF5 format)
      method: which importance scoring method to use ('grad', 'deeplift' or 'ism')
      split: for which dataset split to compute the importance scores
      h5_chunk_size: hdf5 chunk size.
      exclude_chr: comma-separated list of chromosomes to exclude
      overwrite: if True, overwrite the output directory
      skip_bias: if True, don't store the bias tracks in teh output
      gpu (int): which GPU to use locally. If None, GPU is not used
    """
    add_file_logging(os.path.dirname(output_file), logger, 'modisco-score-seqmodel')
    if gpu is not None:
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''

    if os.path.exists(output_file):
        if overwrite:
            os.remove(output_file)
        else:
            raise ValueError(f"File exists {output_file}. Use overwrite=True to overwrite it")

    if seq_width is None:
        logger.info("Using seq_width = peak_width")
        seq_width = peak_width

    # make sure these are int's
    seq_width = int(seq_width)
    peak_width = int(peak_width)

    # Split
    intp_patterns = intp_pattern.split(",")

    # Allow chr inclusion / exclusion
    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = None
    if include_chr:
        include_chr = include_chr.split(",")
    else:
        include_chr = None

    logger.info("Loading the config files")
    model_dir = Path(model_dir)

    if dataspec is None:
        # Specify dataspec
        dataspec = model_dir / "dataspec.yaml"
    ds = DataSpec.load(dataspec)

    logger.info("Creating the dataset")
    from basepair.datasets import StrandedProfile
    dl_valid = StrandedProfile(ds,
                               incl_chromosomes=include_chr,
                               excl_chromosomes=exclude_chr,
                               peak_width=peak_width,
                               seq_width=seq_width,
                               shuffle=False,
                               taskname_first=True,  # Required to work nicely with imp-score
                               target_transformer=None)

    # Setup importance score trimming
    if seq_width > peak_width:
        # Trim
        # make sure we can nicely trim the peak
        logger.info("Trimming the output")
        assert (seq_width - peak_width) % 2 == 0
        trim_start = (seq_width - peak_width) // 2
        trim_end = seq_width - trim_start
        assert trim_end - trim_start == peak_width
    elif seq_width == peak_width:
        trim_start = 0
        trim_end = peak_width
    else:
        raise ValueError("seq_width < peak_width")

    seqmodel = SeqModel.from_mdir(model_dir)

    # get all possible interpretation names
    # make sure they match the specified glob
    intp_names = [name for name, _ in seqmodel.get_intp_tensors(preact_only=False)
                  if fnmatch_any(name, intp_patterns)]
    logger.info(f"Using the following interpretation targets:")
    for n in intp_names:
        print(n)

    writer = HDF5BatchWriter(output_file, chunk_size=h5_chunk_size)
    for i, batch in enumerate(tqdm(dl_valid.batch_iter(batch_size=batch_size, num_workers=num_workers))):
        # store the original batch containing 'inputs' and 'targets'
        wdict = batch
        if skip_bias:
            wdict['inputs'] = {'seq': wdict['inputs']['seq']}  # ignore all other inputs

        if max_batches > 0:
            logging.info(f"max_batches: {max_batches} exceeded. Stopping the computation")
            if i > max_batches:
                break

        if shuffle_seq:
            # Di-nucleotide shuffle the sequences
            batch['inputs']['seq'] = onehot_dinucl_shuffle(batch['inputs']['seq'])

        for name in intp_names:
            hyp_imp = seqmodel.imp_score(batch['inputs']['seq'],
                                         name=name,
                                         method=method,
                                         batch_size=None)  # don't second-batch

            # put importance scores to the dictionary
            # also trim the importance scores appropriately so that
            # the output will always be w.r.t. the peak center
            wdict[f"/hyp_imp/{name}"] = hyp_imp[:, trim_start:trim_end]

        # trim the sequence as well
        if isinstance(wdict['inputs'], dict):
            # Trim the sequence
            wdict['inputs']['seq'] = wdict['inputs']['seq'][:, trim_start:trim_end]
        else:
            wdict['inputs'] = wdict['inputs'][:, trim_start:trim_end]

        writer.batch_write(wdict)
    writer.close()

Example #8

Show file

def modisco_run(
    imp_scores,
    output_dir,
    null_imp_scores=None,
    hparams=None,
    override_hparams="",
    grad_type="weighted",
    subset_tasks=None,
    filter_subset_tasks=False,
    filter_npy=None,
    exclude_chr="",
    seqmodel=False,  # interpretation glob
    # hparams=None,
    num_workers=10,
    max_strand_distance=0.1,
    overwrite=False,
    skip_dist_filter=False,
    use_all_seqlets=False,
    merge_tasks=False,
    gpu=None,
):
    """
    Run modisco

    Args:
      imp_scores: path to the hdf5 file of importance scores
      null_imp_scores: Path to the null importance scores
      grad_type: for which output to compute the importance scores
      hparams: None, modisco hyper - parameeters: either a path to modisco.yaml or
        a ModiscoHParams object
      override_hparams: hyper - parameters overriding the settings in the hparams file
      output_dir: output file directory
      filter_npy: path to a npy file containing a boolean vector used for subsetting
      exclude_chr: comma-separated list of chromosomes to exclude
      seqmodel: If enabled, then the importance scores came from `imp-score-seqmodel`
      subset_tasks: comma-separated list of task names to use as a subset
      filter_subset_tasks: if True, run modisco only in the regions for that TF
      hparams: hyper - parameter file
      summary: which summary statistic to use for the profile gradients
      skip_dist_filter: if True, distances are not used to filter
      use_all_seqlets: if True, don't restrict the number of seqlets
      split: On which data split to compute the results
      merge_task: if True, importance scores for the tasks will be merged
      gpu: which gpu to use. If None, don't use any GPU's

    Note: when using subset_tasks, modisco will run on all the importance scores. If you wish
      to run it only for the importance scores for a particular task you should subset it to
      the peak regions of interest using `filter_npy`
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-run')
    import os
    if gpu is not None:
        create_tf_session(gpu)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    os.environ['MKL_THREADING_LAYER'] = 'GNU'
    # import theano
    import modisco
    import modisco.tfmodisco_workflow.workflow

    if seqmodel:
        assert '/' in grad_type

    if subset_tasks == '':
        logger.warn("subset_tasks == ''. Not using subset_tasks")
        subset_tasks = None

    if subset_tasks == 'all':
        # Use all subset tasks e.g. don't subset
        subset_tasks = None

    if subset_tasks is not None:
        subset_tasks = subset_tasks.split(",")
        if len(subset_tasks) == 0:
            raise ValueError("Provide one or more subset_tasks. Found None")

    if filter_subset_tasks and subset_tasks is None:
        print("Using filter_subset_tasks=False since `subset_tasks` is None")
        filter_subset_tasks = False

    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = []

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / "modisco.h5"
    remove_exists(output_path, overwrite)

    output_distances = output_dir / "strand_distances.h5"
    remove_exists(output_distances, overwrite)

    if filter_npy is not None:
        filter_npy = os.path.abspath(filter_npy)

    # save the hyper-parameters
    write_json(
        dict(
            imp_scores=os.path.abspath(imp_scores),
            grad_type=grad_type,
            output_dir=str(output_dir),
            subset_tasks=subset_tasks,
            filter_subset_tasks=filter_subset_tasks,
            hparams=hparams,
            null_imp_scores=null_imp_scores,
            # TODO - pack into hyper-parameters as well?
            filter_npy=filter_npy,
            exclude_chr=",".join(exclude_chr),
            skip_dist_filter=skip_dist_filter,
            use_all_seqlets=use_all_seqlets,
            max_strand_distance=max_strand_distance,
            gpu=gpu),
        os.path.join(output_dir, "kwargs.json"))

    print("-" * 40)
    # parse the hyper-parameters
    if hparams is None:
        print(f"Using default hyper-parameters")
        hp = ModiscoHParams()
    else:
        if isinstance(hparams, str):
            print(f"Loading hyper-parameters from file: {hparams}")
            hp = ModiscoHParams.load(hparams)
        else:
            assert isinstance(hparams, ModiscoHParams)
            hp = hparams
    if override_hparams:
        print(f"Overriding the following hyper-parameters: {override_hparams}")
    hp = tf.contrib.training.HParams(
        **hp.get_modisco_kwargs()).parse(override_hparams)

    if use_all_seqlets:
        hp.max_seqlets_per_metacluster = None

    # save the hyper-parameters
    print("Using the following hyper-parameters for modisco:")
    print("-" * 40)
    related_dump_yaml(ModiscoHParams(**hp.values()),
                      os.path.join(output_dir, "hparams.yaml"),
                      verbose=True)
    print("-" * 40)

    # TODO - replace with imp_scores
    d = HDF5Reader.load(imp_scores)
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    if seqmodel:
        tasks = list(d['targets'])
    else:
        tasks = list(d['targets']['profile'])

    if subset_tasks is not None:
        # validate that all the `subset_tasks`
        # are present in `tasks`
        for st in subset_tasks:
            if st not in tasks:
                raise ValueError(
                    f"subset task {st} not found in tasks: {tasks}")
        logger.info(
            f"Using the following tasks: {subset_tasks} instead of the original tasks: {tasks}"
        )
        tasks = subset_tasks

    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']

    n = len(one_hot)

    # --------------------
    # apply filters
    if not skip_dist_filter:
        print("Using profile prediction for the strand filtering")
        grad_type_filtered = 'weighted'
        distances = np.array([
            np.array([
                correlation(
                    np.ravel(d['hyp_imp'][task][grad_type_filtered][0][i]),
                    np.ravel(d['hyp_imp'][task][grad_type_filtered][1][i]))
                for i in range(n)
            ]) for task in tasks
            if len(d['hyp_imp'][task][grad_type_filtered]) == 2
        ]).T.mean(axis=-1)  # average the distances across tasks

        dist_filter = distances < max_strand_distance
        print(f"Fraction of sequences kept: {dist_filter.mean()}")

        HDF5BatchWriter.dump(output_distances, {
            "distances": distances,
            "included_samples": dist_filter
        })
    else:
        dist_filter = np.ones((n, ), dtype=bool)

    # add also the filter numpy
    if filter_npy is not None:
        print(f"Loading a filter file from {filter_npy}")
        filter_vec = np.load(filter_npy)
        dist_filter = dist_filter & filter_vec

    if filter_subset_tasks:
        assert subset_tasks is not None
        interval_from_task = pd.Series(d['metadata']['interval_from_task'])
        print(
            f"Subsetting the intervals accoring to subset_tasks: {subset_tasks}"
        )
        print(f"Number of original regions: {dist_filter.sum()}")
        dist_filter = dist_filter & interval_from_task.isin(
            subset_tasks).values
        print(
            f"Number of filtered regions after filter_subset_tasks: {dist_filter.sum()}"
        )

    # filter by chromosome
    if exclude_chr:
        logger.info(f"Excluding chromosomes: {exclude_chr}")
        chromosomes = d['metadata']['range']['chr']
        dist_filter = dist_filter & (
            ~pd.Series(chromosomes).isin(exclude_chr)).values
    # -------------------------------------------------------------
    # setup importance scores

    if seqmodel:
        thr_one_hot = one_hot[dist_filter]
        thr_hypothetical_contribs = {
            f"{task}/{gt}":
            d['hyp_imp'][task][gt.split("/")[0]][gt.split("/")[1]][dist_filter]
            for task in tasks for gt in grad_type.split(",")
        }
        thr_contrib_scores = {
            f"{task}/{gt}":
            thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
            for task in tasks for gt in grad_type.split(",")
        }
        task_names = [
            f"{task}/{gt}" for task in tasks for gt in grad_type.split(",")
        ]

    else:
        if merge_tasks:
            thr_one_hot = np.concatenate([
                one_hot[dist_filter] for task in tasks
                for gt in grad_type.split(",")
            ])
            thr_hypothetical_contribs = {
                "merged":
                np.concatenate([
                    mean(d['hyp_imp'][task][gt])[dist_filter] for task in tasks
                    for gt in grad_type.split(",")
                ])
            }

            thr_contrib_scores = {
                "merged": thr_hypothetical_contribs['merged'] * thr_one_hot
            }
            task_names = ['merged']
        else:
            thr_one_hot = one_hot[dist_filter]
            thr_hypothetical_contribs = {
                f"{task}/{gt}": mean(d['hyp_imp'][task][gt])[dist_filter]
                for task in tasks for gt in grad_type.split(",")
            }
            thr_contrib_scores = {
                f"{task}/{gt}":
                thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
                for task in tasks for gt in grad_type.split(",")
            }
            task_names = [
                f"{task}/{gt}" for task in tasks for gt in grad_type.split(",")
            ]

    if null_imp_scores is not None:
        logger.info(f"Using null_imp_scores: {null_imp_scores}")
        null_isf = ImpScoreFile(null_imp_scores)
        null_per_pos_scores = {
            f"{task}/{gt}": v.sum(axis=-1)
            for gt in grad_type.split(",")
            for task, v in null_isf.get_contrib(imp_score=gt).items()
            if task in tasks
        }
    else:
        # default Null distribution. Requires modisco 5.0
        logger.info(f"Using default null_imp_scores")
        null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(
            num_to_samp=10000)

    # -------------------------------------------------------------
    # run modisco
    tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
        # Modisco defaults
        sliding_window_size=hp.sliding_window_size,
        flank_size=hp.flank_size,
        target_seqlet_fdr=hp.target_seqlet_fdr,
        min_passing_windows_frac=hp.min_passing_windows_frac,
        max_passing_windows_frac=hp.max_passing_windows_frac,
        min_metacluster_size=hp.min_metacluster_size,
        max_seqlets_per_metacluster=hp.max_seqlets_per_metacluster,
        seqlets_to_patterns_factory=modisco.tfmodisco_workflow.
        seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
            trim_to_window_size=hp.trim_to_window_size,  # default: 30
            initial_flank_to_add=hp.initial_flank_to_add,  # default: 10
            kmer_len=hp.kmer_len,  # default: 8
            num_gaps=hp.num_gaps,  # default: 3
            num_mismatches=hp.num_mismatches,  # default: 2
            n_cores=num_workers,
            final_min_cluster_size=hp.final_min_cluster_size)  # default: 30
    )(
        task_names=task_names,
        contrib_scores=thr_contrib_scores,  # -> task score
        hypothetical_contribs=thr_hypothetical_contribs,
        one_hot=thr_one_hot,
        null_per_pos_scores=null_per_pos_scores)
    # -------------------------------------------------------------
    # save the results
    grp = h5py.File(output_path)
    tfmodisco_results.save_hdf5(grp)