Example #1
0
def log_gin_config(output_dir,
                   cometml_experiment=None,
                   wandb_run=None,
                   prefix=''):
    """Persist the operative gin config and forward it to the experiment trackers.

    The operative gin config string is printed to stdout, written to
    `<output_dir>/<prefix>config.gin`, converted to a dictionary with
    `gin2dict` and stored as `<output_dir>/<prefix>config.gin.json`.

    Args:
      output_dir: directory receiving both config files
      cometml_experiment: if not None, the config dictionary is passed to its
        `log_parameters` method
      wandb_run: if not None, its `config` is updated with the config
        dictionary (with "." in the keys replaced by "/")
      prefix: string prepended to both output file names
    """
    operative_config = gin.operative_config_str()

    # echo the config to stdout between two separator lines
    print("Used config: " + "-" * 40)
    print(operative_config)
    print("-" * 52)

    gin_path = os.path.join(output_dir, f"{prefix}config.gin")
    with open(gin_path, "w") as handle:
        handle.write(operative_config)

    config_dict = gin2dict(operative_config)
    json_path = os.path.join(output_dir, f"{prefix}config.gin.json")
    write_json(config_dict, json_path, sort_keys=True, indent=2)

    if cometml_experiment is not None:
        cometml_experiment.log_parameters(config_dict)

    if wandb_run is not None:
        # wandb gets "/"-separated keys instead of "."-separated ones
        # (presumably for nicer grouping on the dashboard - TODO confirm)
        wandb_config = {}
        for key, value in config_dict.items():
            wandb_config[key.replace(".", "/")] = value
        wandb_run.config.update(wandb_config)
Example #2
0
    def evaluate(self, metric, batch_size=256, num_workers=8, eval_train=False, eval_skip=None, save=True, **kwargs):
        """Evaluate the model on the validation dataset(s).

        Args:
          metric: accepted for interface compatibility.
            NOTE(review): this argument is not used in the method body.
            2-element datasets are evaluated with `eval_metric=None` and
            3-element datasets with the metric stored in the tuple.
          batch_size: forwarded to `self.seq_model.evaluate`
          num_workers: forwarded to `self.seq_model.evaluate`
          eval_train: if True, also compute the evaluation metrics on the training set
          eval_skip: collection of dataset names to exclude from the evaluation.
            `None` (default) or an empty collection skips nothing.
          save: if True, write the metrics as json to `self.evaluation_path`
          **kwargs: ignored; a warning is logged when any are provided

        Returns:
          OrderedDict mapping the dataset name to the value returned by
          `self.seq_model.evaluate` for that dataset.

        Raises:
          ValueError: if a dataset entry has neither 2 nor 3 elements
        """
        if eval_skip is None:
            eval_skip = []
        if len(kwargs) > 0:
            logger.warning(f"Extra kwargs were provided to trainer.evaluate(): {kwargs}")
        # Save the complete model -> HACK
        self.seq_model.save(os.path.join(self.output_dir, 'seq_model.pkl'))

        # construct a list of datasets to evaluate
        if eval_train:
            eval_datasets = [('train', self.train_dataset)] + self.valid_dataset
        else:
            eval_datasets = self.valid_dataset

        # skip some datasets for evaluation (best effort)
        try:
            if len(eval_skip) > 0:
                logger.info(f"Using eval_skip: {eval_skip}")
                # Select by the first element (the name) so that both
                # (name, dataset) and (name, dataset, metric) entries can be skipped
                eval_datasets = [d for d in eval_datasets if d[0] not in eval_skip]
        except (TypeError, ValueError, IndexError, KeyError):
            logger.warning(f"eval datasets don't contain tuples. Unable to skip them using {eval_skip}")

        metric_res = OrderedDict()
        for d in eval_datasets:
            if len(d) == 2:
                dataset_name, dataset = d
                eval_metric = None  # no dataset-specific metric was provided
            elif len(d) == 3:
                # specialized evaluation metric was passed
                dataset_name, dataset, eval_metric = d
            else:
                raise ValueError("Valid dataset needs to be a list of tuples of 2 or 3 elements"
                                 "(name, dataset) or (name, dataset, metric)")
            logger.info(f"Evaluating dataset: {dataset_name}")
            metric_res[dataset_name] = self.seq_model.evaluate(dataset,
                                                               eval_metric=eval_metric,
                                                               num_workers=num_workers,
                                                               batch_size=batch_size)
        if save:
            write_json(metric_res, self.evaluation_path, indent=2)
            logger.info("Saved metrics to {}".format(self.evaluation_path))

        # forward the (flattened) metrics to the experiment trackers
        if self.cometml_experiment is not None:
            self.cometml_experiment.log_metrics(flatten(metric_res, separator='/'), prefix="eval/")

        if self.wandb_run is not None:
            self.wandb_run.summary.update(flatten(dict_prefix_key(metric_res, prefix="eval/"), separator='/'))

        return metric_res
Example #3
0
def bpnet_train(dataspec,
                output_dir,
                premade='bpnet9',
                config=None,
                override='',
                gpu=0,
                memfrac_gpu=0.45,
                num_workers=8,
                vmtouch=False,
                in_memory=False,
                wandb_project="",
                cometml_project="",
                run_id=None,
                note_params="",
                overwrite=False):
    """Train a model using gin-config

    Args:
      dataspec: path to the dataspec file, loaded with `DataSpec.load`
      output_dir: base output directory. The run directory actually used is
        the one returned by `start_experiment`.
      premade: passed together with `config` to `_get_gin_files` to
        determine the gin files to parse
      config: optional path to a gin config file. If given, it is copied to
        `input-config.gin` in the run directory.
      override: ";"-separated gin bindings appended after the bindings
        inferred from the dataspec
      gpu: gpu passed to `create_tf_session`. If None, no session is created here.
      memfrac_gpu: per-process gpu memory fraction
      num_workers: forwarded to `train`
      vmtouch: if True and the `vmtouch` executable is available, call
        `ds.touch_all_files()`
      in_memory: forwarded to `train`
      wandb_project, cometml_project, run_id, note_params, overwrite:
        forwarded to `start_experiment`

    Returns:
      the return value of `train`

    Raises:
      ValueError: if the dataspec contains more than one bias track
        specification or no task specification

    Output files written directly by this function (into the run directory):
      bpnet-train.kwargs.json - arguments this function was called with
      input-config.gin - copy of `config` (only if `config` was provided)
      dataspec.yml - dataspec dumped via `related_dump_yaml`

    Output files expected from the downstream training step (not produced in
    this function itself):
      train.log - log file
      model.h5 - Keras model HDF5 file
      seqmodel.pkl - Serialized SeqModel. This is the main trained model.
      eval-report.ipynb/.html - evaluation report containing training loss curves and some example model predictions.
        You can specify your own ipynb using `--override='report_template.name="my-template.ipynb"'`.
    """
    cometml_experiment, wandb_run, output_dir = start_experiment(
        output_dir=output_dir,
        cometml_project=cometml_project,
        wandb_project=wandb_project,
        run_id=run_id,
        note_params=note_params,
        overwrite=overwrite)
    # remember the executed command
    write_json(
        {
            "dataspec": dataspec,
            "output_dir": output_dir,
            "premade": premade,
            "config": config,
            "override": override,
            "gpu": gpu,
            "memfrac_gpu": memfrac_gpu,
            "num_workers": num_workers,
            "vmtouch": vmtouch,
            "in_memory": in_memory,
            "wandb_project": wandb_project,
            "cometml_project": cometml_project,
            "run_id": run_id,
            "note_params": note_params,
            "overwrite": overwrite
        },
        os.path.join(output_dir, 'bpnet-train.kwargs.json'),
        indent=2)

    # copy the input config file over
    if config is not None:
        shutil.copyfile(config, os.path.join(output_dir, 'input-config.gin'))

    # parse and validate the dataspec, then store it in the run directory
    ds = DataSpec.load(dataspec)
    related_dump_yaml(ds.abspath(), os.path.join(output_dir, 'dataspec.yml'))
    if vmtouch:
        if shutil.which('vmtouch') is None:
            logger.warning(
                "vmtouch is currently not installed. "
                "--vmtouch disabled. Please install vmtouch to enable it")
        else:
            # use vmtouch to load all file to memory
            ds.touch_all_files()

    # --------------------------------------------
    # Parse the config file
    # import gin.tf
    if gpu is not None:
        logger.info(f"Using gpu: {gpu}, memory fraction: {memfrac_gpu}")
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)

    gin_files = _get_gin_files(premade, config)

    # infer different hyper-parameters from the dataspec file
    if len(ds.bias_specs) > 0:
        use_bias = True
        if len(ds.bias_specs) > 1:
            # TODO - allow multiple bias track
            # - split the heads separately
            raise ValueError("Only a single bias track is currently supported")

        bias = [v for _, v in ds.bias_specs.items()][0]
        n_bias_tracks = len(bias.tracks)
    else:
        use_bias = False
        n_bias_tracks = 0
    tasks = list(ds.task_specs)
    if not tasks:
        # fail with a clear message instead of an IndexError below
        raise ValueError("dataspec doesn't contain any task specs")
    # TODO - handle multiple track widths?
    # NOTE: only the number of tracks of the first task is used
    tracks_per_task = [len(v.tracks) for _, v in ds.task_specs.items()][0]
    # figure out the right hyper-parameters
    dataspec_bindings = [
        f'dataspec="{dataspec}"', f'use_bias={use_bias}',
        f'n_bias_tracks={n_bias_tracks}', f'tracks_per_task={tracks_per_task}',
        f'tasks={tasks}'
    ]

    gin.parse_config_files_and_bindings(
        gin_files,
        bindings=dataspec_bindings + override.split(";"),
        # NOTE: the dataspec-derived bindings come after the gin files
        # and before the `override` bindings specified at the command-line.
        # This allows the user to disable the bias correction
        # despite being specified in the config file
        skip_unknown=False)

    # --------------------------------------------
    # Remember the parsed configs

    # comet - log environment
    if cometml_experiment is not None:
        # log other parameters
        cometml_experiment.log_parameters(dict(premade=premade,
                                               config=config,
                                               override=override,
                                               gin_files=gin_files,
                                               gpu=gpu),
                                          prefix='cli/')

    # wandb - log environment
    if wandb_run is not None:

        # store general configs
        wandb_run.config.update(
            dict_prefix_key(dict(premade=premade,
                                 config=config,
                                 override=override,
                                 gin_files=gin_files,
                                 gpu=gpu),
                            prefix='cli/'))

    return train(
        output_dir=output_dir,
        cometml_experiment=cometml_experiment,
        wandb_run=wandb_run,
        num_workers=num_workers,
        in_memory=in_memory,
        # to execute the sub-notebook
        memfrac_gpu=memfrac_gpu,
        gpu=gpu)
Example #4
0
def start_experiment(output_dir,
                     cometml_project="",
                     wandb_project="",
                     run_id=None,
                     note_params="",
                     extra_kwargs=None,
                     overwrite=False):
    """Start a model training experiment. This will create a new output directory
    and setup the experiment management handles

    Args:
      output_dir: base directory. The run directory is `<output_dir>/<run_id>`.
      cometml_project: "<workspace>/<project_name>" string. If empty,
        comet.ml is not used.
      wandb_project: "<entity>/<project>" string. If empty, wandb is not used.
      run_id: name of the run directory. If None, it is derived from the
        wandb run directory, the comet.ml experiment id or a random uuid4
        (in that order of preference). If given and wandb is used, it is
        also passed as `resume` to `wandb.init`.
      note_params: string parsed with `kv_string2dict` and stored in
        `note_params.json`
      extra_kwargs: currently unused
      overwrite: if True, an existing run directory containing `config.gin`
        is removed. If False, such a directory raises a ValueError.

    Returns:
      tuple (cometml_experiment, wandb_run, output_dir) where the first two
      entries are None when the respective service is not used and
      `output_dir` is the run directory.

    Raises:
      ImportError: if comet.ml was requested but could not be imported
      ValueError: if `wandb_project` doesn't contain "/" or if the run
        directory already contains `config.gin` and `overwrite` is False
    """
    # NOTE: the current working directory is appended to the module search path
    sys.path.append(os.getcwd())
    if cometml_project:
        logger.info("Using comet.ml")
        if Experiment is None:
            raise ImportError("Comet.ml could not be imported")
        workspace, project_name = cometml_project.split("/")
        cometml_experiment = Experiment(project_name=project_name,
                                        workspace=workspace)
        # TODO - get the experiment id
        # specify output_dir to that directory
    else:
        cometml_experiment = None

    if wandb_project:
        # explicit check instead of `assert`, which is stripped under `python -O`
        if "/" not in wandb_project:
            raise ValueError(
                f"wandb_project needs to have the form <entity>/<project>. Got: {wandb_project}")
        entity, project = wandb_project.split("/")
        if wandb is None:
            logger.warning("wandb not installed. Not using it")
            wandb_run = None
        else:
            logger.info("Using wandb. Running wandb.init()")
            wandb._set_stage_dir("./")  # Don't prepend wandb to output file
            if run_id is not None:
                wandb.init(project=project,
                           dir=output_dir,
                           entity=entity,
                           reinit=True,
                           resume=run_id)
            else:
                # automatically set the output
                wandb.init(project=project,
                           entity=entity,
                           reinit=True,
                           dir=output_dir)
            wandb_run = wandb.run
            if wandb_run is None:
                logger.warning("Wandb run is None")
            print(wandb_run)
    else:
        wandb_run = None

    # update the output directory
    if run_id is None:
        if wandb_run is not None:
            run_id = os.path.basename(wandb_run.dir)
        elif cometml_experiment is not None:
            run_id = cometml_experiment.id
        else:
            # random run_id
            run_id = str(uuid4())
    output_dir = os.path.join(output_dir, run_id)

    # TODO - when wandb is used, wandb_run.dir and output_dir should be the
    # same directory in order for snakemake to work correctly. Neither
    # `wandb_run._dir = os.path.normpath(output_dir)` nor asserting the
    # equality of the two directories worked, hence nothing is enforced here.
    # -----------------------------

    if os.path.exists(os.path.join(output_dir, 'config.gin')):
        if overwrite:
            logger.info(
                f"config.gin already exists in the output "
                f"directory {output_dir}. Removing the whole directory.")
            shutil.rmtree(output_dir)
        else:
            raise ValueError(f"Output directory {output_dir} shouldn't exist!")
    # make the output directory. It may already exist as long as it
    # doesn't contain config.gin (checked above)
    os.makedirs(output_dir, exist_ok=True)

    # add logging to the file
    add_file_logging(output_dir, logger)

    # write note_params.json
    if note_params:
        logger.info(f"note_params: {note_params}")
        note_params_dict = kv_string2dict(note_params)
    else:
        note_params_dict = dict()
    write_json(note_params_dict,
               os.path.join(output_dir, "note_params.json"),
               sort_keys=True,
               indent=2)

    if cometml_experiment is not None:
        cometml_experiment.log_parameters(note_params_dict)
        cometml_experiment.log_parameters(dict(output_dir=output_dir),
                                          prefix='cli/')

        exp_url = f"https://www.comet.ml/{cometml_experiment.workspace}/{cometml_experiment.project_name}/{cometml_experiment.id}"
        logger.info("Comet.ml url: " + exp_url)
        # write the information about comet.ml experiment
        write_json(
            {
                "url": exp_url,
                "key": cometml_experiment.id,
                "project": cometml_experiment.project_name,
                "workspace": cometml_experiment.workspace
            },
            os.path.join(output_dir, "cometml.json"),
            sort_keys=True,
            indent=2)

    if wandb_run is not None:
        wandb_run.config.update(note_params_dict)
        # write the information about the wandb run
        write_json(
            {
                "url": wandb_run.get_url(),
                "key": wandb_run.id,
                "project": wandb_run.project,
                "path": wandb_run.path,
                "group": wandb_run.group
            },
            os.path.join(output_dir, "wandb.json"),
            sort_keys=True,
            indent=2)
        wandb_run.config.update(
            dict_prefix_key(dict(output_dir=output_dir), prefix='cli/'))

    return cometml_experiment, wandb_run, output_dir
Example #5
0
def cwm_scan(modisco_dir,
             output_file,
             trim_frac=0.08,
             patterns='all',
             filters='match_weighted_p>=.2,contrib_weighted_p>=.01',
             contrib_file=None,
             add_profile_features=False,
             num_workers=10):
    """Get motif instances via CWM scanning.

    Args:
      modisco_dir: directory containing `modisco.h5` and
        `modisco-run.kwargs.json`
      output_file: path (string) of the output table. Its suffix selects the
        format: .csv, .csv.gz, .tsv, .tsv.gz, .parq, .bed or .bed.gz
      trim_frac: passed to `trim_seq_ic` for each pattern; also part of the
        file name of the cached centroid matches
      patterns: 'all' or a comma-separated list of pattern names to scan
      filters: comma-separated expressions, each applied to the per-pattern
        instances table with `DataFrame.query`
      contrib_file: path to the contribution score file. If None, it is
        obtained via `ContribFile.from_modisco_dir(modisco_dir)`.
      add_profile_features: if True, annotate the instances using
        `annotate_profile_single`
      num_workers: number of parallel jobs used for scanning

    Raises:
      ValueError: if `output_file` doesn't have a supported suffix

    Side effects:
      Writes `cwm-scan.kwargs.json` next to `output_file`, writes the
      centroid matches to `modisco_dir` if they don't exist yet and writes
      the resulting table to `output_file`.
    """
    from bpnet.modisco.utils import longer_pattern, shorten_pattern
    from bpnet.modisco.pattern_instances import annotate_profile_single
    add_file_logging(os.path.dirname(output_file), logger, 'cwm-scan')
    modisco_dir = Path(modisco_dir)

    valid_suffixes = [
        '.csv',
        '.csv.gz',
        '.tsv',
        '.tsv.gz',
        '.parq',
        '.bed',
        '.bed.gz',
    ]
    if not output_file.endswith(tuple(valid_suffixes)):
        raise ValueError(
            f"output_file doesn't have a valid file suffix. Valid file suffixes are: {valid_suffixes}"
        )

    # Centroid matches path
    cm_path = modisco_dir / f'cwm-scan-seqlets.trim-frac={trim_frac:.2f}.csv.gz'

    # save the hyper-parameters
    kwargs_json_file = os.path.join(os.path.dirname(output_file),
                                    'cwm-scan.kwargs.json')
    write_json(
        # `modisco_dir` used to be (wrongly) filled with the contrib_file path
        dict(modisco_dir=os.path.abspath(str(modisco_dir)),
             output_file=str(output_file),
             cwm_scan_seqlets_path=str(cm_path),
             trim_frac=trim_frac,
             patterns=patterns,
             filters=filters,
             contrib_file=contrib_file,
             add_profile_features=add_profile_features,
             num_workers=num_workers), str(kwargs_json_file))

    # figure out contrib_wildcard
    modisco_kwargs = read_json(
        os.path.join(modisco_dir, "modisco-run.kwargs.json"))
    contrib_type = load_contrib_type(modisco_kwargs)

    mf = ModiscoFile(modisco_dir / "modisco.h5")
    tasks = mf.tasks()
    # HACK prune the tasks of contribution (in case it's present)
    tasks = [t.split("/")[0] for t in tasks]

    logger.info(f"Using tasks: {tasks}")

    if contrib_file is None:
        cf = ContribFile.from_modisco_dir(modisco_dir)
        # cache it since it can be re-used in `modisco_centroid_seqlet_matches`
        cf.cache()
    else:
        logger.info(f"Loading the contribution scores from: {contrib_file}")
        cf = ContribFile(contrib_file, default_contrib_score=contrib_type)

    if not cm_path.exists():
        logger.info(f"Generating centroid matches to {cm_path.resolve()}")
        cwm_scan_seqlets(modisco_dir,
                         output_file=cm_path,
                         trim_frac=trim_frac,
                         # only hand over `cf` if it was derived from modisco_dir
                         contribsf=cf if contrib_file is None else None,
                         num_workers=num_workers,
                         verbose=False)
    else:
        logger.info("Centroid matches already exist.")
    logger.info(f"Loading centroid matches from {cm_path.resolve()}")
    dfm_norm = pd.read_csv(cm_path)

    # get the raw data
    seq, contrib, ranges = cf.get_seq(), cf.get_contrib(), cf.get_ranges()

    logger.info("Scanning for patterns")
    dfl = []

    # patterns to scan. `longer_pattern` makes sure the patterns are in the long format
    # NOTE: compare by value (`!=`); `is not` on a string literal is an
    # identity check whose result depends on string interning
    if patterns != 'all':
        scan_patterns = patterns.split(",")
    else:
        scan_patterns = mf.pattern_names()
    scan_patterns = [longer_pattern(pn) for pn in scan_patterns]

    if add_profile_features:
        profile = cf.get_profiles()
        logger.info("Profile features will also be added to dfi")

    for pattern_name in tqdm(mf.pattern_names()):
        if pattern_name not in scan_patterns:
            # skip scanning that patterns
            continue
        pattern = mf.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, contribution = pattern.scan_contribution(contrib,
                                                        hyp_contrib=None,
                                                        tasks=tasks,
                                                        n_jobs=num_workers,
                                                        verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=num_workers, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            contribution,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        # apply the filters one by one; empty entries are ignored
        for filt in filters.split(","):
            if len(filt) > 0:
                dfm = dfm.query(filt)

        if add_profile_features:
            dfm = annotate_profile_single(dfm,
                                          pattern_name,
                                          mf,
                                          profile,
                                          profile_width=70,
                                          trim_frac=trim_frac)
        dfm['pattern_short'] = shorten_pattern(pattern_name)

        # TODO - is it possible to write out the results incrementally?
        dfl.append(dfm)

    logger.info("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    # append the ranges
    logger.info("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    # add the absolute coordinates
    dfp['pattern_start_abs'] = dfp['example_start'] + dfp['pattern_start']
    dfp['pattern_end_abs'] = dfp['example_start'] + dfp['pattern_end']

    logger.info("Table info")
    dfp.info()
    logger.info(
        f"Writing the resuling pd.DataFrame of shape {dfp.shape} to {output_file}"
    )

    # set the first 7 columns to comply to bed6 format (chrom, start, end, name, score, strand, ...)
    bed_columns = [
        'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'pattern',
        'contrib_weighted_p', 'strand', 'match_weighted_p'
    ]
    dfp = pd_first_cols(dfp, bed_columns)

    # write the table in the format selected by the file suffix
    if output_file.endswith(".parq"):
        logger.info("Writing a parquet file")
        dfp.to_parquet(output_file,
                       partition_on=['pattern_short'],
                       engine='fastparquet')
    elif output_file.endswith((".csv.gz", ".csv")):
        logger.info("Writing a csv file")
        dfp.to_csv(output_file, compression='infer', index=False)
    elif output_file.endswith((".tsv.gz", ".tsv")):
        logger.info("Writing a tsv file")
        dfp.to_csv(output_file, sep='\t', compression='infer', index=False)
    elif output_file.endswith((".bed.gz", ".bed")):
        logger.info("Writing a BED file")
        # write only the first (and main) 7 columns
        dfp[bed_columns].to_csv(output_file,
                                sep='\t',
                                compression='infer',
                                index=False,
                                header=False)
    else:
        # defensive fallback; the suffix was already validated above
        logger.warning("File suffix not recognized. Using .csv.gz file format")
        dfp.to_csv(output_file, compression='gzip', index=False)
    logger.info("Done!")
Example #6
0
def bpnet_modisco_run(
    contrib_file,
    output_dir,
    null_contrib_file=None,
    premade='modisco-50k',
    config=None,
    override='',
    contrib_wildcard="*/profile/wn",  # on which contribution scores to run modisco
    only_task_regions=False,
    filter_npy=None,
    exclude_chr="",
    num_workers=10,
    gpu=None,  # no need to use a gpu by default
    memfrac_gpu=0.45,
    overwrite=False,
):
    """Run TF-MoDISco on the contribution scores stored in the contribution score file
    generated by `bpnet contrib`.

    Args:
      contrib_file: path to the contribution score file
      output_dir: directory receiving `modisco.h5` and the auxiliary files
      null_contrib_file: optional contribution score file used to compute the
        null scores. If None, `modisco.coordproducers.LaplaceNullDist` is used.
      premade: passed together with `config` to `_get_gin_files`
      config: optional gin config file; copied to
        `modisco-run.input-config.gin` in `output_dir`
      override: ";"-separated gin bindings
      contrib_wildcard: comma-separated list of `<task>/<head>/<summary>`
        entries. `*` as the task expands to all tasks of the contribution file.
      only_task_regions: if True, keep only the regions whose
        `interval_from_task` is one of the tasks named in `contrib_wildcard`
      filter_npy: optional path to a .npy file loaded with `np.load` and
        combined (logical and) with the other region filters
      exclude_chr: chromosomes to exclude, given as a list-like or as a
        string. NOTE(review): a string is interpreted as a comma-separated
        list - confirm this matches the CLI convention.
      num_workers: bound to the gin parameter `num_workers`
      gpu: gpu to use. If None, `CUDA_VISIBLE_DEVICES` is set to ''.
      memfrac_gpu: per-process gpu memory fraction
      overwrite: forwarded to `remove_exists` for every output file

    Raises:
      ValueError: if `contrib_wildcard` doesn't contain "/", if it names a
        task not present in the contribution file, or if
        `only_task_regions` is requested for a contribution file without
        `interval_from_task` information
    """
    add_file_logging(output_dir, logger, 'modisco-run')
    if gpu is not None:
        logger.info(f"Using gpu: {gpu}, memory fraction: {memfrac_gpu}")
        create_tf_session(gpu, per_process_gpu_memory_fraction=memfrac_gpu)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        os.environ['MKL_THREADING_LAYER'] = 'GNU'

    # imported only after the environment variables above were set
    import modisco
    # explicit check instead of `assert`, which is stripped under `python -O`
    if '/' not in contrib_wildcard:
        raise ValueError(
            f"contrib_wildcard needs to have the form <task>/<head>/<summary>. Got: {contrib_wildcard}")

    if filter_npy is not None:
        filter_npy = os.path.abspath(str(filter_npy))
    if config is not None:
        config = os.path.abspath(str(config))

    # setup output file paths
    output_path = os.path.abspath(os.path.join(output_dir, "modisco.h5"))
    remove_exists(output_path, overwrite=overwrite)
    output_filter_npy = os.path.abspath(
        os.path.join(output_dir, 'modisco-run.subset-contrib-file.npy'))
    remove_exists(output_filter_npy, overwrite=overwrite)
    kwargs_json_file = os.path.join(output_dir, "modisco-run.kwargs.json")
    remove_exists(kwargs_json_file, overwrite=overwrite)
    if config is not None:
        config_output_file = os.path.join(output_dir,
                                          'modisco-run.input-config.gin')
        remove_exists(config_output_file, overwrite=overwrite)
        shutil.copyfile(config, config_output_file)

    # save the hyper-parameters
    write_json(
        dict(contrib_file=os.path.abspath(contrib_file),
             output_dir=str(output_dir),
             null_contrib_file=null_contrib_file,
             config=str(config),
             override=override,
             contrib_wildcard=contrib_wildcard,
             only_task_regions=only_task_regions,
             filter_npy=str(filter_npy),
             exclude_chr=exclude_chr,
             num_workers=num_workers,
             overwrite=overwrite,
             output_filter_npy=output_filter_npy,
             gpu=gpu,
             memfrac_gpu=memfrac_gpu), kwargs_json_file)

    # setup the gin config using premade, config and override
    cli_bindings = [f'num_workers={num_workers}']
    gin.parse_config_files_and_bindings(
        _get_gin_files(premade, config),
        # NOTE: the cli bindings come after the gin files and before
        # the `override` bindings specified at the command-line
        bindings=cli_bindings + override.split(";"),
        skip_unknown=False)
    log_gin_config(output_dir, prefix='modisco-run.')
    # --------------------------------------------

    # load the contribution file
    logger.info(f"Loading the contribution file: {contrib_file}")
    cf = ContribFile(contrib_file)
    tasks = cf.get_tasks()

    # figure out subset_tasks. A single `*` entry means "all tasks"
    # (subset_tasks = None) regardless of its position in the list
    use_all_tasks = False
    named_tasks = set()
    for w in contrib_wildcard.split(","):
        wc_task, _head, _head_summary = w.split("/")
        if wc_task == '*':
            use_all_tasks = True
        else:
            if wc_task not in tasks:
                raise ValueError(f"task {wc_task} not found in tasks: {tasks}")
            named_tasks.add(wc_task)
    subset_tasks = None if use_all_tasks else list(named_tasks)

    # --------------------------------------------
    # subset the intervals
    logger.info("Loading ranges")
    ranges = cf.get_ranges()
    # include all samples at the beginning
    include_samples = np.ones(len(cf)).astype(bool)

    # --only-task-regions
    if only_task_regions:
        if subset_tasks is None:
            logger.warning(
                "contrib_wildcard contains all tasks (specified by */<head>/<summary>). Not using --only-task-regions"
            )
        elif np.all(ranges['interval_from_task'] == ''):
            raise ValueError(
                "Contribution file wasn't created from multiple set of peaks. "
                "E.g. interval_from_task='' for all ranges. Please disable --only-task-regions"
            )
        else:
            logger.info("Subsetting ranges according to `interval_from_task`")
            include_samples = include_samples & ranges[
                'interval_from_task'].isin(subset_tasks).values
            logger.info(
                f"Using {include_samples.sum()} / {len(include_samples)} regions after --only-task-regions subset"
            )

    # --exclude-chr
    if exclude_chr:
        logger.info(f"Excluding chromosomes: {exclude_chr}")
        # `Series.isin` only accepts list-likes, not a bare string
        if isinstance(exclude_chr, str):
            excluded = exclude_chr.split(",")
        else:
            excluded = exclude_chr
        chromosomes = ranges['chr']
        include_samples = include_samples & (
            ~pd.Series(chromosomes).isin(excluded)).values
        logger.info(
            f"Using {include_samples.sum()} / {len(include_samples)} regions after --exclude-chr subset"
        )

    # -- filter-npy
    if filter_npy is not None:
        print(f"Loading a filter file from {filter_npy}")
        include_samples = include_samples & np.load(filter_npy)
        logger.info(
            f"Using {include_samples.sum()} / {len(include_samples)} regions after --filter-npy subset"
        )

    # store the subset-contrib-file.npy
    logger.info(
        f"Saving the included samples from ContribFile to {output_filter_npy}")
    np.save(output_filter_npy, include_samples)
    # --------------------------------------------

    # convert to indices
    idx = np.arange(len(include_samples))[include_samples]
    seqs = cf.get_seq(idx=idx)

    # fetch the contribution scores from the importance score file
    # expand * to use all possible values
    # TODO - allow this to be done also for all the heads?
    hyp_contrib = {}
    task_names = []
    for w in contrib_wildcard.split(","):
        wc_task, head, head_summary = w.split("/")
        # test the task of *this* wildcard entry (not a stale loop variable)
        if wc_task == '*':
            use_tasks = tasks
        else:
            use_tasks = [wc_task]
        for task in use_tasks:
            key = f"{task}/{head}/{head_summary}"
            if key in hyp_contrib:
                # the same track was already requested by an earlier entry
                continue
            task_names.append(key)
            hyp_contrib[key] = cf._subset(cf.data[f'/hyp_contrib/{key}'],
                                          idx=idx)
    contrib = {k: v * seqs for k, v in hyp_contrib.items()}

    if null_contrib_file is not None:
        logger.info(f"Using null-contrib-file: {null_contrib_file}")
        null_cf = ContribFile(null_contrib_file)
        null_seqs = null_cf.get_seq()
        null_per_pos_scores = {
            key: null_seqs * null_cf.data[f'/hyp_contrib/{key}'][:]
            for key in task_names
        }
    else:
        # default Null distribution. Requires modisco 5.0
        logger.info("Using default null_contrib_scores")
        null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(
            num_to_samp=10000)

    # run modisco.
    # NOTE: `workflow` and `report` parameters are provided by gin config files
    modisco_run(task_names=task_names,
                output_path=output_path,
                contrib_scores=contrib,
                hypothetical_contribs=hyp_contrib,
                one_hot=seqs,
                null_per_pos_scores=null_per_pos_scores)

    logger.info(
        f"bpnet modisco-run finished. modisco.h5 and other files can be found in: {output_dir}"
    )