Example #1
def modisco_run(
        output_path,  # specified by bpnet_modisco_run
        task_names,
        contrib_scores,
        hypothetical_contribs,
        one_hot,
        null_per_pos_scores,
        # specified by gin-config
        workflow=gin.REQUIRED,  # TfModiscoWorkflow
        report=None):  # report to use
    """
    Args:
      workflow: TfModiscoWorkflow object
      report: path to the report ipynb
    """
    import os
    import h5py
    from bpnet.utils import render_ipynb
    modisco_results = workflow(task_names=task_names,
                               contrib_scores=contrib_scores,
                               hypothetical_contribs=hypothetical_contribs,
                               one_hot=one_hot,
                               null_per_pos_scores=null_per_pos_scores)
    # save the results
    logger.info(f"Saving modisco file to {output_path}")
    grp = h5py.File(output_path, "w")  # recent h5py requires an explicit mode
    modisco_results.save_hdf5(grp)
    grp.flush()
    grp.close()

    if report is not None:
        report = os.path.abspath(os.path.expanduser(report))
        if not os.path.exists(report):
            raise ValueError(f"Report file {report} doesn't exist")

        logger.info("Running the report")
        # Run the jupyter notebook
        report_path = os.path.join(os.path.dirname(output_path),
                                   os.path.basename(report))
        render_ipynb(report,
                     report_path,
                     params=dict(modisco_file=output_path,
                                 modisco_dir=os.path.dirname(output_path)))
        logger.info(f"Done rendering the report file: {report_path}")
Example #2
def ipynb_render(input_ipynb, output_ipynb, params=""):
    from bpnet.utils import render_ipynb, kwargs_str2kwargs
    render_ipynb(input_ipynb, output_ipynb, kwargs_str2kwargs(params))
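kwargs_str2kwargs turns the CLI parameter string into a keyword dict for render_ipynb. Its exact format is not shown here; a minimal sketch of such a parser, assuming a comma-separated key=value format (the name parse_kwargs_str is hypothetical):

import ast

def parse_kwargs_str(s):
    """Parse 'a=1,b=two' into {'a': 1, 'b': 'two'} (assumed format)."""
    if not s:
        return {}
    kwargs = {}
    for pair in s.split(","):
        key, value = pair.split("=", 1)
        try:
            # literal_eval handles numbers, booleans, lists, ...
            kwargs[key.strip()] = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            kwargs[key.strip()] = value  # fall back to the raw string
    return kwargs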
Example #3
def train(
    output_dir,
    model=gin.REQUIRED,
    data=gin.REQUIRED,
    eval_metric=None,
    eval_train=False,
    eval_skip=[],
    trainer_cls=SeqModelTrainer,
    eval_report=None,
    # shared
    batch_size=256,
    # train-specific
    epochs=100,
    early_stop_patience=4,
    train_epoch_frac=1.0,
    valid_epoch_frac=1.0,
    train_samples_per_epoch=None,
    validation_samples=None,
    train_batch_sampler=None,
    stratified_sampler_p=None,
    tensorboard=True,
    seed=None,
    # specified by bpnet_train
    in_memory=False,
    num_workers=8,
    gpu=None,
    memfrac_gpu=None,
    cometml_experiment=None,
    wandb_run=None,
):
    """Main entry point to configure in the gin config

    Args:
      model: compiled keras model
      data: tuple of (train, valid) Datasets
      eval_train: if True, also compute the evaluation metrics for the final model
        on the training set
      eval_report: path to the evaluation report ipynb. If None, the report
        is not generated.
      eval_skip (List[str]): datasets to skip during evaluation
      seed: random seed to use (in numpy and tensorflow)
    """
    # from this point on, no configurable should be added. Save the gin config
    log_gin_config(output_dir, cometml_experiment, wandb_run)

    train_dataset, valid_dataset = data[0], data[1]

    if eval_report is not None:
        eval_report = os.path.abspath(os.path.expanduser(eval_report))
        if not os.path.exists(eval_report):
            raise ValueError(f"Evaluation report {eval_report} doesn't exist")

    if seed is not None:
        # Set the random seed
        import random
        random.seed(seed)
        np.random.seed(seed)
        try:
            import tensorflow as tf
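            # tf.set_random_seed is the TF1 API; under TF2 this raises an
            # AttributeError and falls through to the except branch below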
            tf.set_random_seed(seed)
        except Exception:
            logger.info("Unable to set random seed for tensorflow")

    # make sure the validation dataset names are unique
    if isinstance(valid_dataset, list):
        dataset_names = []
        for d in valid_dataset:
            dataset_name = d[0]
            if dataset_name in dataset_names:
                raise ValueError("The dataset names are not unique")
            dataset_names.append(dataset_name)

    if stratified_sampler_p is not None and train_batch_sampler is not None:
        raise ValueError(
            "stratified_sampler_p and train_batch_sampler are mutually exclusive."
            " Please specify only one of them.")

    if stratified_sampler_p is not None and train_batch_sampler is None:
        # HACK - there is no guarantee that train_dataset.get_targets() will exist
        # Maybe we have to introduce a ClassificationDataset instead which will
        # always implement get_targets()
        logger.info(
            f"Using stratified samplers with p: {stratified_sampler_p}")
        train_batch_sampler = samplers.StratifiedRandomBatchSampler(
            train_dataset.get_targets().max(axis=1),
            batch_size=batch_size,
            p_vec=stratified_sampler_p,
            verbose=True)

    num_workers_orig = num_workers  # remember the old number of workers before overwriting it
    if in_memory:
        # load the training datasets to memory
        logger.info("Loading the training data into memory")
        train_dataset = NumpyDataset(
            train_dataset.load_all(batch_size=batch_size,
                                   num_workers=num_workers))
        logger.info("Loading the validation data into memory")
        if isinstance(valid_dataset, list):
            # appropriately handle the scenario where multiple
            # validation data may be provided as a list of (name, Dataset) tuples
            valid_dataset = [(k,
                              NumpyDataset(
                                  data.load_all(batch_size=batch_size,
                                                num_workers=num_workers)))
                             for k, data in valid_dataset]
        else:
            # only a single Dataset was provided
            valid_dataset = NumpyDataset(
                valid_dataset.load_all(batch_size=batch_size,
                                       num_workers=num_workers))

        num_workers = 1  # don't use multi-processing any more

    tr = trainer_cls(model, train_dataset, valid_dataset, output_dir,
                     cometml_experiment, wandb_run)

    tr.train(batch_size=batch_size,
             epochs=epochs,
             early_stop_patience=early_stop_patience,
             num_workers=num_workers,
             train_epoch_frac=train_epoch_frac,
             valid_epoch_frac=valid_epoch_frac,
             train_samples_per_epoch=train_samples_per_epoch,
             validation_samples=validation_samples,
             train_batch_sampler=train_batch_sampler,
             tensorboard=tensorboard)
    final_metrics = tr.evaluate(eval_metric,
                                batch_size=batch_size,
                                num_workers=num_workers,
                                eval_train=eval_train,
                                eval_skip=eval_skip,
                                save=True)
    logger.info("Done!")
    print("-" * 40)
    print("Final metrics: ")
    print(json.dumps(final_metrics, cls=NumpyAwareJSONEncoder, indent=2))

    if eval_report is not None:
        logger.info("Running the evaluation report")
        # Release the GPU
        K.clear_session()

        # remove memory
        del tr, train_dataset, valid_dataset, data
        gc.collect()

        if num_workers_orig != num_workers:
            # recover the original number of workers
            num_workers = num_workers_orig

        # Run the jupyter notebook
        render_ipynb(eval_report,
                     os.path.join(output_dir, os.path.basename(eval_report)),
                     params=dict(model_dir=os.path.abspath(output_dir),
                                 gpu=gpu,
                                 memfrac_gpu=memfrac_gpu,
                                 in_memory=in_memory,
                                 num_workers=num_workers))

    # upload all files in output_dir to comet.ml
    # Note: wandb does this automatically
    if cometml_experiment is not None:
        logger.info("Uploading files to comet.ml")
        cometml_experiment.log_asset_folder(folder=output_dir)

    logger.info(
        f"Done training and evaluating the model. Model and metrics can be found in: {output_dir}"
    )

    return final_metrics
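Since model and data are gin.REQUIRED, they have to be bound in a gin config before train() is invoked. A minimal, purely illustrative sketch: build_model and load_data are hypothetical gin-configurables, and train is assumed to be registered with @gin.configurable (as it is in bpnet).

import gin

# Hypothetical bindings; in bpnet these would live in a .gin config file.
gin.parse_config("""
train.model = @build_model()  # must return a compiled keras model
train.data = @load_data()     # must return (train, valid) Datasets
train.batch_size = 128
train.epochs = 50
""")

final_metrics = train(output_dir="output/")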
Example #4
def modisco_report(modisco_dir, output_dir):
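    # this_path is assumed to be defined at module level,
    # e.g. this_path = os.path.dirname(os.path.abspath(__file__))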
    render_ipynb(os.path.join(this_path, "../templates/modisco-chip.ipynb"),
                 os.path.join(output_dir, "modisco-chip.ipynb"),
                 params=dict(modisco_dir=modisco_dir))
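All four examples funnel into render_ipynb, which executes a parameterized notebook template and writes the rendered copy next to the outputs. A common way to build such a helper is papermill; the sketch below illustrates the pattern and is not bpnet's actual implementation:

import papermill as pm

def render_ipynb_sketch(template_ipynb, rendered_ipynb, params=None):
    # Execute the template notebook, injecting `params` into its
    # tagged 'parameters' cell, and save the executed copy.
    pm.execute_notebook(template_ipynb,
                        rendered_ipynb,
                        parameters=params or {})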