Example #1
def test_create_model(rnn):
    got = model.create_model(model.Options(attention=True,
                                           rnn=rnn)).get_config()
    got = json.loads(json.dumps(got))
    tfversion = version.parse(tf.__version__)
    with pathlib.Path(__file__).with_suffix('.json').open('r') as file:
        expected = json.load(file)[f"{tfversion.major}.{tfversion.minor}"]
    assert got == expected[rnn]
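The test above depends on names defined elsewhere in its test module; a minimal sketch of the assumed imports and a pytest parametrization that could supply `rnn` (the module alias and the RNN identifiers are guesses, not taken from the source):

import json
import pathlib

import pytest
import tensorflow as tf
from packaging import version

import deepgrp.model as model   # assumed alias for the deepgrp model module


# Assumed fixture: run the test once per supported RNN type.
@pytest.fixture(params=["GRU", "LSTM"])
def rnn(request):
    return request.param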
Example #2
    def train(args: argparse.Namespace, options: dgmodel.Options) -> None:
        """Train deepgrp.

        Args:
            args (argparse.Namespace): Command line arguments.
                Includes: 'parameter', 'trainfile', 'validfile', 'bedfile',
                'logdir', 'modelfile'
            options (dgmodel.Options): Default and user provided options.
        """
        with open(args.parameter, "r") as file:
            parameter = dgmodel.Options.from_toml(file)
        parameter.fromdict(options.todict())
        logdir = args.logdir

        # get which chromosome is used for training and which for validation
        train_chr = os.path.basename(args.trainfile).split('.')[0]
        val_chr = os.path.basename(args.validfile).split('.')[0]

        # create the log directory if it does not exist
        if not os.path.isdir(logdir):
            os.mkdir(logdir)

        # load data
        _LOG.info("Loading in all data necessary from %s, %s, %s",
                  args.trainfile, args.validfile, args.bedfile)
        train_fwd = np.load(args.trainfile, allow_pickle=False)['fwd']
        val_fwd = np.load(args.validfile, allow_pickle=False)['fwd']

        # preprocess
        ## preprocess y (bedfile)
        y_train = dgpreprocess.preprocess_y(args.bedfile, train_chr,
                                            train_fwd.shape[1],
                                            parameter.repeats_to_search)
        y_val = dgpreprocess.preprocess_y(args.bedfile, val_chr,
                                          val_fwd.shape[1],
                                          parameter.repeats_to_search)

        ## preprocess training and validation data
        train_fwd, y_train = dgpreprocess.drop_start_end_n(train_fwd, y_train)
        val_fwd, y_val = dgpreprocess.drop_start_end_n(val_fwd, y_val)
        train_data = dgpreprocess.Data(train_fwd, y_train)
        val_data = dgpreprocess.Data(val_fwd, y_val)

        # training
        _LOG.info("Creating model for training")
        model = dgmodel.create_model(parameter)
        _LOG.info("Training Model")
        dgtrain.training((train_data, val_data), parameter, model, logdir)

        # save model in h5 format
        _LOG.info("Saving model as %s", args.modelfile)
        model.save(args.modelfile)
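`train` reads the TOML hyperparameter file, merges in user-provided options, preprocesses the forward-strand data and repeat annotations, trains the model, and saves it as HDF5. A minimal sketch of a call, assuming `train` is importable from the module that defines it and using purely illustrative file paths:

import argparse

import deepgrp.model as dgmodel   # assumed alias used by the code above

# All paths below are placeholders, not files shipped with the project.
args = argparse.Namespace(parameter="parameter.toml",
                          trainfile="chr1.fa.gz.npz",
                          validfile="chr2.fa.gz.npz",
                          bedfile="hg19_repeats.bed",
                          logdir="logs",
                          modelfile="deepgrp_model.h5")
train(args, dgmodel.Options())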
Example #3
def setup_prediction_from_options_checkpoint(options: Options,
                                             logdir: PathLike) -> keras.Model:
    """Creates a DeepGRP model and loads latest weights from Checkpoint.

    Args:
        options (deepgrp.model.Options): Hyperparameter
        logdir (PathLike): Directory of the checkpoint.

    Returns:
        keras.Model: compiled functional DeepGRP model
                        with restored weights.

    """
    model = create_model(options)

    ckpt = tf.train.Checkpoint()
    manager = tf.train.CheckpointManager(ckpt, logdir, max_to_keep=None)
    model.load_weights(manager.latest_checkpoint).expect_partial()
    return model
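Usage could look like the following, assuming the hyperparameters are read from the same TOML file used for training and `logdir` points at the checkpoint directory (both paths are illustrative):

# Hypothetical usage of setup_prediction_from_options_checkpoint.
with open("parameter.toml") as file:
    options = Options.from_toml(file)

model = setup_prediction_from_options_checkpoint(options, "logs/run-01")
model.summary()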
Example #4
def test_predict(monkeypatch, tmpdir, mss_bool):
    # helper functions:
    def dgpred_predict_dummy(model, data_iterator, output_shape, step_size):
        assert isinstance(model, tf.keras.Model)
        assert isinstance(data_iterator, tf.data.Dataset)
        assert isinstance(output_shape[0], int) and isinstance(
            output_shape[1], int)
        assert step_size == 3
        return np.ones((100, 5))

    def apply_mss_dummy(prediction, options):
        assert isinstance(prediction, np.ndarray)
        assert isinstance(options, dgmodel.Options)
        assert mss_bool
        return np.ones((100, 5))

    def softmax_dummy(prediction):
        assert isinstance(prediction, np.ndarray)
        assert not mss_bool
        return np.ones((100, 5))

    monkeypatch.setattr(dgpred, "predict", dgpred_predict_dummy)
    monkeypatch.setattr(dgpred, "apply_mss", apply_mss_dummy)
    monkeypatch.setattr(dgpred, "softmax", softmax_dummy)

    # variables to give for testing
    opt = dgmodel.Options(project_root_dir=str(tmpdir),
                          n_batches=1,
                          n_epochs=1,
                          batch_size=10,
                          vecsize=10)
    dnasequence = "".join(
        np.random.choice(["N", "A", "C", "G", "T"], size=(100)))
    model = dgmodel.create_model(opt)
    dgparser._predict(  # pylint: disable=protected-access
        dnasequence=dnasequence,
        model=model,
        options=opt,
        step_size=3,
        use_mss=mss_bool)
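`mss_bool` decides whether the prediction is post-processed with `apply_mss` or with `softmax`; it is presumably supplied by pytest parametrization, roughly like this (the decorator below is an assumption, not part of the source):

import pytest

# Run test_predict once for each post-processing branch.
@pytest.mark.parametrize("mss_bool", [True, False])
def test_predict(monkeypatch, tmpdir, mss_bool):
    ...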
Example #5
def build_and_optimize(
        train_data: Data, val_data: Data, step_size: int, options: Options,
        options_dict: Dict[str, Union[str, float]]) -> Dict[str, Any]:
    """Builds an DeepGRP model with updated options,
    trains it and validates it. Used for hyperopt optimization.

    Args:
        train_data (deepgrp.preprocessing.Data): Training data.
        val_data (deepgrp.preprocessing.Data): Validation data.
        step_size (int): Window size for the final evaluation.
        options (Options): General hyperparameters.
        options_dict (Dict[str, Union[str, float]]): Varying hyperparameters.

    Returns:
        Dict[str, Any]: Dictionary with results (Hyperopt compatible).

    """

    options = _update_options(options, options_dict)
    logdir = create_logdir(options)

    def _train_test(model):  # pragma: no cover
        extra_callback = [hp.KerasCallback(logdir, options_dict)]
        training((train_data, val_data), options, model, logdir,
                 extra_callback)
        K.clear_session()
        predictions = dgpred.predict_complete(step_size,
                                              options,
                                              logdir,
                                              val_data,
                                              use_mss=True)
        K.clear_session()
        is_not_na = np.logical_not(np.isnan(predictions[:, 0]))
        predictions_class = predictions[is_not_na].argmax(axis=1)
        dgpred.filter_segments(predictions_class, options.min_mss_len)
        _, metrics = dgpred.calculate_metrics(
            predictions_class, val_data.truelbl[:, is_not_na].argmax(axis=0))
        return metrics

    results = {
        'loss': np.inf,
        'Metrics': None,
        'options': options.todict(),
        'logdir': None,
        'status': STATUS_FAIL,
        'error': "",
    }
    # create the summary writer before the try block so the finally clause
    # can always close it, even if building the model fails
    file_writer = tf.summary.create_file_writer(logdir)
    try:
        K.clear_session()
        model = create_model(options)
        file_writer.set_as_default()
        metrics = _train_test(model)
        tfsum.scalar('MCC',
                     metrics['MCC'],
                     step=0,
                     description="Matthews correlation coefficient")
    except Exception as err:  # pylint: disable=broad-except
        _LOGGER.exception("Error occurred while training")
        results["error"] = str(err)
        results["status"] = STATUS_FAIL
    else:
        results["logdir"] = logdir
        results["loss"] = -1 * metrics['MCC']

        results["status"] = STATUS_OK
        results["Metrics"] = metrics
        if np.isnan(results["loss"]):
            results["status"] = STATUS_FAIL
            results["loss"] = np.inf
    finally:
        file_writer.close()
    if results["status"] == STATUS_FAIL and results["logdir"]:
        shutil.rmtree(results["logdir"])
    return results
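Because the returned dictionary follows hyperopt's conventions ('loss', 'status', STATUS_OK/STATUS_FAIL), `build_and_optimize` can be plugged into `hyperopt.fmin` as an objective. A minimal sketch with an assumed search space; `train_data`, `val_data`, `step_size`, and `options` would be prepared as in the training example above:

from functools import partial

from hyperopt import Trials, fmin, tpe
from hyperopt import hp as hyperopt_hp   # aliased to avoid clashing with the hparams `hp` above

# Assumed search space; the hyperparameters actually tuned by the project may differ.
space = {
    "learning_rate": hyperopt_hp.loguniform("learning_rate", -9, -3),
    "units": hyperopt_hp.choice("units", [16, 32, 64]),
}

objective = partial(build_and_optimize, train_data, val_data, step_size, options)
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=25, trials=trials)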