def test_create_model(rnn):
    # build the model config and round-trip it through JSON so that it is
    # normalized the same way as the stored expectation
    got = model.create_model(model.Options(attention=True, rnn=rnn)).get_config()
    got = json.loads(json.dumps(got))
    tfversion = version.parse(tf.__version__)
    # expected configs are stored per TensorFlow major.minor version
    # in a JSON file next to this test module
    with pathlib.Path(__file__).with_suffix('.json').open('r') as file:
        expected = json.load(file)[f"{tfversion.major}.{tfversion.minor}"]
    assert got == expected[rnn]
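# The test above receives the RNN cell type through a pytest fixture. A minimal
# sketch of how such a fixture could be parametrized is shown here; the
# parameter values ("GRU", "LSTM") are assumptions and may differ from the
# repository's actual conftest.
import pytest


@pytest.fixture(params=["GRU", "LSTM"])  # assumed cell identifiers
def rnn(request):
    """Yield each supported RNN cell type once per test run."""
    return request.param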
def train(args: argparse.Namespace, options: dgmodel.Options) -> None:
    """Train deepgrp.

    Args:
        args (argparse.Namespace): Command line arguments. Includes:
            'parameter', 'trainfile', 'validfile', 'bedfile', 'logdir',
            'modelfile'.
        options (dgmodel.Options): Default and user provided options.
    """
    with open(args.parameter, "r") as file:
        parameter = dgmodel.Options.from_toml(file)
    parameter.fromdict(options.todict())
    logdir = args.logdir

    # derive which chromosome is used for training and which for validation
    train_chr = os.path.basename(args.trainfile).split('.')[0]
    val_chr = os.path.basename(args.validfile).split('.')[0]

    # create the log directory if it does not exist yet
    if not os.path.isdir(logdir):
        os.mkdir(logdir)

    # load data
    _LOG.info("Loading in all data necessary from %s, %s, %s", args.trainfile,
              args.validfile, args.bedfile)
    train_fwd = np.load(args.trainfile, allow_pickle=False)['fwd']
    val_fwd = np.load(args.validfile, allow_pickle=False)['fwd']

    # preprocess labels (bedfile)
    y_train = dgpreprocess.preprocess_y(args.bedfile, train_chr,
                                        train_fwd.shape[1],
                                        parameter.repeats_to_search)
    y_val = dgpreprocess.preprocess_y(args.bedfile, val_chr, val_fwd.shape[1],
                                      parameter.repeats_to_search)

    # preprocess training and validation data
    train_fwd, y_train = dgpreprocess.drop_start_end_n(train_fwd, y_train)
    val_fwd, y_val = dgpreprocess.drop_start_end_n(val_fwd, y_val)
    train_data = dgpreprocess.Data(train_fwd, y_train)
    val_data = dgpreprocess.Data(val_fwd, y_val)

    # training
    _LOG.info("Creating model for training")
    model = dgmodel.create_model(parameter)
    _LOG.info("Training Model")
    dgtrain.training((train_data, val_data), parameter, model, logdir)

    # save model in HDF5 format
    _LOG.info("Saving model as %s", args.modelfile)
    model.save(args.modelfile)
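# A hypothetical invocation sketch for train(); all file names below are
# placeholders for illustration and are not taken from the repository.
import argparse

import deepgrp.model as dgmodel

args = argparse.Namespace(
    parameter="parameter.toml",    # assumed TOML file with hyperparameters
    trainfile="chr1.fa.gz.npz",    # assumed training chromosome ('fwd' array)
    validfile="chr2.fa.gz.npz",    # assumed validation chromosome
    bedfile="repeats.bed",         # assumed repeat annotations
    logdir="logs",
    modelfile="deepgrp_model.h5",
)
train(args, dgmodel.Options())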
def setup_prediction_from_options_checkpoint(options: Options,
                                             logdir: PathLike) -> keras.Model:
    """Creates a DeepGRP model and loads the latest weights from a checkpoint.

    Args:
        options (deepgrp.model.Options): Hyperparameter.
        logdir (PathLike): Directory of the checkpoint.

    Returns:
        keras.Model: Compiled functional DeepGRP model with restored weights.
    """
    model = create_model(options)
    # the Checkpoint/CheckpointManager pair is only used to locate the most
    # recent checkpoint file in logdir; the weights are then loaded via Keras
    ckpt = tf.train.Checkpoint()
    manager = tf.train.CheckpointManager(ckpt, logdir, max_to_keep=None)
    model.load_weights(manager.latest_checkpoint).expect_partial()
    return model
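# Minimal usage sketch, assuming a checkpoint directory from a previous
# training run exists; "logs/run_01" is a placeholder path.
options = Options()  # default hyperparameters
restored = setup_prediction_from_options_checkpoint(options, "logs/run_01")
restored.summary()   # inspect the restored architecture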
def test_predict(monkeypatch, tmpdir, mss_bool):
    # dummy replacements for the heavy prediction pipeline; they only check
    # that _predict forwards the expected argument types and values
    def dgpred_predict_dummy(model, data_iterator, output_shape, step_size):
        assert isinstance(model, tf.keras.Model)
        assert isinstance(data_iterator, tf.data.Dataset)
        assert isinstance(output_shape[0], int) and isinstance(
            output_shape[1], int)
        assert step_size == 3
        return np.ones((100, 5))

    def apply_mss_dummy(prediction, options):
        assert isinstance(prediction, np.ndarray)
        assert isinstance(options, dgmodel.Options)
        assert mss_bool
        return np.ones((100, 5))

    def softmax_dummy(prediction):
        assert isinstance(prediction, np.ndarray)
        assert not mss_bool
        return np.ones((100, 5))

    monkeypatch.setattr(dgpred, "predict", dgpred_predict_dummy)
    monkeypatch.setattr(dgpred, "apply_mss", apply_mss_dummy)
    monkeypatch.setattr(dgpred, "softmax", softmax_dummy)

    # inputs for the function under test
    opt = dgmodel.Options(project_root_dir=str(tmpdir),
                          n_batches=1,
                          n_epochs=1,
                          batch_size=10,
                          vecsize=10)
    dnasequence = "".join(
        np.random.choice(["N", "A", "C", "G", "T"], size=(100)))
    model = dgmodel.create_model(opt)
    dgparser._predict(  # pylint: disable=protected-access
        dnasequence=dnasequence,
        model=model,
        options=opt,
        step_size=3,
        use_mss=mss_bool)
def build_and_optimize(
        train_data: Data, val_data: Data, step_size: int, options: Options,
        options_dict: Dict[str, Union[str, float]]) -> Dict[str, Any]:
    """Builds a DeepGRP model with updated options, trains and validates it.

    Used for hyperopt optimization.

    Args:
        train_data (deepgrp.preprocessing.Data): Training data.
        val_data (deepgrp.preprocessing.Data): Validation data.
        step_size (int): Window size for the final evaluation.
        options (Options): General hyperparameter.
        options_dict (Dict[str, Union[str, float]]): Varying hyperparameter.

    Returns:
        Dict[str, Any]: Dictionary with results (hyperopt compatible).
    """
    options = _update_options(options, options_dict)
    logdir = create_logdir(options)

    def _train_test(model):  # pragma: no cover
        extra_callback = [hp.KerasCallback(logdir, options_dict)]
        training((train_data, val_data), options, model, logdir,
                 extra_callback)
        K.clear_session()
        predictions = dgpred.predict_complete(step_size,
                                              options,
                                              logdir,
                                              val_data,
                                              use_mss=True)
        K.clear_session()
        # evaluate only positions with a defined prediction
        is_not_na = np.logical_not(np.isnan(predictions[:, 0]))
        predictions_class = predictions[is_not_na].argmax(axis=1)
        dgpred.filter_segments(predictions_class, options.min_mss_len)
        _, metrics = dgpred.calculate_metrics(
            predictions_class, val_data.truelbl[:, is_not_na].argmax(axis=0))
        return metrics

    results = {
        'loss': np.inf,
        'Metrics': None,
        'options': options.todict(),
        'logdir': None,
        'status': STATUS_FAIL,
        'error': "",
    }
    file_writer = None  # keep a handle so the finally block can close it safely
    try:
        K.clear_session()
        model = create_model(options)
        file_writer = tf.summary.create_file_writer(logdir)
        file_writer.set_as_default()
        metrics = _train_test(model)
        tfsum.scalar('MCC',
                     metrics['MCC'],
                     step=0,
                     description="Matthews correlation coefficient")
    except Exception as err:  # pylint: disable=broad-except
        _LOGGER.exception("Error occurred while training")
        results["error"] = str(err)
        results["status"] = STATUS_FAIL
    else:
        results["logdir"] = logdir
        results["loss"] = -1 * metrics['MCC']
        results["status"] = STATUS_OK
        results["Metrics"] = metrics
        if np.isnan(results["loss"]):
            results["status"] = STATUS_FAIL
            results["loss"] = np.inf
    finally:
        if file_writer is not None:
            file_writer.close()
    if results["status"] == STATUS_FAIL and results["logdir"]:
        shutil.rmtree(results["logdir"])
    return results
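# Hypothetical hyperopt driver for build_and_optimize(); the search-space keys
# ("units", "learning_rate") are assumptions, not the repository's actual space,
# and train_data, val_data, step_size and options are expected to exist already.
from functools import partial

from hyperopt import Trials, fmin, tpe
from hyperopt import hp as hopt

search_space = {
    "units": hopt.choice("units", [16, 32, 64]),
    "learning_rate": hopt.loguniform("learning_rate", -9, -4),
}
objective = partial(build_and_optimize, train_data, val_data, step_size,
                    options)
trials = Trials()
best = fmin(objective,
            search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)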