def test_save_latest_and_best(self):
        model = FakeModel(params=params)
        mr = ModelRunner(model=model,
                         group_name='latest_and_best',
                         training=True,
                         params=params,
                         write_summary=False)
        mr.train(ModelRunnerTraining.dataset,
                 ModelRunnerTraining.val_dataset,
                 num_epochs=100)

        latest_checkpoint_dir = mr.trial_path / 'latest_checkpoint' / 'checkpoint'
        best_checkpoint_dir = mr.trial_path / 'best_checkpoint' / 'checkpoint'
        self.assertTrue(latest_checkpoint_dir.exists())
        self.assertTrue(best_checkpoint_dir.exists())

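        # Restoring into a fresh Checkpoint that tracks only `step` reads back
        # just the saved global step; expect_partial() silences warnings about
        # the model variables deliberately left unrestored.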
        latest_checkpoint = mr.latest_checkpoint_manager.latest_checkpoint
        latest_ckpt = tf.train.Checkpoint(step=tf.Variable(1))
        latest_ckpt.restore(latest_checkpoint).expect_partial()
        latest_checkpoint_step = latest_ckpt.step.numpy()

        best_checkpoint = mr.best_checkpoint_manager.latest_checkpoint
        best_ckpt = tf.train.Checkpoint(step=tf.Variable(1))
        best_ckpt.restore(best_checkpoint).expect_partial()
        best_checkpoint_step = best_ckpt.step.numpy()
        self.assertLess(best_checkpoint_step, latest_checkpoint_step)
def test_train(self):
        model = FakeModel(params=params)
        mr = ModelRunner(model=model,
                         training=True,
                         params=params,
                         write_summary=False)
        mr.train(ModelRunnerTraining.dataset,
                 ModelRunnerTraining.val_dataset,
                 num_epochs=1)
def eval_main(dataset_dirs: List[pathlib.Path],
              mode: str,
              batch_size: int,
              use_gt_rope: bool,
              threshold: Optional[float] = None,
              old_compat: bool = False,
              take: Optional[int] = None,
              checkpoint: Optional[pathlib.Path] = None,
              trials_directory: Optional[pathlib.Path] = None,
              **kwargs):
    ###############
    # Model
    ###############
    trial_path = checkpoint.parent.absolute()
    _, params = filepath_tools.create_or_load_trial(
        trial_path=trial_path, trials_directory=trials_directory)
    model_class = link_bot_classifiers.get_model(params['model_class'])

    ###############
    # Dataset
    ###############
    dataset = ClassifierDatasetLoader(dataset_dirs,
                                      load_true_states=True,
                                      use_gt_rope=use_gt_rope,
                                      old_compat=old_compat,
                                      threshold=threshold)
    tf_dataset = dataset.get_datasets(mode=mode, take=take)
    tf_dataset = balance(tf_dataset)

    ###############
    # Evaluate
    ###############
    tf_dataset = batch_tf_dataset(tf_dataset, batch_size, drop_remainder=True)

    model = model_class(hparams=params,
                        batch_size=batch_size,
                        scenario=dataset.scenario)
    # Constructing the ModelRunner restores the model weights from the checkpoint
    runner = ModelRunner(model=model,
                         training=False,
                         params=params,
                         checkpoint=checkpoint,
                         trial_path=trial_path,
                         key_metric=AccuracyMetric,
                         batch_metadata=dataset.batch_metadata)

    metrics = runner.val_epoch(tf_dataset)
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name:30s}: {metric_value}")
    return metrics
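# A hypothetical invocation of eval_main above; the dataset and checkpoint
# paths are illustrative, not taken from the source:
#   metrics = eval_main(dataset_dirs=[pathlib.Path('classifier_data/my_dataset')],
#                       mode='val', batch_size=32, use_gt_rope=True,
#                       checkpoint=pathlib.Path('trials/my_trial/latest_checkpoint/ckpt-10'))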
def train_main(
    dataset_dirs: List[pathlib.Path],
    model_hparams: pathlib.Path,
    log: str,
    batch_size: int,
    epochs: int,
    seed: int,
    use_gt_rope: bool,
    checkpoint: Optional[pathlib.Path] = None,
    ensemble_idx: Optional[int] = None,
    take: Optional[int] = None,
    trials_directory: Optional[pathlib.Path] = None,
):
    print(Fore.CYAN + f"Using seed {seed}")

    model_hparams = hjson.load(model_hparams.open('r'))
    model_class = state_space_dynamics.get_model(model_hparams['model_class'])

    train_dataset = DynamicsDatasetLoader(dataset_dirs,
                                          use_gt_rope=use_gt_rope)
    val_dataset = DynamicsDatasetLoader(dataset_dirs, use_gt_rope=use_gt_rope)

    model_hparams.update(
        setup_hparams(batch_size, dataset_dirs, seed, train_dataset,
                      use_gt_rope))
    model = model_class(hparams=model_hparams,
                        batch_size=batch_size,
                        scenario=train_dataset.scenario)

    checkpoint_name, trial_path = setup_training_paths(checkpoint,
                                                       ensemble_idx, log,
                                                       model_hparams,
                                                       trials_directory)

    runner = ModelRunner(model=model,
                         training=True,
                         params=model_hparams,
                         checkpoint=checkpoint,
                         batch_metadata=train_dataset.batch_metadata,
                         trial_path=trial_path)

    train_tf_dataset, val_tf_dataset = setup_datasets(model_hparams,
                                                      batch_size, seed,
                                                      train_dataset,
                                                      val_dataset, take)

    runner.train(train_tf_dataset, val_tf_dataset, num_epochs=epochs)

    return trial_path
def eval_main(
    dataset_dirs: List[pathlib.Path],
    checkpoint: pathlib.Path,
    mode: str,
    batch_size: int,
    use_gt_rope: bool,
):
    test_dataset = DynamicsDatasetLoader(dataset_dirs, use_gt_rope=use_gt_rope)

    trials_directory = pathlib.Path('trials').absolute()
    trial_path = checkpoint.parent.absolute()
    _, params = filepath_tools.create_or_load_trial(
        trial_path=trial_path, trials_directory=trials_directory)
    model_class = state_space_dynamics.get_model(params['model_class'])
    net = model_class(hparams=params,
                      batch_size=batch_size,
                      scenario=test_dataset.scenario)

    runner = ModelRunner(model=net,
                         training=False,
                         checkpoint=checkpoint,
                         batch_metadata=test_dataset.batch_metadata,
                         trial_path=trial_path,
                         params=params)

    test_tf_dataset = test_dataset.get_datasets(mode=mode)
    test_tf_dataset = batch_tf_dataset(test_tf_dataset,
                                       batch_size,
                                       drop_remainder=True)
    validation_metrics = runner.val_epoch(test_tf_dataset)
    for name, value in validation_metrics.items():
        print(f"{name}: {value}")

    # Additional metrics that cannot be expressed as a simple average of per-batch metrics
    all_errors = None
    for batch in test_tf_dataset:
        outputs = runner.model(batch, training=False)
        errors_for_batch = test_dataset.scenario.classifier_distance(
            outputs, batch)
        if all_errors is not None:
            all_errors = tf.concat([all_errors, errors_for_batch], axis=0)
        else:
            all_errors = errors_for_batch
    print(f"90th percentile {np.percentile(all_errors.numpy(), 90)}")
    print(f"95th percentile {np.percentile(all_errors.numpy(), 95)}")
    print(f"99th percentile {np.percentile(all_errors.numpy(), 99)}")
    print(f"max {np.max(all_errors.numpy())}")
def train_main(args):
    dataset_dirs = args.dataset_dirs
    checkpoint = args.checkpoint
    epochs = args.epochs
    trial_path, params = load_trial(checkpoint.parent.absolute())
    now = str(time())
    trial_path = trial_path.parent / (trial_path.name + '-observer-' + now)
    trial_path.mkdir(parents=True)
    batch_size = params['batch_size']
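    # Freeze the encoder and swap the CFM loss for the observation-feature
    # loss, presumably to train an observer head on top of the pretrained
    # encoder (the trial is renamed with an '-observer-' suffix above).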
    params['encoder_trainable'] = False
    params['use_observation_feature_loss'] = True
    params['use_cfm_loss'] = False
    out_hparams_filename = trial_path / 'params.json'
    out_params_str = json.dumps(params)
    with out_hparams_filename.open("w") as out_hparams_file:
        out_hparams_file.write(out_params_str)

    train_dataset = DynamicsDatasetLoader(dataset_dirs)
    val_dataset = DynamicsDatasetLoader(dataset_dirs)

    model_class = state_space_dynamics.get_model(params['model_class'])
    model = model_class(hparams=params,
                        batch_size=batch_size,
                        scenario=train_dataset.scenario)

    seed = 0

    runner = ModelRunner(model=model,
                         training=True,
                         params=params,
                         checkpoint=checkpoint,
                         batch_metadata=train_dataset.batch_metadata,
                         trial_path=trial_path)

    train_tf_dataset, val_tf_dataset = train_test.setup_datasets(
        model_hparams=params,
        batch_size=batch_size,
        seed=seed,
        train_dataset=train_dataset,
        val_dataset=val_dataset)

    runner.train(train_tf_dataset, val_tf_dataset, num_epochs=epochs)

    return trial_path
def compute_classifier_threshold(
    dataset_dirs: List[pathlib.Path],
    checkpoint: pathlib.Path,
    mode: str,
    batch_size: int,
    use_gt_rope: bool,
):
    test_dataset = DynamicsDatasetLoader(dataset_dirs, use_gt_rope=use_gt_rope)

    trials_directory = pathlib.Path('trials').absolute()
    trial_path = checkpoint.parent.absolute()
    _, params = filepath_tools.create_or_load_trial(
        trial_path=trial_path, trials_directory=trials_directory)
    model_class = state_space_dynamics.get_model(params['model_class'])
    net = model_class(hparams=params,
                      batch_size=batch_size,
                      scenario=test_dataset.scenario)

    runner = ModelRunner(model=net,
                         training=False,
                         checkpoint=checkpoint,
                         batch_metadata=test_dataset.batch_metadata,
                         trial_path=trial_path,
                         params=params)

    test_tf_dataset = test_dataset.get_datasets(mode=mode)
    test_tf_dataset = batch_tf_dataset(test_tf_dataset,
                                       batch_size,
                                       drop_remainder=True)

    all_errors = None
    for batch in test_tf_dataset:
        outputs = runner.model(batch, training=False)
        errors_for_batch = test_dataset.scenario.classifier_distance(
            batch, outputs)
        if all_errors is not None:
            all_errors = tf.concat([all_errors, errors_for_batch], axis=0)
        else:
            all_errors = errors_for_batch

    classifier_threshold = np.percentile(all_errors.numpy(), 90)
    rospy.loginfo(f"90th percentile {classifier_threshold}")
    return classifier_threshold
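# The returned value is presumably what gets passed as the `threshold`
# argument of ClassifierDatasetLoader in the classifier train/eval entry
# points above; that linkage is an assumption, not stated in the source.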
def eval_main(
    dataset_dirs: List[pathlib.Path],
    checkpoint: pathlib.Path,
    mode: str,
    batch_size: int,
    **kwargs,
):
    ###############
    # Model
    ###############
    trial_path = checkpoint.parent.absolute()
    trials_directory = pathlib.Path('recovery_trials').absolute()
    _, params = filepath_tools.create_or_load_trial(
        trial_path=trial_path, trials_directory=trials_directory)
    scenario = get_scenario(params['scenario'])
    net = NNRecoveryModel(hparams=params, scenario=scenario, batch_size=1)

    ###############
    # Dataset
    ###############
    test_dataset = RecoveryDatasetLoader(dataset_dirs)
    test_tf_dataset = test_dataset.get_datasets(mode=mode)

    ###############
    # Evaluate
    ###############
    test_tf_dataset = batch_tf_dataset(test_tf_dataset,
                                       batch_size,
                                       drop_remainder=True)

    runner = ModelRunner(model=net,
                         training=False,
                         params=params,
                         checkpoint=checkpoint,
                         trial_path=trial_path,
                         batch_metadata=test_dataset.batch_metadata)
    validation_metrics = runner.val_epoch(test_tf_dataset)
    for name, value in validation_metrics.items():
        print(f"{name}: {value:.3f}")
def train_main(dataset_dirs: List[pathlib.Path],
               model_hparams: pathlib.Path,
               log: str,
               batch_size: int,
               epochs: int,
               seed: int,
               use_gt_rope: bool,
               checkpoint: Optional[pathlib.Path] = None,
               threshold: Optional[float] = None,
               ensemble_idx: Optional[int] = None,
               old_compat: bool = False,
               take: Optional[int] = None,
               validate: bool = True,
               trials_directory: Optional[pathlib.Path] = None,
               **kwargs):
    model_hparams = hjson.load(model_hparams.open('r'))
    model_class = link_bot_classifiers.get_model(model_hparams['model_class'])

    # load_true_states=True is helpful when debugging
    train_dataset = ClassifierDatasetLoader(
        dataset_dirs=dataset_dirs,
        load_true_states=True,
        use_gt_rope=use_gt_rope,
        threshold=threshold,
        old_compat=old_compat,
    )
    val_dataset = ClassifierDatasetLoader(
        dataset_dirs=dataset_dirs,
        load_true_states=True,
        use_gt_rope=use_gt_rope,
        threshold=threshold,
        old_compat=old_compat,
    )

    model_hparams.update(
        setup_hparams(batch_size, dataset_dirs, seed, train_dataset,
                      use_gt_rope))
    model = model_class(hparams=model_hparams,
                        batch_size=batch_size,
                        scenario=train_dataset.scenario)

    checkpoint_name, trial_path = setup_training_paths(checkpoint,
                                                       ensemble_idx, log,
                                                       model_hparams,
                                                       trials_directory)

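    # These knobs presumably mean: run a mid-epoch validation pass of
    # `mid_epoch_val_batches` batches every `val_every_n_batches` training
    # batches, checkpoint at least every `save_every_n_minutes`, and validate
    # once before training begins; disabling validation turns them all off.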
    if validate:
        mid_epoch_val_batches = 100
        val_every_n_batches = 100
        save_every_n_minutes = 20
        validate_first = True
    else:
        mid_epoch_val_batches = None
        val_every_n_batches = None
        save_every_n_minutes = None
        validate_first = False

    runner = ModelRunner(model=model,
                         training=True,
                         params=model_hparams,
                         trial_path=trial_path,
                         key_metric=AccuracyMetric,
                         checkpoint=checkpoint,
                         mid_epoch_val_batches=mid_epoch_val_batches,
                         val_every_n_batches=val_every_n_batches,
                         save_every_n_minutes=save_every_n_minutes,
                         validate_first=validate_first,
                         batch_metadata=train_dataset.batch_metadata)
    train_tf_dataset, val_tf_dataset = setup_datasets(model_hparams,
                                                      batch_size, seed,
                                                      train_dataset,
                                                      val_dataset, take)

    final_val_metrics = runner.train(train_tf_dataset,
                                     val_tf_dataset,
                                     num_epochs=epochs)

    return trial_path, final_val_metrics
def train_main(
    dataset_dirs: List[pathlib.Path],
    model_hparams: pathlib.Path,
    classifier_checkpoint: pathlib.Path,
    log: str,
    batch_size: int,
    epochs: int,
    seed: int,
    checkpoint: Optional[pathlib.Path] = None,
    ensemble_idx: Optional[int] = None,
    trials_directory: Optional[pathlib.Path] = None,
    **kwargs,
):
    ###############
    # Datasets
    ###############
    train_dataset = RecoveryDatasetLoader(dataset_dirs)
    val_dataset = RecoveryDatasetLoader(dataset_dirs)

    ###############
    # Model
    ###############
    model_hparams = json.load(model_hparams.open('r'))
    model_hparams['recovery_dataset_hparams'] = train_dataset.hparams
    model_hparams['batch_size'] = batch_size
    model_hparams['seed'] = seed
    model_hparams['datasets'] = paths_to_json(dataset_dirs)
    model_hparams['latest_training_time'] = int(time.time())
    scenario = get_scenario(model_hparams['scenario'])

    # Dataset preprocessing
    train_tf_dataset = train_dataset.get_datasets(mode='train')
    val_tf_dataset = val_dataset.get_datasets(mode='val')

    train_tf_dataset = batch_tf_dataset(train_tf_dataset,
                                        batch_size,
                                        drop_remainder=True)
    val_tf_dataset = batch_tf_dataset(val_tf_dataset,
                                      batch_size,
                                      drop_remainder=True)

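    # Only the training stream is shuffled; the fixed seed keeps the epoch
    # ordering reproducible across runs.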
    train_tf_dataset = train_tf_dataset.shuffle(buffer_size=512, seed=seed)

    train_tf_dataset = train_tf_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    val_tf_dataset = val_tf_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    model = NNRecoveryModel(hparams=model_hparams,
                            scenario=scenario,
                            batch_size=batch_size)

    ############
    # Initialize weights from classifier model by "restoring" from checkpoint
    ############
    if not checkpoint:
        # load the weights of the conv layers of the classifier's encoder; the remaining layers keep their fresh initialization
        classifier_model = tf.train.Checkpoint(conv_layers=model.conv_layers)
        classifier_root = tf.train.Checkpoint(model=classifier_model)
        classifier_checkpoint_manager = tf.train.CheckpointManager(
            classifier_root, classifier_checkpoint.as_posix(), max_to_keep=1)
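        # This nested Checkpoint(model=Checkpoint(conv_layers=...)) mirrors the
        # object path under which the classifier saved its weights, so restore()
        # matches only the conv-layer variables and leaves the rest untouched.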

        status = classifier_root.restore(
            classifier_checkpoint_manager.latest_checkpoint)
        status.expect_partial()
        status.assert_existing_objects_matched()
        assert classifier_checkpoint_manager.latest_checkpoint is not None
        print(Fore.MAGENTA + "Restored {}".format(
            classifier_checkpoint_manager.latest_checkpoint) + Fore.RESET)
    ############

    trial_path = None
    checkpoint_name = None
    if checkpoint:
        trial_path = checkpoint.parent.absolute()
        checkpoint_name = checkpoint.name
    trials_directory = pathlib.Path('recovery_trials').absolute()
    group_name = log if trial_path is None else None
    trial_path, _ = filepath_tools.create_or_load_trial(
        group_name=group_name,
        params=model_hparams,
        trial_path=trial_path,
        trials_directory=trials_directory,
        write_summary=False)
    runner = ModelRunner(model=model,
                         training=True,
                         params=model_hparams,
                         trial_path=trial_path,
                         val_every_n_batches=1,
                         mid_epoch_val_batches=100,
                         validate_first=True,
                         checkpoint=checkpoint,
                         batch_metadata=train_dataset.batch_metadata)

    # Train
    runner.train(train_tf_dataset, val_tf_dataset, num_epochs=epochs)

    return trial_path