Example #1
def test_demo_mismatch():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    # observation size mismatch
    with pytest.raises(RuntimeError):
        mismatch_obs = setup_test_behavior_specs(False,
                                                 False,
                                                 vector_action_space=2,
                                                 vector_obs_space=9)
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        mismatch_obs)
    # action mismatch
    with pytest.raises(RuntimeError):
        mismatch_act = setup_test_behavior_specs(False,
                                                 False,
                                                 vector_action_space=3,
                                                 vector_obs_space=9)
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        mismatch_act)
    # action type mismatch
    with pytest.raises(RuntimeError):
        mismatch_act_type = setup_test_behavior_specs(True,
                                                      False,
                                                      vector_action_space=[2],
                                                      vector_obs_space=9)
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        mismatch_act_type)
    # number obs mismatch
    with pytest.raises(RuntimeError):
        mismatch_obs_number = setup_test_behavior_specs(False,
                                                        True,
                                                        vector_action_space=2,
                                                        vector_obs_space=9)
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        mismatch_obs_number)
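
For contrast with the mismatch cases above, a minimal happy-path sketch: the perturbed specs suggest test.demo was recorded with an 8-dimensional vector observation and a 2-dimensional continuous action, so a spec built with those values should load without raising. The import path of the test helper is an assumption.

import os

from mlagents.trainers.demo_loader import demo_to_buffer
# Import path assumed; setup_test_behavior_specs is the helper used in the test above.
from mlagents.trainers.tests.mock_brain import setup_test_behavior_specs


def test_demo_matching_spec():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    # Continuous actions, no visual observations, sizes matching the recorded demo
    # (8-dimensional vector observation, 2 continuous actions -- assumed from the tests above).
    matching_spec = setup_test_behavior_specs(False,
                                              False,
                                              vector_action_space=2,
                                              vector_obs_space=8)
    _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                    matching_spec)
    assert demo_buffer.num_experiences > 0
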
Example #2
def test_demo_mismatch():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    # observation mismatch
    with pytest.raises(RuntimeError):
        brain_params_obs = BrainParameters(
            brain_name="test_brain",
            vector_observation_space_size=9,
            camera_resolutions=[],
            vector_action_space_size=[2],
            vector_action_descriptions=[],
            vector_action_space_type=1,
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        brain_params_obs)
    # action mismatch
    with pytest.raises(RuntimeError):
        brain_params_act = BrainParameters(
            brain_name="test_brain",
            vector_observation_space_size=8,
            camera_resolutions=[],
            vector_action_space_size=[3],
            vector_action_descriptions=[],
            vector_action_space_type=1,
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        brain_params_act)
    # action type mismatch
    with pytest.raises(RuntimeError):
        brain_params_type = BrainParameters(
            brain_name="test_brain",
            vector_observation_space_size=8,
            camera_resolutions=[],
            vector_action_space_size=[2],
            vector_action_descriptions=[],
            vector_action_space_type=0,
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        brain_params_type)
    # vis obs mismatch
    with pytest.raises(RuntimeError):
        brain_params_vis = BrainParameters(
            brain_name="test_brain",
            vector_observation_space_size=8,
            camera_resolutions=[[30, 40]],
            vector_action_space_size=[2],
            vector_action_descriptions=[],
            vector_action_space_type=1,
        )
        _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1,
                                        brain_params_vis)
Example #3
    def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
        super().__init__(specs, settings)
        self._ignore_done = True
        self._discriminator_network = DiscriminatorNetwork(specs, settings)
        _, self._demo_buffer = demo_to_buffer(
            settings.demo_path, 1, specs
        )  # This is supposed to be the sequence length but we do not have access here
        params = list(self._discriminator_network.parameters())
        self.optimizer = torch.optim.Adam(params, lr=settings.learning_rate)
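
When the reward provider later updates, the stored expert buffer is typically sampled at the size of the incoming policy batch and fed to the discriminator. A rough sketch of that pattern, continuing the class above; it assumes AgentBuffer.sample_mini_batch(batch_size, sequence_length) is available, and compute_loss is a placeholder name for the discriminator's loss method.

    def update(self, mini_batch: AgentBuffer) -> None:
        # Sample an expert batch of the same size as the incoming policy batch,
        # with sequence length 1 to match the demo_to_buffer call above.
        expert_batch = self._demo_buffer.sample_mini_batch(
            mini_batch.num_experiences, 1
        )
        # 'compute_loss' stands in for whatever loss the discriminator network
        # exposes over policy and expert batches.
        loss = self._discriminator_network.compute_loss(mini_batch, expert_batch)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
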
Example #4
def test_load_demo_dir():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    behavior_spec, pair_infos, total_expected = load_demonstration(
        path_prefix + "/test_demo_dir")
    assert np.sum(behavior_spec.observation_shapes[0]) == 8
    assert len(pair_infos) == total_expected

    _, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1)
    assert len(demo_buffer["actions"]) == total_expected - 1
Example #5
def test_load_demo():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    brain_parameters, pair_infos, total_expected = load_demonstration(
        path_prefix + "/test.demo")
    assert brain_parameters.brain_name == "Ball3DBrain"
    assert brain_parameters.vector_observation_space_size == 8
    assert len(pair_infos) == total_expected

    _, demo_buffer = demo_to_buffer(path_prefix + "/test.demo", 1)
    assert len(demo_buffer["actions"]) == total_expected - 1
Example #6
def test_load_demo_dir():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    behavior_spec, pair_infos, total_expected = load_demonstration(
        path_prefix + "/test_demo_dir")
    assert np.sum(behavior_spec.observation_shapes[0]) == 8
    assert len(pair_infos) == total_expected

    _, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1,
                                    BEHAVIOR_SPEC)
    assert (len(demo_buffer["continuous_action"]) == total_expected - 1
            or len(demo_buffer["discrete_action"]) == total_expected - 1)
Example #7
def test_load_demo_dir():
    path_prefix = os.path.dirname(os.path.abspath(__file__))
    behavior_spec, pair_infos, total_expected = load_demonstration(
        path_prefix + "/test_demo_dir")
    assert np.sum(behavior_spec.observation_specs[0].shape) == 8
    assert len(pair_infos) == total_expected

    _, demo_buffer = demo_to_buffer(path_prefix + "/test_demo_dir", 1,
                                    BEHAVIOR_SPEC)
    assert (len(demo_buffer[BufferKey.CONTINUOUS_ACTION]) == total_expected - 1
            or len(
                demo_buffer[BufferKey.DISCRETE_ACTION]) == total_expected - 1)
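
Outside the tests, the same enum-keyed buffer can be read directly after loading. A minimal sketch, assuming the newer ActionSpec API and the import paths mlagents.trainers.demo_loader and mlagents.trainers.buffer; the demo path is hypothetical.

from mlagents.trainers.buffer import BufferKey
from mlagents.trainers.demo_loader import demo_to_buffer

# Path is hypothetical; sequence length 1 as in the tests above.
behavior_spec, demo_buffer = demo_to_buffer("expert.demo", 1)
if behavior_spec.action_spec.continuous_size > 0:
    actions = demo_buffer[BufferKey.CONTINUOUS_ACTION]
else:
    actions = demo_buffer[BufferKey.DISCRETE_ACTION]
print(f"loaded {len(actions)} expert action entries")
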
Example #8
    def __init__(
        self,
        policy: TFPolicy,
        strength: float,
        gamma: float,
        demo_path: str,
        encoding_size: int = 64,
        learning_rate: float = 3e-4,
        use_actions: bool = False,
        use_vail: bool = False,
    ):
        """
        The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476
        :param policy: The policy of the learning model
        :param strength: The scaling parameter for the reward. The scaled reward will be the unscaled
        reward multiplied by the strength parameter
        :param gamma: The time discounting factor used for this reward.
        :param demo_path: The path to the demonstration file
        :param encoding_size: The size of the hidden layers of the discriminator
        :param learning_rate: The Learning Rate used during GAIL updates.
        :param use_actions: Whether or not to use the actions for the discriminator.
        :param use_vail: Whether or not to use a variational bottleneck for the discriminator.
        See https://arxiv.org/abs/1810.00821.
        """
        super().__init__(policy, strength, gamma)
        self.use_terminal_states = False

        self.model = GAILModel(
            policy, 128, learning_rate, encoding_size, use_actions, use_vail
        )
        _, self.demonstration_buffer = demo_to_buffer(
            demo_path, policy.sequence_length, policy.brain
        )
        self.has_updated = False
        self.update_dict: Dict[str, tf.Tensor] = {
            "gail_loss": self.model.loss,
            "gail_update_batch": self.model.update_batch,
            "gail_policy_estimate": self.model.mean_policy_estimate,
            "gail_expert_estimate": self.model.mean_expert_estimate,
        }
        if self.model.use_vail:
            self.update_dict["kl_loss"] = self.model.kl_loss
            self.update_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq
            self.update_dict["z_mean_expert"] = self.model.z_mean_expert
            self.update_dict["z_mean_policy"] = self.model.z_mean_policy
            self.update_dict["beta_update"] = self.model.update_beta

        self.stats_name_to_update_name = {
            "Losses/GAIL Loss": "gail_loss",
            "Policy/GAIL Policy Estimate": "gail_policy_estimate",
            "Policy/GAIL Expert Estimate": "gail_expert_estimate",
        }
Example #9
    def __init__(
        self,
        policy: TFPolicy,
        policy_learning_rate: float,
        default_batch_size: int,
        default_num_epoch: int,
        strength: float,
        demo_path: str,
        steps: int,
        batch_size: int = None,
        num_epoch: int = None,
        samples_per_update: int = 0,
    ):
        """
        A BC trainer that can be used inline with RL.
        :param policy: The policy of the learning model
        :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an appropriate learning rate
            for the pretrainer.
        :param default_batch_size: The default batch size to use if batch_size isn't provided.
        :param default_num_epoch: The default num_epoch to use if num_epoch isn't provided.
        :param strength: The proportion of learning rate used to update through BC.
        :param demo_path: The path to the demonstration file.
        :param steps: The number of steps to anneal BC training over. 0 for continuous training.
        :param batch_size: The batch size to use during BC training.
        :param num_epoch: Number of epochs to train for during each update.
        :param samples_per_update: Maximum number of samples to train on during each BC update.
        """
        self.policy = policy
        self.current_lr = policy_learning_rate * strength
        self.model = BCModel(policy, self.current_lr, steps)
        _, self.demonstration_buffer = demo_to_buffer(
            demo_path, policy.sequence_length, policy.brain
        )

        self.batch_size = batch_size if batch_size else default_batch_size
        self.num_epoch = num_epoch if num_epoch else default_num_epoch
        self.n_sequences = max(
            min(self.batch_size, self.demonstration_buffer.num_experiences)
            // policy.sequence_length,
            1,
        )

        self.has_updated = False
        self.use_recurrent = self.policy.use_recurrent
        self.samples_per_update = samples_per_update
        self.out_dict = {
            "loss": self.model.loss,
            "update": self.model.update_batch,
            "learning_rate": self.model.annealed_learning_rate,
        }
Example #10
    def __init__(self, brain, trainer_parameters, training, load, seed,
                 run_id):
        """
        Responsible for collecting experiences and training PPO model.
        :param trainer_parameters: The parameters for the trainer (dictionary).
        :param training: Whether the trainer is set for training.
        :param load: Whether the model should be loaded.
        :param seed: The seed the model will be initialized with
        :param run_id: The identifier of the current run
        """
        super(OfflineBCTrainer, self).__init__(brain, trainer_parameters,
                                               training, load, seed, run_id)

        self.param_keys = [
            "batch_size",
            "summary_freq",
            "max_steps",
            "batches_per_epoch",
            "use_recurrent",
            "hidden_units",
            "learning_rate",
            "num_layers",
            "sequence_length",
            "memory_size",
            "model_path",
            "demo_path",
        ]

        self.check_param_keys()
        self.batches_per_epoch = trainer_parameters["batches_per_epoch"]
        self.n_sequences = max(
            int(trainer_parameters["batch_size"] /
                self.policy.sequence_length), 1)

        brain_params, self.demonstration_buffer = demo_to_buffer(
            trainer_parameters["demo_path"], self.policy.sequence_length)

        policy_brain = copy.deepcopy(brain.__dict__)
        expert_brain = copy.deepcopy(brain_params.__dict__)
        policy_brain.pop("brain_name")
        expert_brain.pop("brain_name")
        policy_brain.pop("vector_action_descriptions")
        expert_brain.pop("vector_action_descriptions")
        if expert_brain != policy_brain:
            raise UnityTrainerException(
                "The provided demonstration is not compatible with the "
                "brain being used for performance evaluation.")
Example #11
    def __init__(
        self,
        policy: TorchPolicy,
        settings: BehavioralCloningSettings,
        policy_learning_rate: float,
        default_batch_size: int,
        default_num_epoch: int,
    ):
        """
        A BC trainer that can be used inline with RL.
        :param policy: The policy of the learning model
        :param settings: The settings for BehavioralCloning including LR strength, batch_size,
        num_epochs, samples_per_update and LR annealing steps.
        :param policy_learning_rate: The initial Learning Rate of the policy. Used to set an appropriate learning rate
            for the pretrainer.
        """
        self.policy = policy
        self._anneal_steps = settings.steps
        self.current_lr = policy_learning_rate * settings.strength

        learning_rate_schedule: ScheduleType = ScheduleType.LINEAR if self._anneal_steps > 0 else ScheduleType.CONSTANT
        self.decay_learning_rate = ModelUtils.DecayedValue(
            learning_rate_schedule, self.current_lr, 1e-10, self._anneal_steps
        )
        params = self.policy.actor_critic.parameters()
        self.optimizer = torch.optim.Adam(params, lr=self.current_lr)
        _, self.demonstration_buffer = demo_to_buffer(
            settings.demo_path, policy.sequence_length, policy.behavior_spec
        )
        self.batch_size = (
            settings.batch_size if settings.batch_size else default_batch_size
        )
        self.num_epoch = settings.num_epoch if settings.num_epoch else default_num_epoch
        self.n_sequences = max(
            min(self.batch_size, self.demonstration_buffer.num_experiences)
            // policy.sequence_length,
            1,
        )

        self.has_updated = False
        self.use_recurrent = self.policy.use_recurrent
        self.samples_per_update = settings.samples_per_update
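
A later BC update step typically draws n_sequences worth of demonstration data from this buffer. A rough fragment continuing the class above; the helper name is hypothetical, and it assumes AgentBuffer.sample_mini_batch takes a batch size in experiences plus a sequence length.

    def _sample_demo_minibatch(self):
        # Draw n_sequences sequences of policy.sequence_length steps each
        # from the expert demonstration buffer loaded in __init__.
        return self.demonstration_buffer.sample_mini_batch(
            self.n_sequences * self.policy.sequence_length,
            self.policy.sequence_length,
        )
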
Example #12
    def __init__(self, policy: TFPolicy, settings: GAILSettings):
        """
        The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476
        :param policy: The policy of the learning model
        :param settings: The settings for this GAILRewardSignal.
        See https://arxiv.org/abs/1810.00821.
        """
        super().__init__(policy, settings)
        self.use_terminal_states = False

        self.model = GAILModel(
            policy,
            128,
            settings.learning_rate,
            settings.encoding_size,
            settings.use_actions,
            settings.use_vail,
        )
        _, self.demonstration_buffer = demo_to_buffer(settings.demo_path,
                                                      policy.sequence_length,
                                                      policy.brain)
        self.has_updated = False
        self.update_dict: Dict[str, tf.Tensor] = {
            "gail_loss": self.model.loss,
            "gail_update_batch": self.model.update_batch,
            "gail_policy_estimate": self.model.mean_policy_estimate,
            "gail_expert_estimate": self.model.mean_expert_estimate,
        }
        if self.model.use_vail:
            self.update_dict["kl_loss"] = self.model.kl_loss
            self.update_dict["z_log_sigma_sq"] = self.model.z_log_sigma_sq
            self.update_dict["z_mean_expert"] = self.model.z_mean_expert
            self.update_dict["z_mean_policy"] = self.model.z_mean_policy
            self.update_dict["beta_update"] = self.model.update_beta

        self.stats_name_to_update_name = {
            "Losses/GAIL Loss": "gail_loss",
            "Policy/GAIL Policy Estimate": "gail_policy_estimate",
            "Policy/GAIL Expert Estimate": "gail_expert_estimate",
        }
Example #13
    def __init__(
        self,
        policy: TFPolicy,
        strength: float,
        gamma: float,
        demo_path: str,
        num_epoch: int = 3,
        encoding_size: int = 64,
        learning_rate: float = 3e-4,
        samples_per_update: int = 0,
        use_actions: bool = False,
        use_vail: bool = False,
    ):
        """
        The GAIL Reward signal generator. https://arxiv.org/abs/1606.03476
        :param policy: The policy of the learning model
        :param strength: The scaling parameter for the reward. The scaled reward will be the unscaled
        reward multiplied by the strength parameter
        :param gamma: The time discounting factor used for this reward.
        :param demo_path: The path to the demonstration file
        :param num_epoch: The number of epochs to train over the training buffer for the discriminator.
        :param encoding_size: The size of the hidden layers of the discriminator
        :param learning_rate: The Learning Rate used during GAIL updates.
        :param samples_per_update: The maximum number of samples to update during GAIL updates.
        :param use_actions: Whether or not to use the actions for the discriminator.
        :param use_vail: Whether or not to use a variational bottleneck for the discriminator.
        See https://arxiv.org/abs/1810.00821.
        """
        super().__init__(policy, strength, gamma)
        self.num_epoch = num_epoch
        self.samples_per_update = samples_per_update
        self.use_terminal_states = False

        self.model = GAILModel(
            policy.model, 128, learning_rate, encoding_size, use_actions, use_vail
        )
        _, self.demonstration_buffer = demo_to_buffer(demo_path, policy.sequence_length)
        self.has_updated = False
Example #14
    def load_demo(self):
        # Load the demonstration file (demo_path is defined elsewhere) with
        # sequence length 1 and return the buffer's update_buffer.
        brain_params, demo_buffer = demo_to_buffer(demo_path, 1)
        update_buffer = demo_buffer.update_buffer
        return update_buffer