Example 1
    def test_sample_collector_by_number_success(self, mock_env_reset, mock_env_step, mock_envs_reset,
                                                mock_envs_step) -> None:
        dummy_env = Env()
        dummy_env.observation_space = Box(-1, 1, [STATE_DIM])
        dummy_env.action_space = Box(-1, 1, [ACTION_DIM])
        mock_env_reset.return_value = self.dummy_state
        mock_env_step.return_value = (self.dummy_state, self.dummy_reward, self.dummy_done, self.dummy_info)
        dummy_env.reset = mock_env_reset
        dummy_env.step = mock_env_step

        dummy_envs = DummyVectorEnv(N_ENVS, STATE_DIM, ACTION_DIM)
        mock_envs_reset.return_value = self.dummy_states
        mock_envs_step.return_value = (self.dummy_next_states, self.dummy_rewards, self.dummy_dones, {})
        dummy_envs.reset = mock_envs_reset
        dummy_envs.step = mock_envs_step

        dummy_env_container = EnvContainer(dummy_env, dummy_envs)
        mock_envs_reset.assert_called_once_with()  # __init__ of EnvContainer calls reset

        actor: nn.Module = ProbMLPConstantLogStd(STATE_DIM, ACTION_DIM, HIDDEN_DIMS, ACTIVATION, FINAL_LAYER_ACTIVATION, LOG_STD)
        scaler: nn.Module = DummyNet()
        action_getter: ActionGetter = ActionGetterModule(actor, scaler)
        sample_collector: SampleCollector = SampleCollectorV0(dummy_env_container, action_getter, N_ENVS * 10, 1)

        array_dict: ArrayDict = sample_collector.collect_samples_by_number()
        self.assertEqual(mock_envs_reset.call_count, 2)
        self.assertEqual(mock_envs_step.call_count, 10)

        collected_states = array_dict.get(ArrayKey.states)
        self.assertTupleEqual(collected_states.shape, (N_ENVS * 10, STATE_DIM))
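The four mock arguments to this test are presumably injected by a patch decorator stack that is not shown in this excerpt. One plausible arrangement is sketched below; the patch targets are an assumption, and unittest.mock applies decorators bottom-up, so the bottom-most decorator feeds the first mock parameter.

    # Hypothetical decorator stack (assumes: from unittest import mock; targets not confirmed by the source)
    @mock.patch.object(DummyVectorEnv, "step")
    @mock.patch.object(DummyVectorEnv, "reset")
    @mock.patch.object(Env, "step")
    @mock.patch.object(Env, "reset")
    def test_sample_collector_by_number_success(self, mock_env_reset, mock_env_step,
                                                mock_envs_reset, mock_envs_step) -> None:
        ...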
Example 2
    def test_module_action_getter_1d_success(self):
        actor: nn.Module = ProbMLPConstantLogStd(STATE_DIM, ACTION_DIM, HIDDEN_DIMS, ACTIVATION, FINAL_LAYER_ACTIVATION, LOG_STD)
        scaler: nn.Module = DummyNet()

        action_getter: ActionGetter = ActionGetterModule(actor, scaler)
        dummy_state = self.dummy_states[0, :]
        output_action = action_getter.get_action(dummy_state)

        self.assertEqual(len(output_action.shape), 1, "1D case output shape is not 1D")
Example 3
    def test_module_updater_optimizer_change_action_success(self) -> None:
        actor = ProbMLPConstantLogStd(STATE_DIM, ACTION_DIM, HIDDEN_DIMS, ACTIVATION, FINAL_LAYER_ACTIVATION, LOG_STD)
        scaler = DummyNet()
        action_getter = ActionGetterModule(actor, scaler)
        optimizer = RAdam(actor.parameters(), lr=3e-4)
        module_updater: ModuleUpdater = ModuleUpdaterOptimizer(optimizer)

        action1 = action_getter.get_action(self.dummy_states)
        action2 = action_getter.get_action(self.dummy_states)

        output, log_std = actor.forward(self.dummy_states_tensor)
        mse_loss = nn.MSELoss()
        loss = mse_loss.forward(output, self.dummy_target_tensor)

        module_updater.update_module(loss)

        action3 = action_getter.get_action(self.dummy_states)

        np.testing.assert_array_equal(action1, action2)
        self.assertFalse(np.array_equal(action1, action3))
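The assertions above only require that update_module(loss) changes the actor's parameters while get_action stays deterministic between updates. A minimal optimizer-backed updater consistent with that behaviour is sketched below; it is an assumption, not ModuleUpdaterOptimizer's actual code.

import torch

class MinimalOptimizerUpdater:
    # Hypothetical stand-in for ModuleUpdaterOptimizer.
    def __init__(self, optimizer: torch.optim.Optimizer) -> None:
        self.optimizer = optimizer

    def update_module(self, loss: torch.Tensor) -> None:
        self.optimizer.zero_grad()  # clear gradients left over from previous updates
        loss.backward()             # backpropagate the scalar loss
        self.optimizer.step()       # apply the gradient step to the wrapped parameters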
Example 4
    def test_module_action_getter_sample_success(self, actor_forward, scaler_forward):
        scaler_forward.return_value = self.dummy_states_scaled
        actor_forward.return_value = (self.dummy_actions, self.dummy_log_std)
        actor: nn.Module = ProbMLPConstantLogStd(STATE_DIM, ACTION_DIM, HIDDEN_DIMS, ACTIVATION, FINAL_LAYER_ACTIVATION, LOG_STD)
        actor.forward = actor_forward
        scaler: nn.Module = DummyNet()
        scaler.forward = scaler_forward

        action_getter: ActionGetter = ActionGetterModule(actor, scaler)

        actions, log_prob = action_getter.sample_action(self.dummy_states)
        self.assertEqual(len(actions.shape), 2, "2D case output shape is not 2D")
        self.assertTupleEqual(actions.shape, (N_EXAMPLES, ACTION_DIM), "2D case output shape is inconsistent")
        np.testing.assert_array_equal(actions, self.dummy_actions)

        np.testing.assert_array_equal(scaler_forward.call_args[0][0], self.dummy_states)
        np.testing.assert_array_equal(actor_forward.call_args[0][0], self.dummy_states_scaled)
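Taken together, these tests pin down a few properties of ActionGetterModule: states pass through the scaler before the actor (this example), get_action is deterministic for fixed weights (Example 3), and a 1D state yields a 1D action (Example 2). A get_action sketch consistent with that behaviour follows; it is an assumption, not the library's implementation.

import numpy as np
import torch
import torch.nn as nn

class MinimalActionGetter:
    # Hypothetical sketch of ActionGetterModule.get_action, inferred from the tests above.
    def __init__(self, actor: nn.Module, scaler: nn.Module) -> None:
        self.actor = actor
        self.scaler = scaler

    def get_action(self, states: np.ndarray) -> np.ndarray:
        single = states.ndim == 1
        states_tensor = torch.as_tensor(np.atleast_2d(states), dtype=torch.float32)
        with torch.no_grad():
            scaled = self.scaler.forward(states_tensor)  # scale states before the actor (Example 4)
            mu, _ = self.actor.forward(scaled)           # deterministic mean, no sampling (Example 3)
        actions = mu.numpy()
        return actions[0] if single else actions         # 1D in, 1D out (Example 2)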
Example 5
    def test_prob_mlp_forward(self) -> None:
        net: nn.Module = ProbMLPConstantLogStd(INPUT_DIM, OUTPUT_DIM, HIDDEN_DIMS, ACTIVATION, FINAL_LAYER_ACTIVATION, LOG_STD)
        dummy_output = net.forward(self.dummy_features)
        self.assertEqual(len(dummy_output), 2, "ProbMLP output should be a (mu, log_std) pair.")
        self.assertEqual(dummy_output[0].shape, (N_EXAMPLES, OUTPUT_DIM), "ProbMLP mu output shape is inconsistent")
        self.assertEqual(dummy_output[1].shape, (N_EXAMPLES, OUTPUT_DIM), "ProbMLP log_std output shape is inconsistent")
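The (mu, log_std) contract checked here could be met by a head like the sketch below; ProbMLPConstantLogStd itself is not shown in this excerpt, so the layer layout is an assumption.

import torch
import torch.nn as nn

class ConstantLogStdHead(nn.Module):
    # Hypothetical sketch: an MLP mean head plus a constant log-std broadcast to the mean's shape.
    def __init__(self, input_dim: int, output_dim: int, hidden_dim: int, log_std: float) -> None:
        super().__init__()
        self.mu_net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, output_dim), nn.Tanh(),
        )
        self.log_std_value = log_std

    def forward(self, features: torch.Tensor):
        mu = self.mu_net(features)                         # (N, output_dim) mean
        log_std = torch.full_like(mu, self.log_std_value)  # constant log-std with the same shape
        return mu, log_std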
Example 6
    def test_prob_mlp_initialization(self) -> None:
        net: nn.Module = ProbMLPConstantLogStd(INPUT_DIM, OUTPUT_DIM, HIDDEN_DIMS, ACTIVATION, FINAL_LAYER_ACTIVATION, LOG_STD)
        self.assertIsInstance(net, nn.Module, "ProbMLP failed to initialize.")
Example 7
def main():
    n_envs = len(os.sched_getaffinity(0))
    factory = FallingEnvFactory()
    # factory = HalfCheetahEnvFactory()
    # factory = HumanoidFallingEnvFactory()
    env: Env = factory.make_env()
    envs: VectorEnv = AsyncVectorEnv([factory.make_env for _ in range(n_envs)])
    env_container = EnvContainer(env, envs)

    state_dim, = env.observation_space.shape
    action_dim, = env.action_space.shape
    relu = nn.ReLU()
    tanh = nn.Tanh()
    identity = nn.Identity()

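    # Gaussian policy: tanh-squashed mean with constant log-std (-1.0); the critic maps states to a scalar value estimate.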
    actor = ProbMLPConstantLogStd(state_dim, action_dim, [256, 256], relu, tanh, -1.0)
    critic = MultiLayerPerceptron(state_dim, 1, [256, 256], relu, identity)
    scaler_ = StandardScaler()
    print("Fit scaler")
    env.reset()
    state_seq = []
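    # Roll out random actions to collect states for fitting the state scaler.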
    for _ in tqdm(range(512)):
        action = env.action_space.sample()
        state, _, done, _ = env.step(action)
        state_seq.append(state)
        if done:
            env.reset()
    state_seq = np.stack(state_seq)
    scaler_.fit(state_seq)
    scaler = ScalerNet(scaler_)

    module_dict = ModuleDict()
    module_dict.set(ModuleKey.actor, actor)
    module_dict.set(ModuleKey.scaler, scaler)
    module_dict.set(ModuleKey.critic, critic)

    action_getter: ActionGetter = ActionGetterModule(actor, scaler)
    sample_collector: SampleCollector = SampleCollectorV0(env_container, action_getter, 2048, 1)

    mse_loss = nn.MSELoss()
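    # Critic pipeline: scale states, predict cumulative rewards, and regress the predictions against observed returns (MSE).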
    critic_tensor_inserter: TensorInserter = \
        TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \
        TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \
        TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic, TensorKey.cumulative_reward_predictions_tensor)
    critic_loss_calculator: LossCalculator = \
        LossCalculatorInputTarget(TensorKey.cumulative_reward_predictions_tensor, TensorKey.cumulative_rewards_tensor,
                                  mse_loss)

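    # Actor pipeline: advantages = observed returns minus critic predictions, fresh log-probs from the actor, then the PPO surrogate (its negated mean is minimized).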
    actor_tensor_inserter: TensorInserter = \
        TensorInserterTensorize(ArrayKey.states, TensorKey.states_tensor) + \
        TensorInserterTensorize(ArrayKey.actions, TensorKey.actions_tensor) + \
        TensorInserterTensorize(ArrayKey.log_probs, TensorKey.log_probs_tensor) + \
        TensorInserterTensorize(ArrayKey.cumulative_rewards, TensorKey.cumulative_rewards_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.scaler, TensorKey.states_tensor) + \
        TensorInserterForward(TensorKey.states_tensor, ModuleKey.critic,
                              TensorKey.cumulative_reward_predictions_tensor) + \
        TensorInserterLambda([TensorKey.cumulative_rewards_tensor, TensorKey.cumulative_reward_predictions_tensor],
                             lambda x, y: x - y, TensorKey.advantages_tensor) + \
        TensorInserterModuleLambda(ModuleKey.actor, [TensorKey.states_tensor, TensorKey.actions_tensor],
                                   lambda actor, state, action: actor.get_log_prob(state, action),
                                   TensorKey.new_log_probs_tensor) + \
        TensorInserterLambda([TensorKey.new_log_probs_tensor, TensorKey.log_probs_tensor, TensorKey.advantages_tensor],
                             get_ppo_surrogate_tensor, TensorKey.ppo_surrogates_tensor)

    actor_loss_calculator: LossCalculator = \
        LossCalculatorLambda([TensorKey.ppo_surrogates_tensor], lambda x: -torch.mean(x))

    actor_optimizer = RAdam(params=actor.parameters(), lr=3e-4)
    actor_updater: ModuleUpdater = ModuleUpdaterOptimizer(actor_optimizer)
    critic_optimizer = RAdam(params=critic.parameters(), lr=3e-4)
    critic_updater: ModuleUpdater = ModuleUpdaterOptimizer(critic_optimizer)

    actor_trainee = Trainee([actor], actor_updater, actor_tensor_inserter, actor_loss_calculator, 10)
    critic_trainee = Trainee([critic], critic_updater, critic_tensor_inserter, critic_loss_calculator, 10)

    trainer = RLTrainer(sample_collector, [critic_trainee, actor_trainee], 100000, 128)
    trainer.train(module_dict)
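get_ppo_surrogate_tensor is referenced in the actor pipeline above but not defined in this excerpt. A standard clipped-surrogate implementation matching the three tensors wired into it is sketched below; the clipping epsilon of 0.2 is an assumption.

import torch

def clipped_ppo_surrogate(new_log_probs: torch.Tensor,
                          old_log_probs: torch.Tensor,
                          advantages: torch.Tensor,
                          clip_eps: float = 0.2) -> torch.Tensor:
    # Per-sample clipped PPO surrogate; the trainer minimizes its negated mean.
    advantages = advantages.detach()                             # treat advantages as constants
    ratio = torch.exp(new_log_probs - old_log_probs.detach())    # importance-sampling ratio
    clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps)
    return torch.min(ratio * advantages, clipped * advantages)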