def test_merlin_algorithm(self):
    batch_size = 100
    steps_per_episode = 15
    gap = 10
    env = RNNPolicyUnittestEnv(
        batch_size, steps_per_episode, gap, obs_dim=3)
    eval_env = RNNPolicyUnittestEnv(100, steps_per_episode, gap, obs_dim=3)

    algorithm = _create_merlin_algorithm(
        env, learning_rate=3e-3, debug_summaries=False)

    for i in range(500):
        algorithm.train_iter()
        if (i + 1) % 100 == 0:
            eval_env.reset()
            eval_time_step = unroll(eval_env, algorithm,
                                    steps_per_episode - 1)
            logging.info("%d reward=%f" %
                         (i, float(eval_time_step.reward.mean())))

    self.assertAlmostEqual(
        1.0, float(eval_time_step.reward.mean()), delta=1e-2)
def test_sac_algorithm_discrete(self, use_parallel_network):
    num_env = 1
    config = TrainerConfig(
        root_dir="dummy",
        unroll_length=1,
        mini_batch_length=2,
        mini_batch_size=64,
        initial_collect_steps=500,
        whole_replay_buffer_training=False,
        clear_replay_buffer=False,
        num_envs=num_env,
    )
    env_class = PolicyUnittestEnv
    steps_per_episode = 13
    env = env_class(
        num_env, steps_per_episode, action_type=ActionType.Discrete)
    eval_env = env_class(
        100, steps_per_episode, action_type=ActionType.Discrete)

    obs_spec = env._observation_spec
    action_spec = env._action_spec

    fc_layer_params = (10, 10)

    q_network = partial(QNetwork, fc_layer_params=fc_layer_params)

    alg2 = SacAlgorithm(
        observation_spec=obs_spec,
        action_spec=action_spec,
        q_network_cls=q_network,
        use_parallel_network=use_parallel_network,
        env=env,
        config=config,
        critic_optimizer=alf.optimizers.Adam(lr=1e-3),
        alpha_optimizer=alf.optimizers.Adam(lr=1e-2),
        debug_summaries=False,
        name="MySAC")

    eval_env.reset()
    for i in range(700):
        alg2.train_iter()
        if i < config.initial_collect_steps:
            continue
        eval_env.reset()
        eval_time_step = unroll(eval_env, alg2, steps_per_episode - 1)
        logging.log_every_n_seconds(
            logging.INFO,
            "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
            n_seconds=1)

    self.assertAlmostEqual(
        1.0, float(eval_time_step.reward.mean()), delta=0.2)
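# test_sac_algorithm_discrete() above takes use_parallel_network as an
# argument, which implies the test method is parameterized. The class below is
# a minimal sketch of how such a parameterization is commonly wired with
# absl's parameterized decorator; _SacDiscreteTestSketch and its trivial body
# are hypothetical and only illustrate the mechanism, not the actual test file.
from absl.testing import parameterized


class _SacDiscreteTestSketch(parameterized.TestCase):
    @parameterized.parameters((True, ), (False, ))
    def test_sac_algorithm_discrete(self, use_parallel_network):
        # Each tuple passed to @parameterized.parameters becomes one test
        # case; here the same training/eval logic would run once with a
        # parallel critic network and once without.
        self.assertIn(use_parallel_network, (True, False))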
def test_sarsa(self,
               on_policy=False,
               sac=True,
               use_rnn=False,
               priority_replay=False):
    logging.info("sac=%d on_policy=%s use_rnn=%s" %
                 (sac, on_policy, use_rnn))
    env_class = PolicyUnittestEnv
    iterations = 500
    num_env = 128
    if on_policy:
        num_env = 128
    steps_per_episode = 12
    env = env_class(
        num_env, steps_per_episode, action_type=ActionType.Continuous)
    eval_env = env_class(
        100, steps_per_episode, action_type=ActionType.Continuous)

    algorithm = _create_algorithm(
        env,
        on_policy=on_policy,
        sac=sac,
        use_rnn=use_rnn,
        priority_replay=priority_replay)

    env.reset()
    eval_env.reset()
    for i in range(iterations):
        algorithm.train_iter()

        eval_env.reset()
        eval_time_step = unroll(eval_env, algorithm, steps_per_episode - 1)
        logging.log_every_n_seconds(
            logging.INFO,
            "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
            n_seconds=1)

    self.assertAlmostEqual(
        1.0, float(eval_time_step.reward.mean()), delta=0.3)
def test_ddpg_algorithm(self, num_critic_replicas, reward_dim):
    num_env = 128
    num_eval_env = 100
    steps_per_episode = 13
    config = TrainerConfig(
        root_dir="dummy",
        unroll_length=steps_per_episode,
        mini_batch_length=2,
        mini_batch_size=128,
        initial_collect_steps=steps_per_episode,
        whole_replay_buffer_training=False,
        clear_replay_buffer=False,
        num_envs=num_env,
    )
    env_class = PolicyUnittestEnv
    env = env_class(
        num_env,
        steps_per_episode,
        action_type=ActionType.Continuous,
        reward_dim=reward_dim)
    eval_env = env_class(
        num_eval_env,
        steps_per_episode,
        action_type=ActionType.Continuous,
        reward_dim=reward_dim)

    obs_spec = env._observation_spec
    action_spec = env._action_spec

    fc_layer_params = (16, 16)

    actor_network = functools.partial(
        ActorNetwork, fc_layer_params=fc_layer_params)
    critic_network = functools.partial(
        CriticNetwork,
        output_tensor_spec=env.reward_spec(),
        joint_fc_layer_params=fc_layer_params)

    alg = DdpgAlgorithm(
        observation_spec=obs_spec,
        action_spec=action_spec,
        actor_network_ctor=actor_network,
        critic_network_ctor=critic_network,
        reward_weights=[1, 2, 3],
        env=env,
        config=config,
        num_critic_replicas=num_critic_replicas,
        use_parallel_network=num_critic_replicas > 1,
        actor_optimizer=alf.optimizers.Adam(lr=1e-2),
        critic_optimizer=alf.optimizers.Adam(lr=1e-2),
        debug_summaries=False,
        name="MyDDPG")

    for _ in range(500):
        alg.train_iter()

    eval_env.reset()
    epsilon_greedy = 0.0
    eval_time_step = unroll(eval_env, alg, steps_per_episode - 1,
                            epsilon_greedy)
    print(eval_time_step.reward.mean())
    self.assertAlmostEqual(
        1.0, float(eval_time_step.reward.mean()), delta=2e-1)
def test_sac_algorithm(self, use_parallel_network, reward_dim):
    num_env = 1
    config = TrainerConfig(
        root_dir="dummy",
        unroll_length=1,
        mini_batch_length=2,
        mini_batch_size=64,
        initial_collect_steps=500,
        whole_replay_buffer_training=False,
        clear_replay_buffer=False,
        num_envs=1,
    )
    env_class = PolicyUnittestEnv
    steps_per_episode = 13
    env = env_class(
        num_env,
        steps_per_episode,
        action_type=ActionType.Continuous,
        reward_dim=reward_dim)
    eval_env = env_class(
        100,
        steps_per_episode,
        action_type=ActionType.Continuous,
        reward_dim=reward_dim)

    obs_spec = env._observation_spec
    action_spec = env._action_spec

    fc_layer_params = (10, 10)

    continuous_projection_net_ctor = partial(
        alf.networks.NormalProjectionNetwork,
        state_dependent_std=True,
        scale_distribution=True,
        std_transform=clipped_exp)

    actor_network = partial(
        ActorDistributionNetwork,
        fc_layer_params=fc_layer_params,
        continuous_projection_net_ctor=continuous_projection_net_ctor)

    critic_network = partial(
        CriticNetwork,
        output_tensor_spec=env.reward_spec(),
        joint_fc_layer_params=fc_layer_params)

    alg = SacAlgorithm(
        observation_spec=obs_spec,
        action_spec=action_spec,
        actor_network_cls=actor_network,
        critic_network_cls=critic_network,
        use_parallel_network=use_parallel_network,
        use_entropy_reward=reward_dim == 1,
        env=env,
        config=config,
        actor_optimizer=alf.optimizers.Adam(lr=1e-2),
        critic_optimizer=alf.optimizers.Adam(lr=1e-2),
        alpha_optimizer=alf.optimizers.Adam(lr=1e-2),
        debug_summaries=False,
        name="MySAC")

    eval_env.reset()
    for i in range(700):
        alg.train_iter()
        if i < config.initial_collect_steps:
            continue
        eval_env.reset()
        eval_time_step = unroll(eval_env, alg, steps_per_episode - 1)
        logging.log_every_n_seconds(
            logging.INFO,
            "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
            n_seconds=1)

    self.assertAlmostEqual(
        1.0, float(eval_time_step.reward.mean()), delta=0.3)
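# All five tests above share one skeleton: build a training env and an eval
# env, construct an algorithm, call train_iter() in a loop, periodically
# unroll() the policy on the eval env, and assert that the mean reward of the
# final eval time step approaches 1.0. The helper below is a minimal sketch of
# that shared pattern, not part of the original test file; _train_and_check
# and its parameter names are hypothetical, and it reuses the unroll() helper
# and absl logging already used by the tests.
from absl import logging


def _train_and_check(test_case, algorithm, eval_env, iterations,
                     steps_per_episode, eval_interval=1, skip_before=0,
                     expected_reward=1.0, delta=0.3):
    """Train for `iterations` and assert the final eval reward is ~expected_reward."""
    eval_time_step = None
    for i in range(iterations):
        algorithm.train_iter()
        # Some tests (e.g. the SAC ones) skip evaluation until the initial
        # collect steps have filled the replay buffer.
        if i < skip_before or (i + 1) % eval_interval != 0:
            continue
        eval_env.reset()
        # unroll() runs the algorithm on eval_env for one episode and returns
        # the last time step, whose mean reward serves as the score.
        eval_time_step = unroll(eval_env, algorithm, steps_per_episode - 1)
        logging.info("%d reward=%f", i, float(eval_time_step.reward.mean()))
    test_case.assertAlmostEqual(
        expected_reward, float(eval_time_step.reward.mean()), delta=delta)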