Example #1: MERLIN algorithm
    def test_merlin_algorithm(self):
        batch_size = 100
        steps_per_episode = 15
        gap = 10
        env = RNNPolicyUnittestEnv(batch_size,
                                   steps_per_episode,
                                   gap,
                                   obs_dim=3)
        eval_env = RNNPolicyUnittestEnv(100, steps_per_episode, gap, obs_dim=3)

        algorithm = _create_merlin_algorithm(env,
                                             learning_rate=3e-3,
                                             debug_summaries=False)

        for i in range(500):
            algorithm.train_iter()
            # Evaluate on the separate eval env every 100 training iterations.
            if (i + 1) % 100 == 0:
                eval_env.reset()
                eval_time_step = unroll(eval_env, algorithm,
                                        steps_per_episode - 1)
                logging.info("%d reward=%f" %
                             (i, float(eval_time_step.reward.mean())))

        # After training, the mean evaluation reward should be close to 1.0.
        self.assertAlmostEqual(1.0,
                               float(eval_time_step.reward.mean()),
                               delta=1e-2)
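
These snippets are test methods lifted from larger ALF test classes, so their imports and module-level helpers (unroll, _create_merlin_algorithm, _create_algorithm) are not shown. The sketch below lists the imports the examples in this section appear to rely on; the module paths are assumptions based on the ALF code base and may differ across versions.

# Assumed imports for the examples in this section (paths are best guesses).
import functools
from functools import partial

from absl import logging

import alf
from alf.algorithms.config import TrainerConfig
from alf.algorithms.ddpg_algorithm import DdpgAlgorithm
from alf.algorithms.sac_algorithm import SacAlgorithm
from alf.environments.suite_unittest import (ActionType, PolicyUnittestEnv,
                                             RNNPolicyUnittestEnv)
from alf.networks import (ActorDistributionNetwork, ActorNetwork,
                          CriticNetwork, QNetwork)
from alf.utils.math_ops import clipped_exp
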
Example #2: SAC with a discrete action space
    def test_sac_algorithm_discrete(self, use_parallel_network):
        num_env = 1
        config = TrainerConfig(
            root_dir="dummy",
            unroll_length=1,
            mini_batch_length=2,
            mini_batch_size=64,
            initial_collect_steps=500,
            whole_replay_buffer_training=False,
            clear_replay_buffer=False,
            num_envs=num_env,
        )
        env_class = PolicyUnittestEnv

        steps_per_episode = 13
        env = env_class(num_env,
                        steps_per_episode,
                        action_type=ActionType.Discrete)

        eval_env = env_class(100,
                             steps_per_episode,
                             action_type=ActionType.Discrete)

        obs_spec = env._observation_spec
        action_spec = env._action_spec

        fc_layer_params = (10, 10)

        # With a discrete action space, SAC is configured with a Q-network
        # rather than separate actor and critic networks.
        q_network = partial(QNetwork, fc_layer_params=fc_layer_params)

        alg2 = SacAlgorithm(observation_spec=obs_spec,
                            action_spec=action_spec,
                            q_network_cls=q_network,
                            use_parallel_network=use_parallel_network,
                            env=env,
                            config=config,
                            critic_optimizer=alf.optimizers.Adam(lr=1e-3),
                            alpha_optimizer=alf.optimizers.Adam(lr=1e-2),
                            debug_summaries=False,
                            name="MySAC")

        eval_env.reset()
        for i in range(700):
            alg2.train_iter()
            # Skip evaluation until the initial collection phase has filled
            # the replay buffer.
            if i < config.initial_collect_steps:
                continue
            eval_env.reset()
            eval_time_step = unroll(eval_env, alg2, steps_per_episode - 1)
            logging.log_every_n_seconds(
                logging.INFO,
                "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
                n_seconds=1)

        self.assertAlmostEqual(1.0,
                               float(eval_time_step.reward.mean()),
                               delta=0.2)
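
Several of these methods take extra arguments (use_parallel_network, num_critic_replicas, reward_dim), which suggests they are parameterized tests. Below is a minimal sketch of how a method like the one above could be hosted, assuming absl's parameterized helpers and alf.test; the class name, parameter values, and harness are illustrative, not taken from ALF's actual test suite.

# Illustrative harness only; names and parameter values are assumptions.
from absl.testing import parameterized

import alf


class SacAlgorithmTest(parameterized.TestCase, alf.test.TestCase):
    @parameterized.parameters(True, False)
    def test_sac_algorithm_discrete(self, use_parallel_network):
        ...  # body as in Example #2


if __name__ == '__main__':
    alf.test.main()
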
Example #3: SARSA
    def test_sarsa(self,
                   on_policy=False,
                   sac=True,
                   use_rnn=False,
                   priority_replay=False):
        logging.info("sac=%d on_policy=%s use_rnn=%s" %
                     (sac, on_policy, use_rnn))
        env_class = PolicyUnittestEnv
        iterations = 500
        num_env = 128
        steps_per_episode = 12
        env = env_class(num_env,
                        steps_per_episode,
                        action_type=ActionType.Continuous)
        eval_env = env_class(100,
                             steps_per_episode,
                             action_type=ActionType.Continuous)

        algorithm = _create_algorithm(env,
                                      on_policy=on_policy,
                                      sac=sac,
                                      use_rnn=use_rnn,
                                      priority_replay=priority_replay)

        env.reset()
        eval_env.reset()
        for i in range(iterations):
            algorithm.train_iter()

            eval_env.reset()
            eval_time_step = unroll(eval_env, algorithm, steps_per_episode - 1)
            logging.log_every_n_seconds(
                logging.INFO,
                "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
                n_seconds=1)

        self.assertAlmostEqual(1.0,
                               float(eval_time_step.reward.mean()),
                               delta=0.3)
Example #4: DDPG with a multi-dimensional reward
    def test_ddpg_algorithm(self, num_critic_replicas, reward_dim):
        num_env = 128
        num_eval_env = 100
        steps_per_episode = 13
        config = TrainerConfig(
            root_dir="dummy",
            unroll_length=steps_per_episode,
            mini_batch_length=2,
            mini_batch_size=128,
            initial_collect_steps=steps_per_episode,
            whole_replay_buffer_training=False,
            clear_replay_buffer=False,
            num_envs=num_env,
        )
        env_class = PolicyUnittestEnv

        env = env_class(num_env,
                        steps_per_episode,
                        action_type=ActionType.Continuous,
                        reward_dim=reward_dim)

        eval_env = env_class(num_eval_env,
                             steps_per_episode,
                             action_type=ActionType.Continuous,
                             reward_dim=reward_dim)

        obs_spec = env._observation_spec
        action_spec = env._action_spec

        fc_layer_params = (16, 16)

        actor_network = functools.partial(ActorNetwork,
                                          fc_layer_params=fc_layer_params)

        critic_network = functools.partial(
            CriticNetwork,
            output_tensor_spec=env.reward_spec(),
            joint_fc_layer_params=fc_layer_params)

        # With a multi-dimensional reward (reward_dim > 1), reward_weights
        # gives the relative weight of each reward dimension.
        alg = DdpgAlgorithm(observation_spec=obs_spec,
                            action_spec=action_spec,
                            actor_network_ctor=actor_network,
                            critic_network_ctor=critic_network,
                            reward_weights=[1, 2, 3],
                            env=env,
                            config=config,
                            num_critic_replicas=num_critic_replicas,
                            use_parallel_network=num_critic_replicas > 1,
                            actor_optimizer=alf.optimizers.Adam(lr=1e-2),
                            critic_optimizer=alf.optimizers.Adam(lr=1e-2),
                            debug_summaries=False,
                            name="MyDDPG")

        for _ in range(500):
            alg.train_iter()

        eval_env.reset()
        # Evaluate greedily (no exploration noise).
        epsilon_greedy = 0.0
        eval_time_step = unroll(eval_env, alg, steps_per_episode - 1,
                                epsilon_greedy)
        print(eval_time_step.reward.mean())

        self.assertAlmostEqual(1.0,
                               float(eval_time_step.reward.mean()),
                               delta=2e-1)
Example #5: SAC with a continuous action space
    def test_sac_algorithm(self, use_parallel_network, reward_dim):
        num_env = 1
        config = TrainerConfig(
            root_dir="dummy",
            unroll_length=1,
            mini_batch_length=2,
            mini_batch_size=64,
            initial_collect_steps=500,
            whole_replay_buffer_training=False,
            clear_replay_buffer=False,
            num_envs=1,
        )
        env_class = PolicyUnittestEnv
        steps_per_episode = 13
        env = env_class(num_env,
                        steps_per_episode,
                        action_type=ActionType.Continuous,
                        reward_dim=reward_dim)

        eval_env = env_class(100,
                             steps_per_episode,
                             action_type=ActionType.Continuous,
                             reward_dim=reward_dim)

        obs_spec = env._observation_spec
        action_spec = env._action_spec

        fc_layer_params = (10, 10)

        # Policy head: a Normal projection with state-dependent std
        # (transformed by clipped_exp) whose samples are squashed to the
        # action bounds (scale_distribution=True).
        continuous_projection_net_ctor = partial(
            alf.networks.NormalProjectionNetwork,
            state_dependent_std=True,
            scale_distribution=True,
            std_transform=clipped_exp)

        actor_network = partial(
            ActorDistributionNetwork,
            fc_layer_params=fc_layer_params,
            continuous_projection_net_ctor=continuous_projection_net_ctor)

        critic_network = partial(CriticNetwork,
                                 output_tensor_spec=env.reward_spec(),
                                 joint_fc_layer_params=fc_layer_params)

        alg = SacAlgorithm(observation_spec=obs_spec,
                           action_spec=action_spec,
                           actor_network_cls=actor_network,
                           critic_network_cls=critic_network,
                           use_parallel_network=use_parallel_network,
                           # Entropy reward is only enabled for the
                           # scalar-reward case.
                           use_entropy_reward=reward_dim == 1,
                           env=env,
                           config=config,
                           actor_optimizer=alf.optimizers.Adam(lr=1e-2),
                           critic_optimizer=alf.optimizers.Adam(lr=1e-2),
                           alpha_optimizer=alf.optimizers.Adam(lr=1e-2),
                           debug_summaries=False,
                           name="MySAC")

        eval_env.reset()
        for i in range(700):
            alg.train_iter()
            if i < config.initial_collect_steps:
                continue
            eval_env.reset()
            eval_time_step = unroll(eval_env, alg, steps_per_episode - 1)
            logging.log_every_n_seconds(
                logging.INFO,
                "%d reward=%f" % (i, float(eval_time_step.reward.mean())),
                n_seconds=1)

        self.assertAlmostEqual(1.0,
                               float(eval_time_step.reward.mean()),
                               delta=0.3)
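
Every example evaluates by calling unroll(eval_env, algorithm, steps, ...) and reading the reward of the returned time step; that helper is defined in the surrounding test modules and is not shown here. The sketch below only illustrates the contract the tests rely on and is not ALF's implementation; the environment and algorithm methods it calls (current_time_step, get_initial_predict_state, predict_step) are assumptions whose exact signatures vary across ALF versions.

# Hypothetical sketch of an unroll() helper, for illustration only.
def unroll(env, algorithm, steps, epsilon_greedy=0.1):
    """Step env with the algorithm's prediction policy for `steps` steps
    and return the final TimeStep so the caller can inspect its reward."""
    # epsilon_greedy would control exploration during prediction in the real
    # helper; this simplified sketch ignores it.
    time_step = env.current_time_step()
    policy_state = algorithm.get_initial_predict_state(env.batch_size)
    for _ in range(steps):
        policy_step = algorithm.predict_step(time_step, policy_state)
        time_step = env.step(policy_step.output)
        policy_state = policy_step.state
    return time_step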