def test_apex_ddpg_compilation_and_per_worker_epsilon_values(self):
    """Test whether an APEX-DDPGTrainer can be built on all frameworks."""
    config = apex_ddpg.APEX_DDPG_DEFAULT_CONFIG.copy()
    config["num_workers"] = 2
    config["prioritized_replay"] = True
    config["timesteps_per_iteration"] = 100
    config["min_iter_time_s"] = 1
    config["learning_starts"] = 0
    config["optimizer"]["num_replay_buffer_shards"] = 1
    num_iterations = 1

    for _ in framework_iterator(config):
        plain_config = config.copy()
        trainer = apex_ddpg.ApexDDPGTrainer(
            config=plain_config, env="Pendulum-v1")

        # Test per-worker scale distribution.
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_state())
        scale = [i["cur_scale"] for i in infos]
        expected = [
            0.4 ** (1 + (i + 1) / float(config["num_workers"] - 1) * 7)
            for i in range(config["num_workers"])
        ]
        check(scale, [0.0] + expected)

        for _ in range(num_iterations):
            results = trainer.train()
            check_train_results(results)
            print(results)
        check_compute_single_action(trainer)

        # Test again per-worker scale distribution
        # (should not have changed).
        infos = trainer.workers.foreach_policy(
            lambda p, _: p.get_exploration_state())
        scale = [i["cur_scale"] for i in infos]
        check(scale, [0.0] + expected)

        trainer.stop()
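# For reference only (not part of the test above): the per-worker exploration
# scales checked via `expected` follow the Ape-X-style schedule
# 0.4 ** (1 + worker_index / (num_workers - 1) * 7), with the local worker
# pinned at 0.0. Worked out for the two-worker config used in this test:
num_workers = 2
per_worker_scales = [
    0.4 ** (1 + (i + 1) / float(num_workers - 1) * 7)
    for i in range(num_workers)
]
# -> [0.4 ** 8, 0.4 ** 15], i.e. roughly [6.6e-4, 1.1e-6]; the prepended 0.0
# in the check corresponds to the local (evaluation) worker.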
        # Single dense layer mapping the (flat) observation directly to the
        # model outputs.
        input = tf.keras.layers.Input(obs_space.shape, dtype=obs_space.dtype)
        output = tf.keras.layers.Dense(num_outputs, activation=None)
        self.base_model = tf.keras.models.Sequential([input, output])
        self.register_variables(self.base_model.variables)

    def forward(self, input_dict, state, seq_lens):
        return self.base_model(input_dict["obs"]), []


ModelCatalog.register_custom_model("MLPModel", MLPModel)
ModelCatalog.register_custom_model("MLPModelV2", MLPModelV2)

# Instantiate the trainer class matching the algorithm the checkpoint was
# trained with, then load the checkpoint weights.
if algorithm == 'A2C':
    RLAgent = a2c.A2CTrainer(env=env_name, config=config)
elif algorithm == 'APEX_DDPG':
    RLAgent = apex.ApexDDPGTrainer(env=env_name, config=config)
elif algorithm == 'DDPG':
    RLAgent = ddpg.DDPGTrainer(env=env_name, config=config)
elif algorithm == 'IMPALA':
    RLAgent = impala.ImpalaTrainer(env=env_name, config=config)
elif algorithm == 'PPO':
    RLAgent = ppo.PPOTrainer(env=env_name, config=config)
elif algorithm == 'SAC':
    RLAgent = sac.SACTrainer(env=env_name, config=config)
elif algorithm == 'TD3':
    RLAgent = td3.TD3Trainer(env=env_name, config=config)
RLAgent.restore(checkpoint_path)

num_runs = 50
totalRewards = np.empty((num_runs,))
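# A minimal sketch of how totalRewards might be filled from the restored
# trainer, assuming a standard single-agent Gym-style `env` (reset()/step())
# has already been created for env_name; the actual script may instead use a
# multi-agent (e.g. PettingZoo) rollout loop and pass a policy_id to
# compute_action().
for run in range(num_runs):
    obs = env.reset()
    done = False
    episode_reward = 0.0
    while not done:
        action = RLAgent.compute_action(obs)
        obs, reward, done, info = env.step(action)
        episode_reward += reward
    totalRewards[run] = episode_reward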