def test_sac_save_load_buffer(tmpdir, dummy_config):
    """Check that a saved replay buffer is restored by a freshly built trainer.

    A first trainer gets its update buffer filled with simulated experiences
    and is saved with ``save_replay_buffer`` enabled. A second trainer is then
    built against the same artifact location and must report the same number
    of experiences.

    Artifacts are written under pytest's per-test ``tmpdir`` instead of a
    hard-coded relative directory, so the test neither pollutes the working
    directory nor can be satisfied by a stale buffer from an earlier run.
    """
    # NOTE(review): the last SACTrainer argument is presumably the artifact
    # directory (it used to be the literal "testdir") -- confirm.
    artifact_path = str(tmpdir)

    mock_specs = mb.setup_test_behavior_specs(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.hyperparameters.save_replay_buffer = True
    trainer = SACTrainer("test", 1, trainer_params, True, False, 0, artifact_path)
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    policy = trainer.create_policy(behavior_id, mock_specs)
    trainer.add_policy(behavior_id, policy)

    trainer.update_buffer = mb.simulate_rollout(
        BUFFER_INIT_SAMPLES, policy.behavior_spec
    )
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model()

    # Wipe Trainer and try to load: same arguments except the fifth positional
    # flag, which is flipped from False to True.
    trainer2 = SACTrainer("test", 1, trainer_params, True, True, 0, artifact_path)
    policy = trainer2.create_policy(behavior_id, mock_specs)
    trainer2.add_policy(behavior_id, policy)
    assert trainer2.update_buffer.num_experiences == buffer_len
def test_sac_save_load_buffer(tmpdir, dummy_config):
    """Check that a saved replay buffer is restored by a freshly built trainer.

    A first trainer gets its update buffer filled with simulated experiences
    and is saved with ``save_replay_buffer`` enabled. A second trainer is then
    built against the same artifact location and must report the same number
    of experiences.

    Artifacts are written under pytest's per-test ``tmpdir`` instead of a
    hard-coded relative directory, so the test neither pollutes the working
    directory nor can be satisfied by a stale buffer from an earlier run.
    """
    # NOTE(review): the last SACTrainer argument is presumably the artifact
    # directory (it used to be the literal "testdir") -- confirm.
    artifact_path = str(tmpdir)

    mock_brain = mb.setup_mock_brain(
        False,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_params = dummy_config
    trainer_params.hyperparameters.save_replay_buffer = True
    trainer = SACTrainer(
        mock_brain.brain_name, 1, trainer_params, True, False, 0, artifact_path
    )
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)

    trainer.update_buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, policy.brain)
    buffer_len = trainer.update_buffer.num_experiences
    trainer.save_model(mock_brain.brain_name)

    # Wipe Trainer and try to load: same arguments except the fifth positional
    # flag, which is flipped from False to True.
    trainer2 = SACTrainer(
        mock_brain.brain_name, 1, trainer_params, True, True, 0, artifact_path
    )
    policy = trainer2.create_policy(mock_brain.brain_name, mock_brain)
    trainer2.add_policy(mock_brain.brain_name, policy)
    assert trainer2.update_buffer.num_experiences == buffer_len
def test_add_get_policy(sac_optimizer, dummy_config):
    """Check add_policy/get_policy round-tripping and policy type validation.

    ``sac_optimizer`` is configured here to return a mock optimizer with empty
    ``reward_signals`` (presumably it is a patch of the optimizer class applied
    by a decorator that is not part of this block).

    The test registers a mock ``NNPolicy`` reporting step 2000 and verifies
    that the trainer returns it, adopts its step count, and schedules the next
    summary after it. It then verifies that a policy of the wrong class is
    rejected with ``RuntimeError``.
    """
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    mock_optimizer = mock.Mock()
    mock_optimizer.reward_signals = {}
    sac_optimizer.return_value = mock_optimizer

    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"
    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    policy = mock.Mock(spec=NNPolicy)
    policy.get_current_step.return_value = 2000

    trainer.add_policy(brain_params.brain_name, policy)
    assert trainer.get_policy(brain_params.brain_name) == policy

    # Make sure the summary steps were loaded properly
    assert trainer.get_step == 2000
    assert trainer.next_summary_step > 2000

    # Test incorrect class of policy. Use the same behavior name as the valid
    # call above so that the policy type is the only thing that differs (the
    # whole brain_params object used to be passed here by mistake).
    policy = mock.Mock()
    with pytest.raises(RuntimeError):
        trainer.add_policy(brain_params.brain_name, policy)
def test_sac_save_load_buffer(tmpdir, dummy_config):
    """Round-trip the update buffer through save_model and a second trainer.

    Both the summary and model paths point at pytest's ``tmpdir`` and
    ``save_replay_buffer`` is switched on. The second trainer differs from the
    first only in its fifth positional argument (True instead of False) and
    must end up with as many experiences as the first one held when saved.
    """
    env, mock_brain, _ = mb.setup_mock_env_and_brains(
        mock.Mock(),
        False,
        False,
        num_agents=NUM_AGENTS,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    brain_name = mock_brain.brain_name

    # Everything the trainer writes goes under the per-test temp directory.
    dummy_config["summary_path"] = str(tmpdir)
    dummy_config["model_path"] = str(tmpdir)
    dummy_config["save_replay_buffer"] = True

    # First trainer: fill the buffer with simulated experiences, then save.
    saving_trainer = SACTrainer(brain_name, 1, dummy_config, True, False, 0, 0)
    saving_policy = saving_trainer.create_policy(mock_brain)
    saving_trainer.add_policy(brain_name, saving_policy)
    saving_trainer.update_buffer = mb.simulate_rollout(
        env, saving_trainer.policy, BUFFER_INIT_SAMPLES
    )
    expected_len = saving_trainer.update_buffer.num_experiences
    saving_trainer.save_model(brain_name)

    # Second trainer: built from scratch; its buffer size must match.
    loading_trainer = SACTrainer(brain_name, 1, dummy_config, True, True, 0, 0)
    loading_policy = loading_trainer.create_policy(mock_brain)
    loading_trainer.add_policy(brain_name, loading_policy)
    assert loading_trainer.update_buffer.num_experiences == expected_len
def test_process_trajectory(dummy_config):
    """Feed two fake trajectories to the trainer and check buffer and stats.

    The first trajectory (``max_step_complete=True``) must land in the update
    buffer and leave positive collected rewards. The second
    (``max_step_complete=False``) must reset collected rewards to zero and
    produce "Policy/Extrinsic Reward" statistics with a positive mean.
    """
    reward_key = "Policy/Extrinsic Reward"

    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    dummy_config["summary_path"] = "./summaries/test_trainer_summary"
    dummy_config["model_path"] = "./models/test_trainer_models/TestModel"

    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    sac_policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, sac_policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)

    def feed(max_step_complete):
        """Enqueue a 15-step fake trajectory and let the trainer consume it."""
        trajectory_queue.put(
            make_fake_trajectory(
                length=15,
                max_step_complete=max_step_complete,
                vec_obs_size=6,
                num_vis_obs=0,
                action_space=[2],
            )
        )
        trainer.advance()

    feed(max_step_complete=True)

    # The whole trajectory should now be in the update buffer.
    assert trainer.update_buffer.num_experiences == 15

    # Rewards are still being accumulated for every agent at this point.
    for per_agent in trainer.collected_rewards.values():
        for accumulated in per_agent.values():
            assert accumulated > 0

    # Second trajectory: this one is expected to finish the episode.
    feed(max_step_complete=False)

    # Accumulated rewards must have been reset ...
    for per_agent in trainer.collected_rewards.values():
        for accumulated in per_agent.values():
            assert accumulated == 0

    # ... and reported as real (non-default) statistics.
    assert trainer.stats_reporter.get_stats_summaries(reward_key).num > 0
    assert trainer.stats_reporter.get_stats_summaries(reward_key).mean > 0
def test_add_get_policy(sac_optimizer, mock_create_saver, dummy_config):
    """Register a mock TFPolicy and check it is returned with its step count.

    ``sac_optimizer`` is made to return a mock optimizer with empty
    ``reward_signals``; ``mock_create_saver`` is accepted but not used in the
    body (both are presumably injected by patch decorators outside this block).
    """
    expected_step = 2000

    # The optimizer stub only needs an (empty) reward_signals mapping.
    sac_optimizer.return_value = mock.Mock(reward_signals={})

    trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")

    fake_policy = mock.Mock(spec=TFPolicy)
    fake_policy.get_current_step.return_value = expected_step

    identifiers = BehaviorIdentifiers.from_name_behavior_id(trainer.brain_name)
    trainer.add_policy(identifiers, fake_policy)
    assert trainer.get_policy(identifiers.behavior_id) == fake_policy

    # The trainer's step counter must reflect the step reported by the policy.
    assert trainer.get_step == expected_step
def test_add_get_policy(sac_optimizer, dummy_config):
    """Register a mock NNPolicy, then check a wrong-class policy is rejected.

    ``sac_optimizer`` is made to return a mock optimizer with empty
    ``reward_signals`` (presumably it is injected by a patch decorator outside
    this block).
    """
    behavior_name = "test"
    expected_step = 2000

    # The optimizer stub only needs an (empty) reward_signals mapping.
    sac_optimizer.return_value = mock.Mock(reward_signals={})

    trainer = SACTrainer("test", 0, dummy_config, True, False, 0, "0")

    valid_policy = mock.Mock(spec=NNPolicy)
    valid_policy.get_current_step.return_value = expected_step
    trainer.add_policy(behavior_name, valid_policy)
    assert trainer.get_policy(behavior_name) == valid_policy

    # The trainer's step counter must reflect the step reported by the policy.
    assert trainer.get_step == expected_step

    # A plain Mock is not spec'd as NNPolicy and has to be refused.
    wrong_policy = mock.Mock()
    with pytest.raises(RuntimeError):
        trainer.add_policy(behavior_name, wrong_policy)
def test_advance(dummy_config):
    """Drive SACTrainer.advance() with fake trajectories and check its effects.

    Covers, in order: experiences reaching the update buffer, reward
    accumulation and reset, reward statistics, policy publication on the
    policy queue, and the number of optimizer updates after a policy reporting
    step 200 is re-added.
    """
    reward_key = "Policy/Extrinsic Reward"

    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    dummy_config.hyperparameters.steps_per_update = 20
    dummy_config.hyperparameters.reward_signal_steps_per_update = 20
    dummy_config.hyperparameters.buffer_init_steps = 0

    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    initial_policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, initial_policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)

    def feed(length, max_step_complete):
        """Enqueue a fake trajectory, advance the trainer, return the trajectory."""
        fake = make_fake_trajectory(
            length=length,
            max_step_complete=max_step_complete,
            vec_obs_size=6,
            num_vis_obs=0,
            action_space=[2],
            is_discrete=False,
        )
        trajectory_queue.put(fake)
        trainer.advance()
        return fake

    feed(length=15, max_step_complete=True)

    # The whole trajectory should now be in the update buffer.
    assert trainer.update_buffer.num_experiences == 15

    # Rewards are still being accumulated for every agent at this point.
    for per_agent in trainer.collected_rewards.values():
        for accumulated in per_agent.values():
            assert accumulated > 0

    # Second trajectory: this one is expected to finish the episode.
    feed(length=6, max_step_complete=False)

    # Accumulated rewards must have been reset ...
    for per_agent in trainer.collected_rewards.values():
        for accumulated in per_agent.values():
            assert accumulated == 0

    # ... and reported as real (non-default) statistics.
    assert trainer.stats_reporter.get_stats_summaries(reward_key).num > 0
    assert trainer.stats_reporter.get_stats_summaries(reward_key).mean > 0

    # A policy must have been published by now; get_nowait() raises otherwise.
    policy_queue.get_nowait()

    # With steps_per_update set to 20, five more steps are expected not to
    # trigger another update, so no new policy should be on the queue.
    short_trajectory = feed(length=5, max_step_complete=False)
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()

    # Re-add a policy that reports step 200 (emulating a load from checkpoint)
    # and check that the optimizer is updated the correct number of times.
    restored_policy = trainer.create_policy(brain_params.brain_name, brain_params)
    restored_policy.get_current_step = lambda: 200
    trainer.add_policy(brain_params.brain_name, restored_policy)
    trainer.optimizer.update = mock.Mock(return_value={})
    trainer.optimizer.update_reward_signals = mock.Mock(return_value={})

    # The same short trajectory object is fed again.
    trajectory_queue.put(short_trajectory)
    trainer.advance()

    # Exactly one update of each kind is expected.
    assert trainer.optimizer.update.call_count == 1
    assert trainer.optimizer.update_reward_signals.call_count == 1
def test_advance(dummy_config):
    """Drive SACTrainer.advance() with fake trajectories and check its effects.

    Covers, in order: experiences reaching the update buffer, reward
    accumulation and reset, reward statistics, and policy publication on the
    policy queue.
    """
    reward_key = "Policy/Extrinsic Reward"

    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    dummy_config["output_path"] = "./results/test_trainer_models/TestModel"
    dummy_config["steps_per_update"] = 20

    trainer = SACTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
    sac_policy = trainer.create_policy(brain_params.brain_name, brain_params)
    trainer.add_policy(brain_params.brain_name, sac_policy)

    trajectory_queue = AgentManagerQueue("testbrain")
    policy_queue = AgentManagerQueue("testbrain")
    trainer.subscribe_trajectory_queue(trajectory_queue)
    trainer.publish_policy_queue(policy_queue)

    def feed(length, max_step_complete):
        """Enqueue a fake trajectory and let the trainer consume it."""
        trajectory_queue.put(
            make_fake_trajectory(
                length=length,
                max_step_complete=max_step_complete,
                vec_obs_size=6,
                num_vis_obs=0,
                action_space=[2],
                is_discrete=False,
            )
        )
        trainer.advance()

    feed(length=15, max_step_complete=True)

    # The whole trajectory should now be in the update buffer.
    assert trainer.update_buffer.num_experiences == 15

    # Rewards are still being accumulated for every agent at this point.
    for per_agent in trainer.collected_rewards.values():
        for accumulated in per_agent.values():
            assert accumulated > 0

    # Second trajectory: this one is expected to finish the episode.
    feed(length=6, max_step_complete=False)

    # Accumulated rewards must have been reset ...
    for per_agent in trainer.collected_rewards.values():
        for accumulated in per_agent.values():
            assert accumulated == 0

    # ... and reported as real (non-default) statistics.
    assert trainer.stats_reporter.get_stats_summaries(reward_key).num > 0
    assert trainer.stats_reporter.get_stats_summaries(reward_key).mean > 0

    # A policy must have been published by now; get_nowait() raises otherwise.
    policy_queue.get_nowait()

    # With steps_per_update set to 20, five more steps are expected not to
    # trigger another update, so no new policy should be on the queue.
    feed(length=5, max_step_complete=False)
    with pytest.raises(AgentManagerQueue.Empty):
        policy_queue.get_nowait()