def create_bc_trainer(dummy_config, is_discrete=False):
    # Build a BCTrainer against a fully mocked Unity environment and pre-fill
    # its demonstration buffer with a simulated rollout.
    mock_env = mock.Mock()
    if is_discrete:
        mock_brain = mb.create_mock_pushblock_brain()
        mock_braininfo = mb.create_mock_braininfo(
            num_agents=12, num_vector_observations=70
        )
    else:
        mock_brain = mb.create_mock_3dball_brain()
        mock_braininfo = mb.create_mock_braininfo(
            num_agents=12, num_vector_observations=8
        )
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()

    trainer_parameters = dummy_config
    trainer_parameters["summary_path"] = "tmp"
    trainer_parameters["model_path"] = "tmp"
    trainer_parameters["demo_path"] = (
        os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
    )
    trainer = BCTrainer(
        mock_brain, trainer_parameters, training=True, load=False, seed=0, run_id=0
    )
    trainer.demonstration_buffer = mb.simulate_rollout(env, trainer.policy, 100)
    return trainer, env

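# Usage sketch (not from the original suite): exercising the discrete branch of
# create_bc_trainer. The update_policy() call and the "Losses/Cloning Loss"
# stat key mirror test_bc_trainer below; the test name and the dummy_config
# fixture wiring are assumptions.
def test_bc_trainer_discrete_sketch(dummy_config):
    trainer, env = create_bc_trainer(dummy_config, is_discrete=True)
    trainer.update_policy()
    assert len(trainer.stats["Losses/Cloning Loss"]) > 0
    env.close()
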
def create_ppo_policy_mock(
    mock_env, dummy_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
    # Build a PPOPolicy over a mocked Unity environment, covering the
    # discrete/continuous action and vector/visual observation combinations.
    if not use_visual:
        mock_brain = mb.create_mock_brainparams(
            vector_action_space_type="discrete" if use_discrete else "continuous",
            vector_action_space_size=DISCRETE_ACTION_SPACE
            if use_discrete
            else VECTOR_ACTION_SPACE,
            vector_observation_space_size=VECTOR_OBS_SPACE,
        )
        mock_braininfo = mb.create_mock_braininfo(
            num_agents=NUM_AGENTS,
            num_vector_observations=VECTOR_OBS_SPACE,
            num_vector_acts=sum(
                DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
            ),
            discrete=use_discrete,
        )
    else:
        mock_brain = mb.create_mock_brainparams(
            vector_action_space_type="discrete" if use_discrete else "continuous",
            vector_action_space_size=DISCRETE_ACTION_SPACE
            if use_discrete
            else VECTOR_ACTION_SPACE,
            vector_observation_space_size=0,
            number_visual_observations=1,
        )
        mock_braininfo = mb.create_mock_braininfo(
            num_agents=NUM_AGENTS,
            num_vis_observations=1,
            num_vector_acts=sum(
                DISCRETE_ACTION_SPACE if use_discrete else VECTOR_ACTION_SPACE
            ),
            discrete=use_discrete,
        )
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()

    trainer_parameters = dummy_config
    model_path = env.brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    trainer_parameters["reward_signals"].update(reward_signal_config)
    trainer_parameters["use_recurrent"] = use_rnn
    policy = PPOPolicy(0, mock_brain, trainer_parameters, False, False)
    return env, policy

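# Usage sketch (assumptions flagged): a test built on create_ppo_policy_mock.
# The mock.patch target "mlagents.envs.UnityEnvironment", the extrinsic
# reward-signal config dict, and the policy.reward_signals attribute are
# assumed, not taken from this file; only the helper's own signature is
# relied on.
@mock.patch("mlagents.envs.UnityEnvironment")
def test_extrinsic_reward_signal_sketch(mock_env, dummy_config):
    extrinsic_config = {"extrinsic": {"strength": 1.0, "gamma": 0.99}}
    env, policy = create_ppo_policy_mock(
        mock_env, dummy_config, extrinsic_config, False, False, False
    )
    # The helper merged the config into trainer_parameters["reward_signals"],
    # so the policy should expose a matching reward signal.
    assert "extrinsic" in policy.reward_signals
    env.close()
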
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    trainer = mock.Mock()
    processor = AgentProcessor(
        trainer,
        policy,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": [0.1, 0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1, 0.1],
        "log_probs": [0.1, 0.1],
    }
    mock_braininfo = mb.create_mock_braininfo(
        num_agents=2,
        num_vector_observations=8,
        num_vector_acts=2,
        num_vis_observations=num_vis_obs,
    )
    for _ in range(5):
        processor.add_experiences(mock_braininfo, mock_braininfo, fake_action_outputs)

    # Assert that two trajectories have been added to the Trainer
    assert len(trainer.process_trajectory.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = trainer.process_trajectory.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

def test_rl_trainer(add_policy_outputs, add_rewards_outputs, num_vis_obs):
    trainer = create_rl_trainer()
    trainer.policy = create_mock_policy()
    fake_action_outputs = {
        "action": [0.1, 0.1],
        "value_heads": {},
        "entropy": np.array([1.0]),
        "learning_rate": 1.0,
    }
    mock_braininfo = mb.create_mock_braininfo(
        num_agents=2,
        num_vector_observations=8,
        num_vector_acts=2,
        num_vis_observations=num_vis_obs,
    )
    trainer.add_experiences(
        create_mock_all_brain_info(mock_braininfo),
        create_mock_all_brain_info(mock_braininfo),
        fake_action_outputs,
    )

    # Remove one of the agents
    next_mock_braininfo = mb.create_mock_braininfo(
        num_agents=1,
        num_vector_observations=8,
        num_vector_acts=2,
        num_vis_observations=num_vis_obs,
    )
    brain_info = trainer.construct_curr_info(next_mock_braininfo)

    # Assert that construct_curr_info worked properly
    assert len(brain_info.agents) == 1
    assert len(brain_info.visual_observations) == num_vis_obs
    assert len(brain_info.vector_observations) == 1
    assert len(brain_info.previous_vector_actions) == 1

    # Test end episode
    trainer.end_episode()
    for agent_id in trainer.episode_steps:
        assert trainer.episode_steps[agent_id] == 0
        assert len(trainer.training_buffer[agent_id]["action"]) == 0
    for rewards in trainer.collected_rewards.values():
        for agent_id in rewards:
            assert rewards[agent_id] == 0

def create_ppo_policy_with_bc_mock(
    mock_env, mock_brain, dummy_config, use_rnn, demo_file
):
    # Build a PPOPolicy with the pretraining (behavioral cloning) module
    # pointed at the given demo file.
    mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8)
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()

    trainer_parameters = dummy_config
    model_path = env.brain_names[0]
    trainer_parameters["model_path"] = model_path
    trainer_parameters["keep_checkpoints"] = 3
    trainer_parameters["use_recurrent"] = use_rnn
    trainer_parameters["pretraining"]["demo_path"] = (
        os.path.dirname(os.path.abspath(__file__)) + "/" + demo_file
    )
    policy = PPOPolicy(0, mock_brain, trainer_parameters, False, False)
    return env, policy

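# Usage sketch (assumptions flagged): driving the pretraining module that
# create_ppo_policy_with_bc_mock configures. The bc_module attribute, its
# update() method, the np.float32 stat values, and the mock.patch target are
# assumptions; "test.demo" is the demo file used elsewhere in this file.
@mock.patch("mlagents.envs.UnityEnvironment")
def test_bcmodule_update_sketch(mock_env, dummy_config):
    mock_brain = mb.create_mock_3dball_brain()
    env, policy = create_ppo_policy_with_bc_mock(
        mock_env, mock_brain, dummy_config, False, "test.demo"
    )
    stats = policy.bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)
    env.close()
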
def test_agentprocessor(num_vis_obs):
    policy = create_mock_policy()
    tqueue = mock.Mock()
    name_behavior_id = "test_brain_name"
    processor = AgentProcessor(
        policy,
        name_behavior_id,
        max_trajectory_length=5,
        stats_reporter=StatsReporter("testcat"),
    )
    fake_action_outputs = {
        "action": [0.1, 0.1],
        "entropy": np.array([1.0], dtype=np.float32),
        "learning_rate": 1.0,
        "pre_action": [0.1, 0.1],
        "log_probs": [0.1, 0.1],
    }
    mock_braininfo = mb.create_mock_braininfo(
        num_agents=2,
        num_vector_observations=8,
        num_vector_acts=2,
        num_vis_observations=num_vis_obs,
    )
    fake_action_info = ActionInfo(
        action=[0.1, 0.1],
        value=[0.1, 0.1],
        outputs=fake_action_outputs,
        agents=mock_braininfo.agents,
    )
    processor.publish_trajectory_queue(tqueue)

    # This is like the initial state after the env reset
    processor.add_experiences(mock_braininfo, ActionInfo([], [], {}, []))
    for _ in range(5):
        processor.add_experiences(mock_braininfo, fake_action_info)

    # Assert that two trajectories have been put on the trajectory queue
    assert len(tqueue.put.call_args_list) == 2

    # Assert that the trajectory is of length 5
    trajectory = tqueue.put.call_args_list[0][0][0]
    assert len(trajectory.steps) == 5

    # Assert that the AgentProcessor is empty
    assert len(processor.experience_buffers[0]) == 0

def create_policy_with_bc_mock(
    mock_env, mock_brain, trainer_config, use_rnn, demo_file
):
    # Build either a PPOPolicy or a SACPolicy, selected by
    # trainer_config["trainer"], with behavioral cloning enabled.
    mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8)
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()

    model_path = env.external_brain_names[0]
    trainer_config["model_path"] = model_path
    trainer_config["keep_checkpoints"] = 3
    trainer_config["use_recurrent"] = use_rnn
    trainer_config["behavioral_cloning"]["demo_path"] = (
        os.path.dirname(os.path.abspath(__file__)) + "/" + demo_file
    )
    policy = (
        PPOPolicy(0, mock_brain, trainer_config, False, False)
        if trainer_config["trainer"] == "ppo"
        else SACPolicy(0, mock_brain, trainer_config, False, False)
    )
    return env, policy

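# Usage sketch (assumptions flagged): create_policy_with_bc_mock dispatches on
# trainer_config["trainer"], so one test body can cover both PPO and SAC. The
# sac_dummy_config fixture is hypothetical (a config with trainer == "sac"),
# and bc_module.update() is the same assumption as in the PPO sketch above.
@mock.patch("mlagents.envs.UnityEnvironment")
def test_sac_bcmodule_update_sketch(mock_env, sac_dummy_config):
    mock_brain = mb.create_mock_3dball_brain()
    env, policy = create_policy_with_bc_mock(
        mock_env, mock_brain, sac_dummy_config, False, "test.demo"
    )
    assert isinstance(policy, SACPolicy)
    stats = policy.bc_module.update()
    for _, item in stats.items():
        assert isinstance(item, np.float32)
    env.close()
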
def test_bc_trainer(mock_env, dummy_config):
    mock_brain = mb.create_mock_3dball_brain()
    mock_braininfo = mb.create_mock_braininfo(num_agents=12, num_vector_observations=8)
    mb.setup_mock_unityenvironment(mock_env, mock_brain, mock_braininfo)
    env = mock_env()

    trainer_parameters = dummy_config
    trainer_parameters["summary_path"] = "tmp"
    trainer_parameters["model_path"] = "tmp"
    trainer_parameters["demo_path"] = (
        os.path.dirname(os.path.abspath(__file__)) + "/test.demo"
    )
    trainer = BCTrainer(
        mock_brain, trainer_parameters, training=True, load=False, seed=0, run_id=0
    )
    trainer.demonstration_buffer = mb.simulate_rollout(env, trainer.policy, 100)

    # One update from the demonstration buffer should record a cloning loss
    trainer.update_policy()
    assert len(trainer.stats["Losses/Cloning Loss"]) > 0
    trainer.increment_step(1)
    assert trainer.step == 1