def test_recurrent_poca(action_sizes, is_multiagent):
    if is_multiagent:
        # This is not a recurrent environment, just check if LSTM doesn't crash
        env = MultiAgentEnvironment(
            [BRAIN_NAME], action_sizes=action_sizes, num_agents=2
        )
    else:
        # Actually test LSTM here
        env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
    new_network_settings = attr.evolve(
        POCA_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        POCA_TORCH_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=128,
    )
    config = attr.evolve(
        POCA_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=500 if is_multiagent else 6000,
    )
    check_environment_trains(
        env,
        {BRAIN_NAME: config},
        success_threshold=None if is_multiagent else 0.9,
    )


def test_visual_advanced_sac(vis_encode_type, num_visual):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        action_sizes=(0, 1),
        num_visual=num_visual,
        num_vector=0,
        step_size=0.5,
        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
    )
    new_networksettings = attr.evolve(
        SAC_TORCH_CONFIG.network_settings,
        vis_encode_type=EncoderType(vis_encode_type),
    )
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        batch_size=16,
        learning_rate=3e-4,
        buffer_init_steps=0,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=100,
    )
    # The number of steps is pretty small for these encoders
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)


def test_gail_visual_sac(simple_record, action_sizes):
    demo_path = simple_record(action_sizes, num_visual=1, num_vector=0)
    env = SimpleEnvironment(
        [BRAIN_NAME],
        num_visual=1,
        num_vector=0,
        action_sizes=action_sizes,
        step_size=0.2,
    )
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters, learning_rate=3e-4, batch_size=16
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG,
        reward_signals=reward_signals,
        hyperparameters=hyperparams,
        behavioral_cloning=bc_settings,
        max_steps=500,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


def test_simple_asymm_ghost_fails(action_sizes):
    # Make opponent for asymmetric case
    brain_name_opp = BRAIN_NAME + "Opp"
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
    )
    # This config should fail because the team that is not learning when both have
    # reached max step should be executing the initial, untrained policy.
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=0.0,
        save_steps=5000,
        swap_steps=5000,
        team_change=2000,
    )
    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=3000)
    check_environment_trains(
        env, {BRAIN_NAME: config, brain_name_opp: config}, success_threshold=None
    )
    processed_rewards = [
        default_reward_processor(rewards) for rewards in env.final_rewards.values()
    ]
    success_threshold = 0.9
    assert any(reward > success_threshold for reward in processed_rewards) and any(
        reward < success_threshold for reward in processed_rewards
    )


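# `default_reward_processor` is imported from the shared test helpers and is not
# defined in this file. A minimal sketch of the assumed behavior -- collapsing an
# agent's episode-reward history into one score by averaging the most recent
# episodes. The function name below is hypothetical, for illustration only; see
# the real helper for the exact logic:
def _reference_reward_processor(rewards, last_n_rewards=5):
    # Mean of the most recent episode rewards, as a single float score.
    recent = rewards[-last_n_rewards:]
    return sum(recent) / max(len(recent), 1)

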
def test_simple_ghost(action_sizes):
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
    )
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=2000
    )
    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
    check_environment_trains(env, {BRAIN_NAME: config})


def test_2d_ppo(action_sizes):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=640
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=10000
    )
    check_environment_trains(env, {BRAIN_NAME: config})


def test_visual_poca(num_visual):
    env = MultiAgentEnvironment(
        [BRAIN_NAME], action_sizes=(0, 1), num_agents=2, num_visual=num_visual
    )
    new_hyperparams = attr.evolve(
        POCA_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
    )
    config = attr.evolve(POCA_TORCH_CONFIG, hyperparameters=new_hyperparams)
    check_environment_trains(env, {BRAIN_NAME: config})


def test_2d_sac(action_sizes):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.8)
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters, buffer_init_steps=2000
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=6000
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.8)


def test_visual_ppo(num_visual, action_sizes):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        action_sizes=action_sizes,
        num_visual=num_visual,
        num_vector=0,
        step_size=0.2,
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
    )
    config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
    check_environment_trains(env, {BRAIN_NAME: config})


def test_var_len_obs_ppo(num_vis, num_vector, num_var_len, action_sizes):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        action_sizes=action_sizes,
        num_visual=num_vis,
        num_vector=num_vector,
        num_var_len=num_var_len,
        step_size=0.2,
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
    )
    config = attr.evolve(PPO_TORCH_CONFIG, hyperparameters=new_hyperparams)
    check_environment_trains(env, {BRAIN_NAME: config})


def test_hybrid_ppo(action_size):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
    new_network_settings = attr.evolve(PPO_TORCH_CONFIG.network_settings)
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, batch_size=64, buffer_size=1024
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=10000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


def test_visual_sac(num_visual, action_sizes):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        action_sizes=action_sizes,
        num_visual=num_visual,
        num_vector=0,
        step_size=0.2,
    )
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters, batch_size=16, learning_rate=3e-4
    )
    config = attr.evolve(SAC_TORCH_CONFIG, hyperparameters=new_hyperparams)
    check_environment_trains(env, {BRAIN_NAME: config})


def test_simple_asymm_ghost(action_sizes):
    # Make opponent for asymmetric case
    brain_name_opp = BRAIN_NAME + "Opp"
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", brain_name_opp + "?team=1"], action_sizes=action_sizes
    )
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=1.0,
        save_steps=10000,
        swap_steps=10000,
        team_change=400,
    )
    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=4000)
    check_environment_trains(env, {BRAIN_NAME: config, brain_name_opp: config})


def test_gail(simple_record, action_sizes, trainer_config):
    demo_path = simple_record(action_sizes)
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes, step_size=0.2)
    bc_settings = BehavioralCloningSettings(demo_path=demo_path, steps=1000)
    reward_signals = {
        RewardSignalType.GAIL: GAILSettings(encoding_size=32, demo_path=demo_path)
    }
    config = attr.evolve(
        trainer_config,
        reward_signals=reward_signals,
        behavioral_cloning=bc_settings,
        max_steps=500,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


def test_hybrid_visual_sac(num_visual):
    env = SimpleEnvironment(
        [BRAIN_NAME], num_visual=num_visual, num_vector=0, action_sizes=(1, 1)
    )
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        buffer_size=50000,
        batch_size=128,
        learning_rate=3.0e-4,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=3000
    )
    check_environment_trains(env, {BRAIN_NAME: config})


def test_hybrid_sac(action_size):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_size, step_size=0.8)
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        buffer_size=50000,
        batch_size=256,
        buffer_init_steps=0,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=2200
    )
    check_environment_trains(
        env, {BRAIN_NAME: config}, success_threshold=0.9, training_seed=1336
    )


def test_hybrid_visual_ppo(num_visual, training_seed):
    env = SimpleEnvironment(
        [BRAIN_NAME], num_visual=num_visual, num_vector=0, action_sizes=(1, 1)
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters,
        batch_size=64,
        buffer_size=1024,
        learning_rate=1e-4,
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, max_steps=8000
    )
    check_environment_trains(env, {BRAIN_NAME: config}, training_seed=training_seed)


def test_subprocess_failing_step(num_envs):
    def failing_step_env_factory(_worker_id, _config):
        env = UnexpectedExceptionEnvironment(
            ["1D"], use_discrete=True, to_raise=CustomTestOnlyException
        )
        return env

    env_manager = SubprocessEnvManager(failing_step_env_factory, RunOptions())
    # Expect the exception raised to be routed back up to the top level.
    with pytest.raises(CustomTestOnlyException):
        check_environment_trains(
            failing_step_env_factory(0, []),
            {"1D": ppo_dummy_config()},
            env_manager=env_manager,
            success_threshold=None,
        )
    env_manager.close()


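# `UnexpectedExceptionEnvironment` and `CustomTestOnlyException` come from the
# test-env helpers. A minimal sketch of the assumed behavior: an environment
# whose step() raises the supplied exception, so the subprocess error
# propagation above can be exercised. The class below is illustrative, not the
# real helper:
class _RaisingEnvironmentSketch(SimpleEnvironment):
    def __init__(self, brain_names, to_raise, **kwargs):
        super().__init__(brain_names, **kwargs)
        self.to_raise = to_raise

    def step(self):
        # Simulates an environment crash inside a worker process; the
        # SubprocessEnvManager should surface this at the top level.
        raise self.to_raise()

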
def test_simple_ghost_fails(action_sizes):
    env = SimpleEnvironment(
        [BRAIN_NAME + "?team=0", BRAIN_NAME + "?team=1"], action_sizes=action_sizes
    )
    # This config should fail because the ghosted policy is never swapped with a
    # competent policy. Swap occurs after max step is reached.
    self_play_settings = SelfPlaySettings(
        play_against_latest_model_ratio=1.0, save_steps=2000, swap_steps=4000
    )
    config = attr.evolve(PPO_TORCH_CONFIG, self_play=self_play_settings, max_steps=2500)
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=None)
    processed_rewards = [
        default_reward_processor(rewards) for rewards in env.final_rewards.values()
    ]
    success_threshold = 0.9
    assert any(reward > success_threshold for reward in processed_rewards) and any(
        reward < success_threshold for reward in processed_rewards
    )


def test_var_len_obs_and_goal_ppo(
    num_vis, num_vector, num_var_len, action_sizes, conditioning_type
):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        action_sizes=action_sizes,
        num_visual=num_vis,
        num_vector=num_vector,
        num_var_len=num_var_len,
        step_size=0.2,
        goal_indices=[0],
    )
    new_network = attr.evolve(
        PPO_TORCH_CONFIG.network_settings, goal_conditioning_type=conditioning_type
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG, hyperparameters=new_hyperparams, network_settings=new_network
    )
    check_environment_trains(env, {BRAIN_NAME: config})


def test_hybrid_recurrent_ppo():
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
    new_network_settings = attr.evolve(
        PPO_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=512,
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=3000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)


def test_hybrid_recurrent_sac():
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
    new_networksettings = attr.evolve(
        SAC_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        batch_size=256,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=4000,
    )
    check_environment_trains(env, {BRAIN_NAME: config})


def test_subprocess_env_endtoend(num_envs):
    def simple_env_factory(worker_id, config):
        env = SimpleEnvironment(["1D"], action_sizes=(0, 1))
        return env

    env_manager = SubprocessEnvManager(simple_env_factory, RunOptions(), num_envs)
    # Run PPO using env_manager
    check_environment_trains(
        simple_env_factory(0, []),
        {"1D": ppo_dummy_config()},
        env_manager=env_manager,
        success_threshold=None,
    )
    # Note we can't check the env's rewards directly (since they're in separate
    # processes) so we check the StatsReporter's debug stat writer's last reward.
    assert isinstance(StatsReporter.writers[0], DebugWriter)
    assert all(
        val > 0.7 for val in StatsReporter.writers[0].get_last_rewards().values()
    )
    env_manager.close()


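# `DebugWriter` is the stats writer the test helpers register with
# StatsReporter. A minimal sketch of the assumed contract -- it remembers the
# last cumulative reward reported per behavior name so the assertion above can
# read it back. Illustrative only; the real writer ships alongside
# check_environment_trains and subclasses the trainers' StatsWriter base:
class _DebugWriterSketch:
    def __init__(self):
        self._last_rewards = {}

    def get_last_rewards(self):
        return self._last_rewards

    def write_stats(self, category, values, step):
        # Assumes `values` maps stat names to summaries exposing an
        # aggregated value for the stat.
        stats = values.get("Environment/Cumulative Reward")
        if stats is not None:
            self._last_rewards[category] = stats.aggregated_value

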
def test_var_len_obs_and_goal_poca(num_vis, num_vector, num_var_len, conditioning_type):
    env = MultiAgentEnvironment(
        [BRAIN_NAME],
        action_sizes=(0, 1),
        num_visual=num_vis,
        num_vector=num_vector,
        num_var_len=num_var_len,
        step_size=0.2,
        num_agents=2,
        goal_indices=[0],
    )
    new_network = attr.evolve(
        POCA_TORCH_CONFIG.network_settings, goal_conditioning_type=conditioning_type
    )
    new_hyperparams = attr.evolve(
        POCA_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
    )
    config = attr.evolve(
        POCA_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network,
        max_steps=5000,
    )
    check_environment_trains(env, {BRAIN_NAME: config})


def test_visual_advanced_ppo(vis_encode_type, num_visual):
    env = SimpleEnvironment(
        [BRAIN_NAME],
        action_sizes=(0, 1),
        num_visual=num_visual,
        num_vector=0,
        step_size=0.5,
        vis_obs_size=(5, 5, 5) if vis_encode_type == "match3" else (36, 36, 3),
    )
    new_networksettings = attr.evolve(
        PPO_TORCH_CONFIG.network_settings,
        vis_encode_type=EncoderType(vis_encode_type),
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters, learning_rate=3.0e-4
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=900,
        summary_freq=100,
    )
    # The number of steps is pretty small for these encoders
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.5)


def test_recurrent_sac(action_sizes):
    step_size = 0.2 if action_sizes == (0, 1) else 0.5
    env = MemoryEnvironment(
        [BRAIN_NAME], action_sizes=action_sizes, step_size=step_size
    )
    new_networksettings = attr.evolve(
        SAC_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    new_hyperparams = attr.evolve(
        SAC_TORCH_CONFIG.hyperparameters,
        batch_size=256,
        learning_rate=3e-4,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=4000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, training_seed=1337)


def test_simple_ppo(action_sizes):
    env = SimpleEnvironment([BRAIN_NAME], action_sizes=action_sizes)
    config = attr.evolve(PPO_TORCH_CONFIG)
    check_environment_trains(env, {BRAIN_NAME: config})


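# How these tests are driven: the functions above take parameters such as
# `action_sizes`, `num_visual`, and `trainer_config` but have no callers in this
# excerpt; they are assumed to be filled in by pytest parametrize decorators,
# along these lines (values illustrative -- (0, 1) would be one discrete branch,
# (1, 0) one continuous action):
#
#     @pytest.mark.parametrize("action_sizes", [(0, 1), (1, 0)])
#     def test_simple_ppo(action_sizes):
#         ...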