def test_networkbody_lstm():
    torch.manual_seed(0)
    obs_size = 4
    seq_len = 6
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings(sequence_length=seq_len, memory_size=12)
    )
    obs_shapes = [(obs_size,)]

    networkbody = NetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-4)
    sample_obs = torch.ones((seq_len, obs_size))

    for _ in range(300):
        encoded, _ = networkbody(
            [sample_obs], memories=torch.ones(1, 1, 12), sequence_length=seq_len
        )
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
def test_actor_critic(ac_type, lstm):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None, normalize=True
    )
    obs_spec = create_observation_specs_with_shapes([(obs_size,)])
    act_size = 2
    mask = torch.ones([1, act_size * 2])
    stream_names = [f"stream_name{n}" for n in range(4)]
    # action_spec = ActionSpec.create_continuous(act_size[0])
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    actor = ac_type(obs_spec, network_settings, action_spec, stream_names)
    if lstm:
        sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (1, network_settings.memory.sequence_length, actor.memory_size)
        )
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None, the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs], memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1,)

    # Test get action stats and value
    action, log_probs, entropies, value_out, mem_out = actor.get_action_stats_and_value(
        [sample_obs], memories=memories, masks=mask
    )
    if lstm:
        assert action.continuous_tensor.shape == (64, 2)
    else:
        assert action.continuous_tensor.shape == (1, 2)

    assert len(action.discrete_list) == 2
    for _disc in action.discrete_list:
        if lstm:
            assert _disc.shape == (64, 1)
        else:
            assert _disc.shape == (1, 1)

    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
        else:
            assert value_out[stream].shape == (1,)
def test_bad_config(dummy_config):
    # dummy_config is the pytest fixture used by the other trainer tests in this module
    brain_params = make_brain_parameters(
        discrete_action=False, visual_inputs=0, vec_obs_size=6
    )
    # Test that we throw an error if we have sequence length greater than batch size
    with pytest.raises(TrainerConfigError):
        TrainerSettings(
            network_settings=NetworkSettings(
                memory=NetworkSettings.MemorySettings(sequence_length=64)
            ),
            hyperparameters=PPOSettings(batch_size=32),
        )
        _ = PPOTrainer(brain_params, 0, dummy_config, True, False, 0, "0")
def test_multinetworkbody_lstm(with_actions):
    torch.manual_seed(0)
    obs_size = 4
    act_size = 2
    seq_len = 16
    n_agents = 3
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings(sequence_length=seq_len, memory_size=12)
    )
    obs_shapes = [(obs_size,)]
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings, action_spec
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-4)
    sample_obs = [[0.1 * torch.ones((seq_len, obs_size))] for _ in range(n_agents)]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((seq_len, 2)),
            [0.1 * torch.ones(seq_len) for _ in range(act_size)],
        )
        for _ in range(n_agents - 1)
    ]

    for _ in range(300):
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs[:1],
                obs=sample_obs[1:],
                actions=sample_act,
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        else:
            encoded, _ = networkbody(
                obs_only=sample_obs,
                obs=[],
                actions=[],
                memories=torch.ones(1, 1, 12),
                sequence_length=seq_len,
            )
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
def test_networkbody_visual():
    torch.manual_seed(0)
    vec_obs_size = 4
    obs_size = (84, 84, 3)
    network_settings = NetworkSettings()
    obs_shapes = [(vec_obs_size,), obs_size]

    networkbody = NetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3)
    sample_obs = 0.1 * torch.ones((1, 84, 84, 3))
    sample_vec_obs = torch.ones((1, vec_obs_size))
    obs = [sample_vec_obs] + [sample_obs]

    for _ in range(150):
        encoded, _ = networkbody(obs)
        assert encoded.shape == (1, network_settings.hidden_units)
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten().tolist():
        assert _enc == pytest.approx(1.0, abs=0.1)
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = NNPolicy(
        0,
        mock_behavior_specs,
        trainer_config,
        False,
        "test",
        False,
        tanhresample,
        tanhresample,
    )
    with policy.graph.as_default():
        bc_module = BCModule(
            policy,
            policy_learning_rate=trainer_config.hyperparameters.learning_rate,
            default_batch_size=trainer_config.hyperparameters.batch_size,
            default_num_epoch=3,
            settings=bc_settings,
        )
    policy.initialize_or_load()  # Normally the optimizer calls this after the BCModule is created
    return bc_module
def test_normalization():
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    policy = NNPolicy(
        0,
        behavior_spec,
        TrainerSettings(network_settings=NetworkSettings(normalize=True)),
        False,
        "testdir",
        False,
    )

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 6
    assert mean[0] == 0.5
    # Note: variance is divided by number of steps, and initialized to 1 to avoid
    # divide by 0. The right answer is 0.25
    assert (variance[0] - 1) / steps == 0.25

    # Make another update, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_space=[2],
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert (variance[0] - 1) / steps == pytest.approx(0.152, abs=0.01)
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        False,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    trainer = PPOTrainer(mock_brain.brain_name, 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(mock_brain.brain_name, mock_brain)
    trainer.add_policy(mock_brain.brain_name, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_brain)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]

    trainer.update_buffer = buffer
    trainer._update_policy()
def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None:
    super().__init__()
    self._action_spec = specs.action_spec
    state_encoder_settings = NetworkSettings(
        normalize=False,
        hidden_units=settings.encoding_size,
        num_layers=2,
        vis_encode_type=EncoderType.SIMPLE,
        memory=None,
    )
    self._state_encoder = NetworkBody(specs.observation_specs, state_encoder_settings)

    self._action_flattener = ActionFlattener(self._action_spec)

    self.inverse_model_action_encoding = torch.nn.Sequential(
        LinearEncoder(2 * settings.encoding_size, 1, 256)
    )

    if self._action_spec.continuous_size > 0:
        self.continuous_action_prediction = linear_layer(
            256, self._action_spec.continuous_size
        )
    if self._action_spec.discrete_size > 0:
        self.discrete_action_prediction = linear_layer(
            256, sum(self._action_spec.discrete_branches)
        )

    self.forward_model_next_state_prediction = torch.nn.Sequential(
        LinearEncoder(
            settings.encoding_size + self._action_flattener.flattened_size, 1, 256
        ),
        linear_layer(256, settings.encoding_size),
    )
def test_normalizer_after_load(tmp_path):
    behavior_spec = mb.setup_test_behavior_specs(
        use_discrete=True, use_visual=False, vector_action_space=[2], vector_obs_space=1
    )
    time_horizon = 6
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    # Change half of the obs to 0
    for i in range(3):
        trajectory.steps[i].obs[0] = np.zeros(1, dtype=np.float32)
    trainer_params = TrainerSettings(network_settings=NetworkSettings(normalize=True))
    policy = TFPolicy(0, behavior_spec, trainer_params)

    trajectory_buffer = trajectory.to_agentbuffer()
    policy.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy.sess.run(
        [policy.normalization_steps, policy.running_mean, policy.running_variance]
    )

    assert steps == 6
    assert mean[0] == 0.5
    assert variance[0] / steps == pytest.approx(0.25, abs=0.01)

    # Save ckpt and load into another policy
    path1 = os.path.join(tmp_path, "runid1")
    model_saver = TFModelSaver(trainer_params, path1)
    model_saver.register(policy)
    mock_brain_name = "MockBrain"
    model_saver.save_checkpoint(mock_brain_name, 6)
    assert len(os.listdir(tmp_path)) > 0

    policy1 = TFPolicy(0, behavior_spec, trainer_params)
    model_saver = TFModelSaver(trainer_params, path1, load=True)
    model_saver.register(policy1)
    model_saver.initialize_or_load(policy1)

    # Make another update to new policy, this time with all 1's
    time_horizon = 10
    trajectory = make_fake_trajectory(
        length=time_horizon,
        max_step_complete=True,
        observation_shapes=[(1,)],
        action_spec=behavior_spec.action_spec,
    )
    trajectory_buffer = trajectory.to_agentbuffer()
    policy1.update_normalization(trajectory_buffer["vector_obs"])

    # Check that the running mean and variance is correct
    steps, mean, variance = policy1.sess.run(
        [policy1.normalization_steps, policy1.running_mean, policy1.running_variance]
    )

    assert steps == 16
    assert mean[0] == 0.8125
    assert variance[0] / steps == pytest.approx(0.152, abs=0.01)
def create_optimizer_mock(
    trainer_config, reward_signal_config, use_rnn, use_discrete, use_visual
):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = trainer_config
    trainer_settings.reward_signals = reward_signal_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = NNPolicy(
        0, mock_specs, trainer_settings, False, "test", False, create_tf_graph=False
    )
    if trainer_settings.trainer_type == TrainerType.SAC:
        optimizer = SACOptimizer(policy, trainer_settings)
    else:
        optimizer = PPOOptimizer(policy, trainer_settings)
    return optimizer
def test_multinetworkbody_num_agents(with_actions):
    torch.manual_seed(0)
    act_size = 2
    obs_size = 4
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size,)]
    action_spec = ActionSpec(act_size, tuple(act_size for _ in range(act_size)))
    networkbody = MultiAgentNetworkBody(
        create_observation_specs_with_shapes(obs_shapes), network_settings, action_spec
    )
    sample_obs = [[0.1 * torch.ones((1, obs_size))]]
    # simulate baseline in POCA
    sample_act = [
        AgentAction(
            0.1 * torch.ones((1, 2)), [0.1 * torch.ones(1) for _ in range(act_size)]
        )
    ]

    for n_agent, max_so_far in [(1, 1), (5, 5), (4, 5), (10, 10), (5, 10), (1, 10)]:
        if with_actions:
            encoded, _ = networkbody(
                obs_only=sample_obs * (n_agent - 1), obs=sample_obs, actions=sample_act
            )
        else:
            encoded, _ = networkbody(obs_only=sample_obs * n_agent, obs=[], actions=[])
        # look at the last value of the hidden units (the number of agents)
        target = (n_agent * 1.0 / max_so_far) * 2 - 1
        assert abs(encoded[0, -1].item() - target) < 1e-6
        assert encoded[0, -1].item() <= 1
        assert encoded[0, -1].item() >= -1
def test_recurrent_poca(action_sizes, is_multiagent):
    if is_multiagent:
        # This is not a recurrent environment, just check if LSTM doesn't crash
        env = MultiAgentEnvironment(
            [BRAIN_NAME], action_sizes=action_sizes, num_agents=2
        )
    else:
        # Actually test LSTM here
        env = MemoryEnvironment([BRAIN_NAME], action_sizes=action_sizes)
    new_network_settings = attr.evolve(
        POCA_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        POCA_TORCH_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=128,
    )
    config = attr.evolve(
        POCA_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=500 if is_multiagent else 6000,
    )
    check_environment_trains(
        env, {BRAIN_NAME: config}, success_threshold=None if is_multiagent else 0.9
    )
def create_policy_mock(
    dummy_config: TrainerSettings,
    use_rnn: bool = False,
    use_discrete: bool = True,
    use_visual: bool = False,
    model_path: str = "",
    load: bool = False,
    seed: int = 0,
) -> TFPolicy:
    mock_spec = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = dummy_config
    trainer_settings.keep_checkpoints = 3
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TFPolicy(
        seed, mock_spec, trainer_settings, model_path=model_path, load=load
    )
    return policy
def test_valuenetwork():
    torch.manual_seed(0)
    obs_size = 4
    num_outputs = 2
    network_settings = NetworkSettings()
    obs_spec = create_observation_specs_with_shapes([(obs_size,)])

    stream_names = [f"stream_name{n}" for n in range(4)]
    value_net = ValueNetwork(
        stream_names, obs_spec, network_settings, outputs_per_stream=num_outputs
    )
    optimizer = torch.optim.Adam(value_net.parameters(), lr=3e-3)

    for _ in range(50):
        sample_obs = torch.ones((1, obs_size))
        values, _ = value_net([sample_obs])
        loss = 0
        for s_name in stream_names:
            assert values[s_name].shape == (1, num_outputs)
            # Try to force output to 1
            loss += torch.nn.functional.mse_loss(
                values[s_name], torch.ones((1, num_outputs))
            )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for value in values.values():
        for _out in value.tolist():
            assert _out[0] == pytest.approx(1.0, abs=0.1)
def test_recurrent_sac(use_discrete):
    step_size = 0.5 if use_discrete else 0.2
    env = MemoryEnvironment(
        [BRAIN_NAME], use_discrete=use_discrete, step_size=step_size
    )
    new_networksettings = attr.evolve(
        SAC_TF_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16, sequence_length=16),
    )
    new_hyperparams = attr.evolve(
        SAC_TF_CONFIG.hyperparameters,
        batch_size=128,
        learning_rate=1e-3,
        buffer_init_steps=1000,
        steps_per_update=2,
    )
    config = attr.evolve(
        SAC_TF_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_networksettings,
        max_steps=5000,
        framework=FrameworkType.TENSORFLOW,
    )
    _check_environment_trains(env, {BRAIN_NAME: config})
def __init__(self, specs: BehaviorSpec, settings: CuriositySettings) -> None:
    super().__init__()
    self._policy_specs = specs
    state_encoder_settings = NetworkSettings(
        normalize=False,
        hidden_units=settings.encoding_size,
        num_layers=2,
        vis_encode_type=EncoderType.SIMPLE,
        memory=None,
    )
    self._state_encoder = NetworkBody(specs.observation_shapes, state_encoder_settings)

    self._action_flattener = ModelUtils.ActionFlattener(specs)

    self.inverse_model_action_prediction = torch.nn.Sequential(
        LinearEncoder(2 * settings.encoding_size, 1, 256),
        linear_layer(256, self._action_flattener.flattened_size),
    )

    self.forward_model_next_state_prediction = torch.nn.Sequential(
        LinearEncoder(
            settings.encoding_size + self._action_flattener.flattened_size, 1, 256
        ),
        linear_layer(256, settings.encoding_size),
    )
def test_networkbody_vector():
    torch.manual_seed(0)
    obs_size = 4
    network_settings = NetworkSettings()
    obs_shapes = [(obs_size,)]

    networkbody = NetworkBody(
        create_observation_specs_with_shapes(obs_shapes),
        network_settings,
        encoded_act_size=2,
    )
    optimizer = torch.optim.Adam(networkbody.parameters(), lr=3e-3)
    sample_obs = 0.1 * torch.ones((1, obs_size))
    sample_act = 0.1 * torch.ones((1, 2))

    for _ in range(300):
        encoded, _ = networkbody([sample_obs], sample_act)
        assert encoded.shape == (1, network_settings.hidden_units)
        # Try to force output to 1
        loss = torch.nn.functional.mse_loss(encoded, torch.ones(encoded.shape))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # In the last step, values should be close to 1
    for _enc in encoded.flatten():
        assert _enc == pytest.approx(1.0, abs=0.1)
def test_actor_critic(ac_type, lstm):
    obs_size = 4
    network_settings = NetworkSettings(
        memory=NetworkSettings.MemorySettings() if lstm else None
    )
    obs_shapes = [(obs_size,)]
    act_size = [2]
    stream_names = [f"stream_name{n}" for n in range(4)]
    actor = ac_type(
        obs_shapes, network_settings, ActionType.CONTINUOUS, act_size, stream_names
    )
    if lstm:
        sample_obs = torch.ones((1, network_settings.memory.sequence_length, obs_size))
        memories = torch.ones(
            (
                1,
                network_settings.memory.sequence_length,
                network_settings.memory.memory_size,
            )
        )
    else:
        sample_obs = torch.ones((1, obs_size))
        memories = torch.tensor([])
        # memories isn't always set to None, the network should be able to
        # deal with that.
    # Test critic pass
    value_out, memories_out = actor.critic_pass([sample_obs], [], memories=memories)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
            assert memories_out.shape == memories.shape
        else:
            assert value_out[stream].shape == (1,)

    # Test get_dist_and_value
    dists, value_out, mem_out = actor.get_dist_and_value(
        [sample_obs], [], memories=memories
    )
    if mem_out is not None:
        assert mem_out.shape == memories.shape
    for dist in dists:
        assert isinstance(dist, GaussianDistInstance)
    for stream in stream_names:
        if lstm:
            assert value_out[stream].shape == (
                network_settings.memory.sequence_length,
            )
        else:
            assert value_out[stream].shape == (1,)
def test_load_policy_different_hidden_units(tmp_path, vis_encode_type):
    path1 = os.path.join(tmp_path, "runid1")
    trainer_params = TrainerSettings()
    trainer_params.network_settings = NetworkSettings(
        hidden_units=12, vis_encode_type=EncoderType(vis_encode_type)
    )
    policy = create_policy_mock(trainer_params, use_visual=True)
    conv_params = [mod for mod in policy.actor.parameters() if len(mod.shape) > 2]

    model_saver = TorchModelSaver(trainer_params, path1)
    model_saver.register(policy)
    model_saver.initialize_or_load(policy)
    policy.set_step(2000)
    mock_brain_name = "MockBrain"
    model_saver.save_checkpoint(mock_brain_name, 2000)

    # Try load from this path
    trainer_params2 = TrainerSettings()
    trainer_params2.network_settings = NetworkSettings(
        hidden_units=10, vis_encode_type=EncoderType(vis_encode_type)
    )
    model_saver2 = TorchModelSaver(trainer_params2, path1, load=True)
    policy2 = create_policy_mock(trainer_params2, use_visual=True)
    conv_params2 = [mod for mod in policy2.actor.parameters() if len(mod.shape) > 2]

    # asserts convolutions have different parameters before load
    for conv1, conv2 in zip(conv_params, conv_params2):
        assert not torch.equal(conv1, conv2)
    # asserts layers still have different dimensions
    for mod1, mod2 in zip(policy.actor.parameters(), policy2.actor.parameters()):
        if mod1.shape[0] == 12:
            assert mod2.shape[0] == 10

    model_saver2.register(policy2)
    model_saver2.initialize_or_load(policy2)

    # asserts convolutions have same parameters after load
    for conv1, conv2 in zip(conv_params, conv_params2):
        assert torch.equal(conv1, conv2)
    # asserts layers still have different dimensions
    for mod1, mod2 in zip(policy.actor.parameters(), policy2.actor.parameters()):
        if mod1.shape[0] == 12:
            assert mod2.shape[0] == 10
def __init__(self, specs: BehaviorSpec, settings: RNDSettings) -> None:
    super().__init__()
    state_encoder_settings = NetworkSettings(
        normalize=True,
        hidden_units=settings.encoding_size,
        num_layers=3,
        vis_encode_type=EncoderType.SIMPLE,
        memory=None,
    )
    self._encoder = NetworkBody(specs.observation_specs, state_encoder_settings)
def test_trainer_update_policy(
    dummy_config, curiosity_dummy_config, use_discrete  # noqa: F811
):
    mock_behavior_spec = mb.setup_test_behavior_specs(
        use_discrete,
        False,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_params = dummy_config
    trainer_params.network_settings.memory = NetworkSettings.MemorySettings(
        memory_size=10, sequence_length=16
    )

    # Test curiosity reward signal
    trainer_params.reward_signals = curiosity_dummy_config
    mock_brain_name = "MockBrain"
    behavior_id = BehaviorIdentifiers.from_name_behavior_id(mock_brain_name)
    trainer = PPOTrainer("test", 0, trainer_params, True, False, 0, "0")
    policy = trainer.create_policy(behavior_id, mock_behavior_spec)
    trainer.add_policy(behavior_id, policy)
    # Test update with sequence length smaller than batch size
    buffer = mb.simulate_rollout(BUFFER_INIT_SAMPLES, mock_behavior_spec)
    # Mock out reward signal eval
    buffer["extrinsic_rewards"] = buffer["environment_rewards"]
    buffer["extrinsic_returns"] = buffer["environment_rewards"]
    buffer["extrinsic_value_estimates"] = buffer["environment_rewards"]
    buffer["curiosity_rewards"] = buffer["environment_rewards"]
    buffer["curiosity_returns"] = buffer["environment_rewards"]
    buffer["curiosity_value_estimates"] = buffer["environment_rewards"]
    buffer["advantages"] = buffer["environment_rewards"]
    # NOTE: This is because TF outputs the log probs of all actions whereas PyTorch does not
    if use_discrete:
        n_agents = len(buffer["discrete_log_probs"])
        buffer["discrete_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["discrete_log_probs"].append(
                np.ones(
                    int(sum(mock_behavior_spec.action_spec.discrete_branches)),
                    dtype=np.float32,
                )
            )
    else:
        n_agents = len(buffer["continuous_log_probs"])
        buffer["continuous_log_probs"].reset_field()
        for _ in range(n_agents):
            buffer["continuous_log_probs"].append(
                np.ones(
                    mock_behavior_spec.action_spec.continuous_size, dtype=np.float32
                )
            )
    trainer.update_buffer = buffer
    trainer._update_policy()
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE if not use_visual else 0,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=12)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_brain, trainer_settings)
    optimizer = TorchSACOptimizer(policy, trainer_settings)
    return optimizer
def convert_behaviors(old_trainer_config: Dict[str, Any]) -> Dict[str, Any]:
    all_behavior_config_dict = {}
    default_config = old_trainer_config.get("default", {})
    for behavior_name, config in old_trainer_config.items():
        if behavior_name != "default":
            config = default_config.copy()
            config.update(old_trainer_config[behavior_name])

            # Convert to split TrainerSettings, Hyperparameters, NetworkSettings
            # Set trainer_type and get appropriate hyperparameter settings
            try:
                trainer_type = config["trainer"]
            except KeyError:
                raise TrainerConfigError(
                    "Config doesn't specify a trainer type. "
                    "Please specify trainer: in your config."
                )
            new_config = {}
            new_config["trainer_type"] = trainer_type
            hyperparam_cls = TrainerType(trainer_type).to_settings()
            # Try to absorb as much as possible into the hyperparam_cls
            new_config["hyperparameters"] = cattr.structure(config, hyperparam_cls)

            # Try to absorb as much as possible into the network settings
            new_config["network_settings"] = cattr.structure(config, NetworkSettings)
            # Deal with recurrent
            try:
                if config["use_recurrent"]:
                    new_config["network_settings"].memory = NetworkSettings.MemorySettings(
                        sequence_length=config["sequence_length"],
                        memory_size=config["memory_size"],
                    )
            except KeyError:
                raise TrainerConfigError(
                    "Config doesn't specify use_recurrent. "
                    "Please specify true or false for use_recurrent in your config."
                )

            # Absorb the rest into the base TrainerSettings
            for key, val in config.items():
                if key in attr.fields_dict(TrainerSettings):
                    new_config[key] = val

            # Structure the whole thing
            all_behavior_config_dict[behavior_name] = cattr.structure(
                new_config, TrainerSettings
            )
    return all_behavior_config_dict
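# Illustrative only: a minimal sketch of how convert_behaviors might be called on an
# old-style flat trainer config. The keys below ("trainer", "use_recurrent",
# "sequence_length", "memory_size") are the ones the function itself reads; the
# behavior name and values are hypothetical, not taken from a real project file.
old_config = {
    "default": {"trainer": "ppo", "batch_size": 1024, "use_recurrent": False},
    "MyBehavior": {
        "trainer": "ppo",
        "use_recurrent": True,
        "sequence_length": 64,
        "memory_size": 128,
    },
}
new_config = convert_behaviors(old_config)
# new_config["MyBehavior"] should come back as a TrainerSettings with the flat keys
# split into hyperparameters, network_settings (including MemorySettings), and the
# remaining base TrainerSettings fields.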
def create_bc_module(mock_behavior_specs, bc_settings, use_rnn, tanhresample):
    # model_path = env.external_brain_names[0]
    trainer_config = TrainerSettings()
    trainer_config.network_settings.memory = (
        NetworkSettings.MemorySettings() if use_rnn else None
    )
    policy = TorchPolicy(
        0, mock_behavior_specs, trainer_config, tanhresample, tanhresample
    )
    bc_module = BCModule(
        policy,
        settings=bc_settings,
        policy_learning_rate=trainer_config.hyperparameters.learning_rate,
        default_batch_size=trainer_config.hyperparameters.batch_size,
        default_num_epoch=3,
    )
    return bc_module
def create_test_ppo_optimizer(dummy_config, use_rnn, use_discrete, use_visual):
    mock_specs = mb.setup_test_behavior_specs(
        use_discrete,
        use_visual,
        vector_action_space=DISCRETE_ACTION_SPACE
        if use_discrete
        else VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
    )

    trainer_settings = attr.evolve(dummy_config)
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = TorchPolicy(0, mock_specs, trainer_settings, "test", False)
    optimizer = TorchPPOOptimizer(policy, trainer_settings)
    return optimizer
def __init__(self, specs: BehaviorSpec, settings: GAILSettings) -> None:
    super().__init__()
    self._policy_specs = specs
    self._use_vail = settings.use_vail
    self._settings = settings

    state_encoder_settings = NetworkSettings(
        normalize=False,
        hidden_units=settings.encoding_size,
        num_layers=2,
        vis_encode_type=EncoderType.SIMPLE,
        memory=None,
    )
    self._state_encoder = NetworkBody(specs.observation_shapes, state_encoder_settings)

    self._action_flattener = ModelUtils.ActionFlattener(specs)

    encoder_input_size = settings.encoding_size
    if settings.use_actions:
        encoder_input_size += (
            self._action_flattener.flattened_size + 1
        )  # + 1 is for done

    self.encoder = torch.nn.Sequential(
        linear_layer(encoder_input_size, settings.encoding_size),
        Swish(),
        linear_layer(settings.encoding_size, settings.encoding_size),
        Swish(),
    )

    estimator_input_size = settings.encoding_size
    if settings.use_vail:
        estimator_input_size = self.z_size
        self._z_sigma = torch.nn.Parameter(
            torch.ones((self.z_size), dtype=torch.float), requires_grad=True
        )
        self._z_mu_layer = linear_layer(
            settings.encoding_size,
            self.z_size,
            kernel_init=Initialization.KaimingHeNormal,
            kernel_gain=0.1,
        )
        self._beta = torch.nn.Parameter(
            torch.tensor(self.initial_beta, dtype=torch.float), requires_grad=False
        )

    self._estimator = torch.nn.Sequential(
        linear_layer(estimator_input_size, 1), torch.nn.Sigmoid()
    )
def test_recurrent_ppo(use_discrete):
    env = MemoryEnvironment([BRAIN_NAME], use_discrete=use_discrete)
    new_network_settings = attr.evolve(
        PPO_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        PPO_CONFIG.hyperparameters, learning_rate=1.0e-3, batch_size=64, buffer_size=128
    )
    config = attr.evolve(
        PPO_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=5000,
    )
    _check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
def test_hybrid_recurrent_ppo():
    env = MemoryEnvironment([BRAIN_NAME], action_sizes=(1, 1), step_size=0.5)
    new_network_settings = attr.evolve(
        PPO_TORCH_CONFIG.network_settings,
        memory=NetworkSettings.MemorySettings(memory_size=16),
    )
    new_hyperparams = attr.evolve(
        PPO_TORCH_CONFIG.hyperparameters,
        learning_rate=1.0e-3,
        batch_size=64,
        buffer_size=512,
    )
    config = attr.evolve(
        PPO_TORCH_CONFIG,
        hyperparameters=new_hyperparams,
        network_settings=new_network_settings,
        max_steps=3000,
    )
    check_environment_trains(env, {BRAIN_NAME: config}, success_threshold=0.9)
def create_sac_optimizer_mock(dummy_config, use_rnn, use_discrete, use_visual):
    mock_brain = mb.setup_mock_brain(
        use_discrete,
        use_visual,
        vector_action_space=VECTOR_ACTION_SPACE,
        vector_obs_space=VECTOR_OBS_SPACE,
        discrete_action_space=DISCRETE_ACTION_SPACE,
    )
    trainer_settings = dummy_config
    trainer_settings.network_settings.memory = (
        NetworkSettings.MemorySettings(sequence_length=16, memory_size=10)
        if use_rnn
        else None
    )
    policy = NNPolicy(
        0, mock_brain, trainer_settings, False, "test", False, create_tf_graph=False
    )
    optimizer = SACOptimizer(policy, trainer_settings)
    return optimizer