def test_continuous_action(self):
    action_spec = TensorSpec((4, ))
    alg = ICMAlgorithm(
        action_spec=action_spec,
        observation_spec=self._input_tensor_spec,
        hidden_size=self._hidden_size)
    state = self._input_tensor_spec.zeros(outer_dims=(1, ))
    alg_step = alg.train_step(
        self._time_step._replace(
            prev_action=action_spec.zeros(outer_dims=(1, ))), state)
    # the inverse net should predict a zero action vector
    self.assertTensorClose(
        torch.sum(alg_step.info.loss.extra['inverse_loss']),
        torch.as_tensor(0))
def test_discrete_action(self):
    action_spec = BoundedTensorSpec(
        (), dtype=torch.int64, minimum=0, maximum=3)
    alg = ICMAlgorithm(
        action_spec=action_spec,
        observation_spec=self._input_tensor_spec,
        hidden_size=self._hidden_size)
    state = self._input_tensor_spec.zeros(outer_dims=(1, ))
    alg_step = alg.train_step(
        self._time_step._replace(
            prev_action=action_spec.zeros(outer_dims=(1, ))), state)
    # the inverse net should predict a uniform distribution over the
    # num_actions discrete actions, so the cross-entropy loss equals
    # log(num_actions); a standalone numeric check follows the tests below.
    self.assertTensorClose(
        torch.sum(alg_step.info.loss.extra['inverse_loss']),
        torch.as_tensor(
            math.log(action_spec.maximum - action_spec.minimum + 1)),
        epsilon=1e-4)
def test_agent_steps(self):
    batch_size = 1
    observation_spec = TensorSpec((10, ))
    action_spec = BoundedTensorSpec((), dtype='int64')
    time_step = TimeStep(
        observation=observation_spec.zeros(outer_dims=(batch_size, )),
        prev_action=action_spec.zeros(outer_dims=(batch_size, )))
    actor_net = functools.partial(
        ActorDistributionNetwork, fc_layer_params=(100, ))
    value_net = functools.partial(ValueNetwork, fc_layer_params=(100, ))

    # TODO: add a goal generator and an entropy target algorithm once they
    # are implemented.
    agent = Agent(
        observation_spec=observation_spec,
        action_spec=action_spec,
        rl_algorithm_cls=functools.partial(
            ActorCriticAlgorithm,
            actor_network_ctor=actor_net,
            value_network_ctor=value_net),
        intrinsic_reward_module=ICMAlgorithm(
            action_spec=action_spec, observation_spec=observation_spec))

    predict_state = agent.get_initial_predict_state(batch_size)
    rollout_state = agent.get_initial_rollout_state(batch_size)
    train_state = agent.get_initial_train_state(batch_size)

    pred_step = agent.predict_step(
        time_step, predict_state, epsilon_greedy=0.1)
    self.assertEqual(pred_step.state.irm, ())

    rollout_step = agent.rollout_step(time_step, rollout_state)
    self.assertNotEqual(rollout_step.state.irm, ())

    exp = make_experience(time_step, rollout_step, rollout_state)
    train_step = agent.train_step(exp, train_state)
    self.assertNotEqual(train_step.state.irm, ())
    self.assertTensorEqual(rollout_step.state.irm, train_step.state.irm)
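# A minimal standalone sanity check (not part of the original tests) for the
# value asserted in test_discrete_action above: if the inverse network
# predicts a uniform distribution over num_actions discrete actions, the
# cross-entropy loss is log(num_actions) regardless of the target action.
# The helper name is illustrative only.
def _uniform_inverse_loss(num_actions=4):
    import torch
    import torch.nn.functional as F
    # All-zero logits correspond to a uniform categorical distribution.
    logits = torch.zeros(1, num_actions)
    target = torch.zeros(1, dtype=torch.int64)
    # For num_actions=4 this returns ~1.3863 == math.log(4).
    return F.cross_entropy(logits, target)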
def create_ac_algorithm(env,
                        actor_fc_layers=(200, 100),
                        value_fc_layers=(200, 100),
                        encoding_conv_layers=(),
                        encoding_fc_layers=(),
                        use_rnns=False,
                        use_icm=False,
                        learning_rate=5e-5,
                        algorithm_class=ActorCriticAlgorithm,
                        loss_class=ActorCriticLoss,
                        debug_summaries=False):
    """Create a simple ActorCriticAlgorithm.

    Args:
        env (TFEnvironment): A TFEnvironment.
        actor_fc_layers (list[int]): FC layer sizes for the actor network.
        value_fc_layers (list[int]): FC layer sizes for the value network.
        encoding_conv_layers (list[int]): convolution layer parameters for
            the encoding network.
        encoding_fc_layers (list[int]): FC layer sizes for the encoding
            network.
        use_rnns (bool): True if RNNs should be used.
        use_icm (bool): True if an intrinsic curiosity module should be used.
        learning_rate (float): learning rate.
        algorithm_class (type): class of the algorithm. Can be
            ActorCriticAlgorithm or PPOAlgorithm.
        loss_class (type): the class of the loss. The signature of its
            constructor: loss_class(action_spec, debug_summaries).
        debug_summaries (bool): True if debug summaries should be created.

    Returns:
        the created algorithm.
    """
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)

    if use_rnns:
        actor_net = ActorDistributionRnnNetwork(
            env.observation_spec(),
            env.action_spec(),
            input_fc_layer_params=actor_fc_layers,
            output_fc_layer_params=None)
        value_net = ValueRnnNetwork(
            env.observation_spec(),
            input_fc_layer_params=value_fc_layers,
            output_fc_layer_params=None)
    else:
        actor_net = ActorDistributionNetwork(
            env.observation_spec(),
            env.action_spec(),
            fc_layer_params=actor_fc_layers)
        value_net = ValueNetwork(
            env.observation_spec(), fc_layer_params=value_fc_layers)

    encoding_net = None
    if encoding_fc_layers or encoding_conv_layers:
        encoding_net = EncodingNetwork(
            input_tensor_spec=env.observation_spec(),
            conv_layer_params=encoding_conv_layers,
            fc_layer_params=encoding_fc_layers)

    icm = None
    if use_icm:
        feature_spec = env.observation_spec()
        if encoding_net:
            feature_spec = tf.TensorSpec(
                (encoding_fc_layers[-1], ), dtype=tf.float32)
        icm = ICMAlgorithm(
            env.action_spec(), feature_spec, encoding_net=encoding_net)

    algorithm = algorithm_class(
        action_spec=env.action_spec(),
        actor_network=actor_net,
        value_network=value_net,
        intrinsic_curiosity_module=icm,
        loss_class=loss_class,
        optimizer=optimizer,
        debug_summaries=debug_summaries)
    return algorithm
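# Hedged usage sketch (not part of the original file): assumes the caller
# already has a constructed TFEnvironment `env`; the helper name and the
# chosen layer sizes below are illustrative, not prescribed by this module.
def _example_create_icm_ac_algorithm(env):
    # Build an actor-critic algorithm whose ICM intrinsic reward operates on
    # features produced by a small FC encoding network.
    return create_ac_algorithm(
        env,
        actor_fc_layers=(200, 100),
        value_fc_layers=(200, 100),
        encoding_fc_layers=(256, ),
        use_icm=True,
        learning_rate=5e-5,
        debug_summaries=True)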