def _compute_loss(self, observations, actions, rewards, observations_n, dones):
    '''Compute the actor and critic losses without applying any updates.'''
    # Pack the batch into the placeholder structure and flatten it into a
    # feed dict for the underlying TfFunction.
    feed_dict = self.placeholders._make(
        [observations, actions, rewards, observations_n, dones])
    feed_dict = flatten_map(feed_dict._asdict())
    return unflatten_map(self._compute_loss_h(feed_dict))
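# flatten_map / unflatten_map are helpers from this codebase. A minimal sketch
# of plausible behaviour, assuming they round-trip a nested mapping to and from
# a flat one (the real implementation and key format may differ):
#
#     nested = {'actor': {'agent_0': 0.1}, 'critic': {'agent_0': 0.4}}
#     flat = flatten_map(nested)   # e.g. {'actor/agent_0': 0.1, 'critic/agent_0': 0.4}
#     assert unflatten_map(flat) == nested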
def __init__(self, observation_space, action_space, shared_policy=False,
             shared_critic=False, normalize=None):
    super().__init__(observation_space, action_space)
    observation_spaces = self.observation_space.spaces
    action_spaces = self.action_space.spaces
    self.policy = SharedOptions(shared_policy, shared_critic)
    with self.graph.as_default():
        matd3_module = MaTD3Module(observation_spaces, action_spaces,
                                   shared_policy, shared_critic,
                                   normalize=normalize)
        functions = matd3_module(*self.placeholders)
        # Per-agent action and value predictions from the current networks.
        self._predicts_h = TfFunction(self.session,
                                      inputs=self.placeholders.observations,
                                      outputs=functions.actor_predict)
        self._compute_values_h = TfFunction(
            self.session,
            inputs=self.placeholders.observations,
            outputs=functions.critic_predict)
        # Target-network update operations for actors and critics.
        updates = [functions.actor_update, functions.critic_update]
        self._update_targets = TfFunction(self.session, updates=updates)
        optimizers = [functions.actor_optimizer, functions.critic_optimizer]
        inputs = flatten_map(self.placeholders._asdict())
        outputs = flatten_map({
            'actor': functions.actor_losses,
            'critic': functions.critic_losses
        })
        # Joint actor/critic training step, critic-only training step (used
        # by the delayed policy updates), and loss evaluation without updates.
        self._train = TfFunction(self.session, inputs=inputs,
                                 updates=optimizers, outputs=outputs)
        self._train_critic = TfFunction(self.session, inputs=inputs,
                                        updates=[functions.critic_optimizer],
                                        outputs=functions.critic_losses)
        self._compute_loss_h = TfFunction(self.session, inputs=inputs,
                                          outputs=outputs)
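# A minimal construction sketch for the agent above, assuming the multi-agent
# observation and action spaces are gym Tuple spaces with one Box per agent.
# The class name MaTD3Agent and the space shapes are assumptions used only for
# illustration.
#
#     from gym.spaces import Box, Tuple
#
#     n_agents = 2
#     observation_space = Tuple([Box(-1.0, 1.0, shape=(8,)) for _ in range(n_agents)])
#     action_space = Tuple([Box(-1.0, 1.0, shape=(2,)) for _ in range(n_agents)])
#     agent = MaTD3Agent(observation_space, action_space,
#                        shared_policy=True, shared_critic=False)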
def __init__(self, observation_space, action_space, shared_policy=False,
             shared_critic=False, hyperparameters=None):
    super().__init__(observation_space, action_space)
    hyperparameters = {} if hyperparameters is None else hyperparameters
    observation_spaces = self.observation_space.spaces
    action_spaces = self.action_space.spaces
    self.policy = SharedOptions(shared_policy, shared_critic)
    with self.graph.as_default():
        maddpg_module = MaddpgModule(
            observation_spaces, action_spaces, shared_policy, shared_critic,
            hyperparameters=hyperparameters.get('agent_parameters'))
        functions = maddpg_module(*self.placeholders,
                                  gamma=hyperparameters.get('gamma', 0.95))
        self._predicts_h = TfFunction(self.session,
                                      inputs=self.placeholders.observations,
                                      outputs=functions.actor_predict)
        self._compute_values_h = TfFunction(
            self.session,
            inputs=self.placeholders.observations,
            outputs=functions.critic_predict)
        # Target-network update operations for actors and critics.
        updates = [functions.actor_update, functions.critic_update]
        self._update_targets = TfFunction(self.session, updates=updates)
        optimizers = [functions.actor_optimizer, functions.critic_optimizer]
        inputs = flatten_map(self.placeholders._asdict())
        outputs = flatten_map({
            'actor': functions.actor_losses,
            'critic': functions.critic_losses
        })
        self._train = TfFunction(self.session, inputs=inputs,
                                 updates=optimizers, outputs=outputs)
        self._compute_loss_h = TfFunction(self.session, inputs=inputs,
                                          outputs=outputs)
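# The constructor above reads only two keys from `hyperparameters`: 'gamma'
# (discount factor, default 0.95) and 'agent_parameters' (forwarded to
# MaddpgModule). A hedged example; the class name MaddpgAgent and the contents
# of 'agent_parameters' are assumptions, since the module's accepted keys are
# not shown here.
#
#     agent = MaddpgAgent(observation_space, action_space,
#                         hyperparameters={'gamma': 0.99,
#                                          'agent_parameters': {'learning_rate': 1e-3}})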
def _train_step(self, observations, actions, rewards, observations_n, dones,
                step=None):
    '''Train the agents.'''
    feed_dict = self.placeholders._make(
        [observations, actions, rewards, observations_n, dones])
    feed_dict = flatten_map(feed_dict._asdict())
    return unflatten_map(self._train(feed_dict))
def _train_step(self, observations, actions, rewards, observations_n, dones,
                step=None):
    '''Train the agents with TD3-style delayed policy updates.'''
    feed_dict = self.placeholders._make(
        [observations, actions, rewards, observations_n, dones])
    feed_dict = flatten_map(feed_dict._asdict())
    # The critic is optimized on every call; the actor is only optimized on
    # every second step (delayed policy update).
    if step and step % 2 == 0:
        results = unflatten_map(self._train(feed_dict))
    else:
        results = unflatten_map(self._train_critic(feed_dict))
    return results
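# A sketch of an outer loop driving the delayed update above: the critic is
# trained on every call, the actor only on even steps, and the target networks
# are refreshed periodically via _update_targets. The replay-buffer API, batch
# layout, and loop constants are assumptions used only for illustration.
#
#     for step in range(num_train_steps):
#         observations, actions, rewards, observations_n, dones = buffer.sample(batch_size)
#         agent._train_step(observations, actions, rewards,
#                           observations_n, dones, step=step)
#         if step % target_update_interval == 0:
#             agent._update_targets()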