def test_single_non_terminal_sequence(self):
    gae = GeneralizedAdvantageEstimation(gae_lambda=self.gae_lambda, discount=self.gamma)
    test = ComponentTest(component=gae, input_spaces=self.input_spaces)

    rewards_ = self.rewards.sample(10, fill_value=0.5)
    baseline_values_ = self.baseline_values.sample(10, fill_value=1.0)
    terminals_ = self.terminals.sample(size=10, fill_value=False)

    # All sequence indices stay False here: the whole batch is one single, non-terminal sequence.
    sequence_indices = [False] * 10
    # Assume sequence indices = terminals here.
    input_ = [baseline_values_, rewards_, terminals_, sequence_indices]

    advantage_expected = self.gae_helper(
        baseline=baseline_values_,
        reward=rewards_,
        gamma=self.gamma,
        gae_lambda=self.gae_lambda,
        terminals=terminals_,
        sequence_indices=sequence_indices
    )

    advantage = test.test(("calc_gae_values", input_))
    recursive_assert_almost_equal(advantage_expected, advantage, decimals=5)
    print("Expected advantage:", advantage_expected)
    print("Got advantage:", advantage)

    test.terminate()
def test_multiple_sequences(self):
    gae = GeneralizedAdvantageEstimation(gae_lambda=self.gae_lambda, discount=self.gamma)
    test = ComponentTest(component=gae, input_spaces=self.input_spaces)

    rewards_ = self.rewards.sample(10, fill_value=0.5)
    baseline_values_ = self.baseline_values.sample(10, fill_value=1.0)
    terminals_ = [False] * 10
    terminals_[5] = True
    sequence_indices = [False] * 10
    sequence_indices[5] = True
    terminals_ = np.asarray(terminals_)

    input_ = [baseline_values_, rewards_, terminals_, sequence_indices]
    advantage_expected = self.gae_helper(
        baseline=baseline_values_,
        reward=rewards_,
        gamma=self.gamma,
        gae_lambda=self.gae_lambda,
        terminals=terminals_,
        sequence_indices=sequence_indices
    )

    print("Advantage expected:", advantage_expected)
    advantage = test.test(("calc_gae_values", input_))
    print("Got advantage = ", advantage)
    recursive_assert_almost_equal(advantage_expected, advantage, decimals=5)

    test.terminate()
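# Illustrative sketch (not part of the library): a pure-NumPy GAE reference similar in spirit to the
# `self.gae_helper` used by the tests above. The name `reference_gae` and the exact boundary convention
# (a cut, non-terminal sequence bootstraps with its own last baseline value) are assumptions inferred
# from the manual-numbers test further below, not confirmed library behavior.
import numpy as np


def reference_gae(baseline, rewards, terminals, sequence_indices, gamma, gae_lambda):
    """Compute GAE advantages with a plain backward pass over one flat batch."""
    n = len(rewards)
    deltas = np.zeros(n)
    for t in range(n):
        if terminals[t]:
            v_next = 0.0                 # True episode end: no bootstrapping.
        elif sequence_indices[t] or t == n - 1:
            v_next = baseline[t]         # Sequence cut (assumed): bootstrap with own baseline value.
        else:
            v_next = baseline[t + 1]
        deltas[t] = rewards[t] + gamma * v_next - baseline[t]

    advantages = np.zeros(n)
    last_gae = 0.0
    for t in reversed(range(n)):
        if terminals[t] or sequence_indices[t]:
            last_gae = 0.0               # Do not accumulate across sequence/episode boundaries.
        last_gae = deltas[t] + gamma * gae_lambda * last_gae
        advantages[t] = last_gae
    return advantages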
def test_with_manual_numbers_and_lambda_0_5(self):
    lambda_ = 0.5
    lg = lambda_ * self.gamma
    gae = GeneralizedAdvantageEstimation(gae_lambda=lambda_, discount=self.gamma)
    test = ComponentTest(component=gae, input_spaces=self.input_spaces)

    # Single sequence of 3 time steps.
    rewards_ = np.array([0.1, 0.2, 0.3])
    baseline_values_ = np.array([1.0, 2.0, 3.0])
    terminals_ = np.array([False, False, False])
    # Final sequence index must always be true.
    sequence_indices = np.array([False, False, True])
    input_ = [baseline_values_, rewards_, terminals_, sequence_indices]

    # Test TD-error outputs.
    td = np.array([1.08, 1.17, 0.27])
    test.test(("calc_td_errors", input_), expected_outputs=td, decimals=5)

    expected_gaes_manual = np.array([
        td[0] + lg * td[1] + lg * lg * td[2],
        td[1] + lg * td[2],
        td[2]
    ])
    expected_gaes_helper = self.gae_helper(
        baseline_values_, rewards_, self.gamma, lambda_, terminals_, sequence_indices
    )
    recursive_assert_almost_equal(expected_gaes_manual, expected_gaes_helper, decimals=5)

    advantages = test.test(("calc_gae_values", input_), expected_outputs=expected_gaes_manual)
    print("Rewards:", rewards_)
    print("Baseline-values:", baseline_values_)
    print("Terminals:", terminals_)
    print("Expected advantage:", expected_gaes_manual)
    print("Got advantage:", advantages)

    test.terminate()
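# Worked check of the expected TD errors in the test above (gamma = 0.99). The third value only comes out
# to 0.27 if the cut (sequence_indices[-1] == True) but non-terminal sequence bootstraps with its own final
# baseline value of 3.0 -- an assumption about the component's boundary convention, made explicit here.
gamma = 0.99
assert abs((0.1 + gamma * 2.0 - 1.0) - 1.08) < 1e-6  # delta_0 = r_0 + gamma * V_1 - V_0
assert abs((0.2 + gamma * 3.0 - 2.0) - 1.17) < 1e-6  # delta_1 = r_1 + gamma * V_2 - V_1
assert abs((0.3 + gamma * 3.0 - 3.0) - 0.27) < 1e-6  # delta_2 = r_2 + gamma * V_2 - V_2 (bootstrap with V_2)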
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None,
             execution_spec=None, optimizer_spec=None, value_function_optimizer_spec=None,
             observe_spec=None, update_spec=None, summary_spec=None, saver_spec=None,
             auto_build=True, name="actor-critic-agent", gae_lambda=1.0, clip_rewards=0.0,
             sample_episodes=False, weight_entropy=None, memory_spec=None):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
            object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
            of ValueFunction.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
            spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
            graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clip value. If not 0, rewards will be clipped into this range.
        sample_episodes (bool): If true, the update method interprets the batch_size as the number of episodes to
            fetch from the memory. If false, batch_size will refer to the number of time-steps. This is especially
            relevant for environments where episode lengths may vastly differ throughout training. For example,
            in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    # Set policy to stochastic.
    if policy_spec is not None:
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(ActorCriticAgent, self).__init__(
        state_space=state_space,
        action_space=action_space,
        discount=discount,
        preprocessing_spec=preprocessing_spec,
        network_spec=network_spec,
        internal_states_space=internal_states_space,
        policy_spec=policy_spec,
        value_function_spec=value_function_spec,
        execution_spec=execution_spec,
        optimizer_spec=optimizer_spec,
        value_function_optimizer_spec=value_function_optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        summary_spec=summary_spec,
        saver_spec=saver_spec,
        name=name,
        auto_build=auto_build
    )
    self.sample_episodes = sample_episodes

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True)
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = ActorCriticLossFunction(weight_entropy=weight_entropy)

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.gae_function
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"], build_options=self.build_options
        )
        self.graph_built = True
def __init__(self, gae_lambda=1.0, clip_rewards=0.0, sample_episodes=False, weight_entropy=None,
             memory_spec=None, **kwargs):
    """
    Args:
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clip value. If not 0, rewards will be clipped into this range.
        sample_episodes (bool): If true, the update method interprets the batch_size as the number of episodes to
            fetch from the memory. If false, batch_size will refer to the number of time-steps. This is especially
            relevant for environments where episode lengths may vastly differ throughout training. For example,
            in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    # Set policy to stochastic.
    if "policy_spec" in kwargs:
        policy_spec = kwargs.pop("policy_spec")
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(ActorCriticAgent, self).__init__(
        policy_spec=policy_spec,
        name=kwargs.pop("name", "actor-critic-agent"),
        **kwargs
    )
    self.sample_episodes = sample_episodes

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True)
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = ActorCriticLossFunction(weight_entropy=weight_entropy)

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.gae_function
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"], build_options=self.build_options
        )
        self.graph_built = True
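# Illustrative construction sketch for the kwargs-based actor-critic constructor above. Import paths and
# Space classes follow rlgraph conventions; the concrete shapes, hyper-parameter values and the
# "ring_buffer" memory-spec type key are assumptions for the example, not recommended settings.
from rlgraph.agents import ActorCriticAgent
from rlgraph.spaces import FloatBox, IntBox

agent = ActorCriticAgent(
    state_space=FloatBox(shape=(4,)),                      # e.g. a CartPole-like observation vector
    action_space=IntBox(2),                                # two discrete actions
    gae_lambda=0.95,
    sample_episodes=True,                                  # update over whole episodes from the ring buffer
    memory_spec=dict(type="ring_buffer", capacity=1000),   # assumed spec format; must resolve to a RingBuffer
)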
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None,
             execution_spec=None, optimizer_spec=None, value_function_optimizer_spec=None,
             observe_spec=None, update_spec=None, summary_spec=None, saver_spec=None,
             auto_build=True, name="ppo-agent", clip_ratio=0.2, gae_lambda=1.0, clip_rewards=0.0,
             value_function_clipping=None, standardize_advantages=False, sample_episodes=True,
             weight_entropy=None, memory_spec=None):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
            object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
            of ValueFunction.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
            spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
            graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
        clip_ratio (float): Clipping parameter for likelihood ratio.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a
            +/- `clip_rewards` range.
        value_function_clipping (Optional[float]): If not None, uses clipped value function objective. If None,
            uses simple value function objective.
        standardize_advantages (bool): If True, standardize advantage values in update.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes to
            fetch from the memory. If False, batch_size will refer to the number of time-steps. This is especially
            relevant for environments where episode lengths may vastly differ throughout training. For example,
            in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    # Set policy to stochastic.
    if policy_spec is not None:
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(PPOAgent, self).__init__(
        state_space=state_space,
        action_space=action_space,
        discount=discount,
        preprocessing_spec=preprocessing_spec,
        network_spec=network_spec,
        internal_states_space=internal_states_space,
        policy_spec=policy_spec,
        value_function_spec=value_function_spec,
        execution_spec=execution_spec,
        optimizer_spec=optimizer_spec,
        value_function_optimizer_spec=value_function_optimizer_spec,
        observe_spec=observe_spec,
        update_spec=update_spec,
        summary_spec=summary_spec,
        saver_spec=saver_spec,
        name=name,
        auto_build=auto_build
    )
    self.sample_episodes = sample_episodes

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:policy",
        value_function_weights="variables:value-function",
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True),
        apply_postprocessing=bool
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: PPO memory must be ring-buffer for episode-handling!"
    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!".\
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    self.standardize_advantages = standardize_advantages
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = PPOLossFunction(
        clip_ratio=clip_ratio, value_function_clipping=value_function_clipping, weight_entropy=weight_entropy
    )

    self.iterations = self.update_spec["num_iterations"]
    self.sample_size = self.update_spec["sample_size"]
    self.batch_size = self.update_spec["batch_size"]

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.policy, self.exploration,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.vars_merger, self.vars_splitter, self.gae_function
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            # Important: Use sample-size, not batch-size as the sub-samples (from a batch) are the ones that get
            # multi-gpu-split.
            batch_size=self.update_spec["sample_size"], build_options=self.build_options
        )
        self.graph_built = True
def __init__(self, clip_ratio=0.2, gae_lambda=1.0, clip_rewards=0.0, standardize_advantages=False,
             sample_episodes=True, weight_entropy=None, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): Clipping parameter for likelihood ratio.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clip value. If not 0, rewards will be clipped into this range.
        standardize_advantages (bool): If true, standardize advantage values in update.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes to
            fetch from the memory. If False, batch_size will refer to the number of time-steps. This is especially
            relevant for environments where episode lengths may vastly differ throughout training. For example,
            in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    # Set policy to stochastic.
    if "policy_spec" in kwargs:
        policy_spec = kwargs.pop("policy_spec")
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(PPOAgent, self).__init__(
        policy_spec=policy_spec,
        name=kwargs.pop("name", "ppo-agent"),
        **kwargs
    )
    self.sample_episodes = sample_episodes

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:policy",
        value_function_weights="variables:value-function",
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True),
        apply_postprocessing=bool
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: PPO memory must be ring-buffer for episode-handling!"
    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!".\
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = PPOLossFunction(
        clip_ratio=clip_ratio, standardize_advantages=standardize_advantages, weight_entropy=weight_entropy
    )

    self.iterations = self.update_spec["num_iterations"]
    self.sample_size = self.update_spec["sample_size"]
    self.batch_size = self.update_spec["batch_size"]

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy, self.exploration,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.vars_merger, self.vars_splitter, self.gae_function
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            # Important: Use sample-size, not batch-size as the sub-samples (from a batch) are the ones that get
            # multi-gpu-split.
            batch_size=self.update_spec["sample_size"], build_options=self.build_options
        )
        self.graph_built = True
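# Illustrative construction sketch for the PPO constructors above. As before, shapes, hyper-parameter values,
# the "ring_buffer" type key and the shown `update_spec` values are assumptions for the example; the
# `update_spec` keys num_iterations / sample_size / batch_size are the ones read by the constructor.
from rlgraph.agents import PPOAgent
from rlgraph.spaces import FloatBox, IntBox

ppo_agent = PPOAgent(
    state_space=FloatBox(shape=(4,)),
    action_space=IntBox(2),
    clip_ratio=0.2,
    gae_lambda=0.95,
    standardize_advantages=True,
    memory_spec=dict(type="ring_buffer", capacity=2048),
    update_spec=dict(num_iterations=10, sample_size=128, batch_size=1024),
)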
class PPOLossFunction(LossFunction):
    """
    Loss function for proximal policy optimization:

    https://arxiv.org/abs/1707.06347
    """
    def __init__(self, discount=0.99, gae_lambda=1.0, clip_ratio=0.2, standardize_advantages=False,
                 weight_entropy=None, scope="ppo-loss-function", **kwargs):
        """
        Args:
            discount (float): The discount factor (gamma) to use.
            gae_lambda (float): Optional GAE discount factor.
            clip_ratio (float): How much to clip the likelihood ratio between old and new policy when updating.
            standardize_advantages (bool): If true, normalize advantage values in update.
            **kwargs:
        """
        self.clip_ratio = clip_ratio
        self.standardize_advantages = standardize_advantages
        self.weight_entropy = weight_entropy if weight_entropy is not None else 0.00025

        super(PPOLossFunction, self).__init__(scope=scope, **kwargs)

        self.gae_function = GeneralizedAdvantageEstimation(gae_lambda=gae_lambda, discount=discount)
        self.add_components(self.gae_function)

    @rlgraph_api
    def loss(self, log_probs, baseline_values, actions, rewards, terminals, sequence_indices, logits):
        """
        API-method that calculates the total loss (average over per-batch-item loss) from the original input to
        per-item-loss.

        Args:
            see `self._graph_fn_loss_per_item`.

        Returns:
            Total loss, loss per item, total baseline loss, baseline loss per item.
        """
        loss_per_item, baseline_loss_per_item = self.loss_per_item(
            log_probs, baseline_values, actions, rewards, terminals, sequence_indices, logits
        )
        total_loss = self.loss_average(loss_per_item)
        total_baseline_loss = self.loss_average(baseline_loss_per_item)
        return total_loss, loss_per_item, total_baseline_loss, baseline_loss_per_item

    @rlgraph_api
    def _graph_fn_loss_per_item(self, log_probs, baseline_values, actions, rewards, terminals,
                                sequence_indices, entropy):
        """
        Args:
            log_probs (SingleDataOp): Log-likelihoods of actions under the policy.
            baseline_values (SingleDataOp): The baseline (value-function) estimates for the batch of states.
            actions (SingleDataOp): The batch of actions that were actually taken in states s (from a memory).
            rewards (SingleDataOp): The batch of rewards that we received after having taken a in s (from a memory).
            terminals (SingleDataOp): The batch of terminal signals that we received after having taken a in s
                (from a memory).
            sequence_indices (DataOp): Indices denoting sequences (which may be non-terminal episode fragments
                from multiple environments).
            entropy (SingleDataOp): Policy entropy.

        Returns:
            SingleDataOp: The loss values vector (one single value for each batch item).
        """
        if get_backend() == "tf":
            # N.b.: Many implementations do the following:
            # Sample action -> return policy log probs with action -> feed both back in from memory/via placeholders.
            # This creates the same effect as just stopping the gradients on the log-probs.
            prev_log_probs = tf.stop_gradient(log_probs)
            baseline_values = tf.squeeze(input=baseline_values, axis=-1)

            # Compute advantages.
            pg_advantages = self.gae_function.calc_gae_values(baseline_values, rewards, terminals, sequence_indices)
            if self.standardize_advantages:
                mean, std = tf.nn.moments(x=pg_advantages, axes=[0])
                pg_advantages = (pg_advantages - mean) / std

            v_targets = pg_advantages + baseline_values
            v_targets = tf.stop_gradient(input=v_targets)

            # Likelihood ratio and clipped objective.
            ratio = tf.exp(x=log_probs - prev_log_probs)
            clipped_advantages = tf.where(
                condition=pg_advantages > 0,
                x=(1 + self.clip_ratio) * pg_advantages,
                y=(1 - self.clip_ratio) * pg_advantages
            )

            loss = -tf.minimum(x=ratio * pg_advantages, y=clipped_advantages)
            loss += self.weight_entropy * entropy

            baseline_loss = (v_targets - baseline_values) ** 2

            return loss, baseline_loss

        elif get_backend() == "pytorch":
            # Detach grads.
            prev_log_probs = log_probs.detach()
            baseline_values = torch.squeeze(baseline_values, dim=-1)

            # Compute advantages.
            pg_advantages = self.gae_function.calc_gae_values(baseline_values, rewards, terminals, sequence_indices)
            if self.standardize_advantages:
                pg_advantages = (pg_advantages - torch.mean(pg_advantages)) / torch.std(pg_advantages)

            v_targets = pg_advantages + baseline_values
            v_targets = v_targets.detach()

            # Likelihood ratio and clipped objective.
            ratio = torch.exp(log_probs - prev_log_probs)
            clipped_advantages = torch.where(
                pg_advantages > 0,
                (1 + self.clip_ratio) * pg_advantages,
                (1 - self.clip_ratio) * pg_advantages
            )

            loss = -torch.min(ratio * pg_advantages, clipped_advantages)
            loss += self.weight_entropy * entropy

            baseline_loss = (v_targets - baseline_values) ** 2

            return loss, baseline_loss
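# Side note (sketch, not library code): the `where`-based clipping in the loss above is algebraically
# equivalent to the textbook PPO surrogate -min(r * A, clip(r, 1 - eps, 1 + eps) * A), because for A > 0
# only the upper clip bound can become active and for A < 0 only the lower one. A quick NumPy check:
import numpy as np

eps = 0.2
rng = np.random.default_rng(0)
ratio = np.exp(rng.normal(size=1000))   # likelihood ratios are always positive
adv = rng.normal(size=1000)             # advantages of either sign

where_form = -np.minimum(ratio * adv, np.where(adv > 0, (1 + eps) * adv, (1 - eps) * adv))
clip_form = -np.minimum(ratio * adv, np.clip(ratio, 1 - eps, 1 + eps) * adv)
assert np.allclose(where_form, clip_form)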