def test_single_non_terminal_sequence(self):
        gae = GeneralizedAdvantageEstimation(gae_lambda=self.gae_lambda, discount=self.gamma)

        test = ComponentTest(component=gae, input_spaces=self.input_spaces)

        rewards_ = self.rewards.sample(10, fill_value=0.5)
        baseline_values_ = self.baseline_values.sample(10, fill_value=1.0)
        terminals_ = self.terminals.sample(size=10, fill_value=False)

        # Final sequence index must always be true.
        sequence_indices = [False] * 9 + [True]
        input_ = [baseline_values_, rewards_, terminals_, sequence_indices]

        advantage_expected = self.gae_helper(
            baseline=baseline_values_,
            reward=rewards_,
            gamma=self.gamma,
            gae_lambda=self.gae_lambda,
            terminals=terminals_,
            sequence_indices=sequence_indices
        )

        advantage = test.test(("calc_gae_values", input_))
        recursive_assert_almost_equal(advantage_expected, advantage, decimals=5)
        print("Expected advantage:", advantage_expected)
        print("Got advantage:", advantage)

        test.terminate()

    def test_multiple_sequences(self):
        gae = GeneralizedAdvantageEstimation(gae_lambda=self.gae_lambda, discount=self.gamma)

        test = ComponentTest(component=gae, input_spaces=self.input_spaces)

        rewards_ = self.rewards.sample(10, fill_value=0.5)
        baseline_values_ = self.baseline_values.sample(10, fill_value=1.0)
        terminals_ = [False] * 10
        terminals_[5] = True
        sequence_indices = [False] * 10
        sequence_indices[5] = True
        terminals_ = np.asarray(terminals_)

        input_ = [baseline_values_, rewards_, terminals_, sequence_indices]
        advantage_expected = self.gae_helper(
            baseline=baseline_values_,
            reward=rewards_,
            gamma=self.gamma,
            gae_lambda=self.gae_lambda,
            terminals=terminals_,
            sequence_indices=sequence_indices
        )

        print("Advantage expected:", advantage_expected)
        advantage = test.test(("calc_gae_values", input_))
        print("Got advantage = ", advantage)
        recursive_assert_almost_equal(advantage_expected, advantage, decimals=5)

        test.terminate()
Example #4
    def test_with_manual_numbers_and_lambda_0_5(self):
        lambda_ = 0.5
        lg = lambda_ * self.gamma
        gae = GeneralizedAdvantageEstimation(gae_lambda=lambda_, discount=self.gamma)

        test = ComponentTest(component=gae, input_spaces=self.input_spaces)

        # Single sequence of 3 time-steps (only the final sequence index is True).
        rewards_ = np.array([0.1, 0.2, 0.3])
        baseline_values_ = np.array([1.0, 2.0, 3.0])
        terminals_ = np.array([False, False, False])

        # Final sequence index must always be true.
        sequence_indices = np.array([False, False, True])
        input_ = [baseline_values_, rewards_, terminals_, sequence_indices]

        # Test TD-error outputs.
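        # One-step TD errors: td[t] = r[t] + gamma * V[t+1] - V[t]. Because the final
        # sequence index is True, the last step bootstraps from its own baseline value.
        # (The hard-coded numbers below correspond to self.gamma == 0.99.)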
        td = np.array([1.08, 1.17, 0.27])
        test.test(("calc_td_errors", input_), expected_outputs=td, decimals=5)

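        # The GAE values are the (lambda * gamma)-discounted sums of future TD errors
        # within the sequence, written out manually here for comparison.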
        expected_gaes_manual = np.array([
            td[0] + lg * td[1] + lg * lg * td[2],
            td[1] + lg * td[2],
            td[2]
        ])
        expected_gaes_helper = self.gae_helper(
            baseline_values_, rewards_, self.gamma, lambda_, terminals_, sequence_indices
        )
        recursive_assert_almost_equal(expected_gaes_manual, expected_gaes_helper, decimals=5)
        advantages = test.test(("calc_gae_values", input_), expected_outputs=expected_gaes_manual)

        print("Rewards:", rewards_)
        print("Baseline-values:", baseline_values_)
        print("Terminals:", terminals_)
        print("Expected advantage:", expected_gaes_manual)
        print("Got advantage:", advantages)

        test.terminate()
Example #5
    def __init__(self,
                 state_space,
                 action_space,
                 discount=0.98,
                 preprocessing_spec=None,
                 network_spec=None,
                 internal_states_space=None,
                 policy_spec=None,
                 value_function_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 value_function_optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="actor-critic-agent",
                 gae_lambda=1.0,
                 clip_rewards=0.0,
                 sample_episodes=False,
                 weight_entropy=None,
                 memory_spec=None):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.
            discount (float): The discount factor (gamma).
            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.
            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.
            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
            value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
                of ValueFunction.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
            value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
                spec for the policy is used (same learning rate and optimizer type).
            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.
            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.
            name (str): Some name for this Agent object.
            gae_lambda (float): Lambda for generalized advantage estimation.
            clip_rewards (float): Reward clip value. If not 0, rewards will be clipped within a +/- `clip_rewards`
                range.
            sample_episodes (bool): If true, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If false, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        # Set policy to stochastic.
        if policy_spec is not None:
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(ActorCriticAgent, self).__init__(
            state_space=state_space,
            action_space=action_space,
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            internal_states_space=internal_states_space,
            policy_spec=policy_spec,
            value_function_spec=value_function_spec,
            execution_spec=execution_spec,
            optimizer_spec=optimizer_spec,
            value_function_optimizer_spec=value_function_optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            summary_spec=summary_spec,
            saver_spec=saver_spec,
            name=name,
            auto_build=auto_build)
        self.sample_episodes = sample_episodes

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(actions=self.action_space.with_batch_rank(),
                 policy_weights="variables:{}".format(self.policy.scope),
                 deterministic=bool,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 sequence_indices=BoolBox(add_batch_rank=True)))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(self.memory, RingBuffer), \
            "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards",
                                          "terminals")

        self.gae_function = GeneralizedAdvantageEstimation(
            gae_lambda=gae_lambda,
            discount=self.discount,
            clip_rewards=clip_rewards)
        self.loss_function = ActorCriticLossFunction(
            weight_entropy=weight_entropy)

        # Add all our sub-components to the core.
        sub_components = [
            self.preprocessor, self.merger, self.memory, self.splitter,
            self.policy, self.loss_function, self.optimizer,
            self.value_function, self.value_function_optimizer,
            self.gae_function
        ]
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph([self.root_component],
                              self.input_spaces,
                              optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"],
                              build_options=self.build_options)

            self.graph_built = True

    def __init__(self, gae_lambda=1.0, clip_rewards=0.0, sample_episodes=False,
                 weight_entropy=None, memory_spec=None, **kwargs):
        """
        Args:
            gae_lambda (float): Lambda for generalized advantage estimation.
            clip_rewards (float): Reward clip value. If not 0, rewards will be clipped within a +/- `clip_rewards`
                range.
            sample_episodes (bool): If true, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If false, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        # Set policy to stochastic.
        if "policy_spec" in kwargs:
            policy_spec = kwargs.pop("policy_spec")
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(ActorCriticAgent, self).__init__(
            policy_spec=policy_spec,
            name=kwargs.pop("name", "actor-critic-agent"), **kwargs
        )
        self.sample_episodes = sample_episodes

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(dict(
            actions=self.action_space.with_batch_rank(),
            policy_weights="variables:{}".format(self.policy.scope),
            deterministic=bool,
            preprocessed_states=preprocessed_state_space,
            rewards=reward_space,
            terminals=terminal_space,
            sequence_indices=BoolBox(add_batch_rank=True)
        ))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards", "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(self.memory, RingBuffer), \
            "ERROR: Actor-critic memory must be ring-buffer for episode-handling."
        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")

        self.gae_function = GeneralizedAdvantageEstimation(gae_lambda=gae_lambda, discount=self.discount,
                                                           clip_rewards=clip_rewards)
        self.loss_function = ActorCriticLossFunction(weight_entropy=weight_entropy)

        # Add all our sub-components to the core.
        sub_components = [self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
                          self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
                          self.gae_function]
        self.root_component.add_components(*sub_components)

        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph([self.root_component], self.input_spaces, optimizer=self.optimizer,
                              batch_size=self.update_spec["batch_size"],
                              build_options=self.build_options)

            self.graph_built = True
Example #7
    def __init__(self,
                 state_space,
                 action_space,
                 discount=0.98,
                 preprocessing_spec=None,
                 network_spec=None,
                 internal_states_space=None,
                 policy_spec=None,
                 value_function_spec=None,
                 execution_spec=None,
                 optimizer_spec=None,
                 value_function_optimizer_spec=None,
                 observe_spec=None,
                 update_spec=None,
                 summary_spec=None,
                 saver_spec=None,
                 auto_build=True,
                 name="ppo-agent",
                 clip_ratio=0.2,
                 gae_lambda=1.0,
                 clip_rewards=0.0,
                 value_function_clipping=None,
                 standardize_advantages=False,
                 sample_episodes=True,
                 weight_entropy=None,
                 memory_spec=None):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.

            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.

            discount (float): The discount factor (gamma).

            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
                object itself.

            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
                Space object for the Space(s) of the internal (RNN) states.

            policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.

            value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance
                of ValueFunction.

            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.

            value_function_optimizer_spec (dict): Optimizer config for value function optimizer. If None, the optimizer
                spec for the policy is used (same learning rate and optimizer type).

            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.

            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If false, users must separately call agent.build(). Useful for debugging or analyzing
                components before building.

            name (str): Some name for this Agent object.
            clip_ratio (float): Clipping parameter for likelihood ratio.
            gae_lambda (float): Lambda for generalized advantage estimation.

            clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a +/- `clip_rewards`
                range.

            value_function_clipping (Optional[float]): If not None, uses clipped value function objective. If None,
                uses simple value function objective.

            standardize_advantages (bool): If true, standardize advantage values in update.

            sample_episodes (bool): If True, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If False, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.

            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).

            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        if policy_spec is not None:
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(PPOAgent, self).__init__(
            state_space=state_space,
            action_space=action_space,
            discount=discount,
            preprocessing_spec=preprocessing_spec,
            network_spec=network_spec,
            internal_states_space=internal_states_space,
            policy_spec=policy_spec,
            value_function_spec=value_function_spec,
            execution_spec=execution_spec,
            optimizer_spec=optimizer_spec,
            value_function_optimizer_spec=value_function_optimizer_spec,
            observe_spec=observe_spec,
            update_spec=update_spec,
            summary_spec=summary_spec,
            saver_spec=saver_spec,
            name=name,
            auto_build=auto_build)
        self.sample_episodes = sample_episodes

        # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
        # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
        # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(actions=self.action_space.with_batch_rank(),
                 policy_weights="variables:policy",
                 value_function_weights="variables:value-function",
                 deterministic=bool,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 sequence_indices=BoolBox(add_batch_rank=True),
                 apply_postprocessing=bool))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(
            self.memory, RingBuffer
        ), "ERROR: PPO memory must be ring-buffer for episode-handling!"

        # Make sure the python buffer is not larger than our memory capacity.
        assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
            "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
            format(self.observe_spec["buffer_size"], self.memory.capacity)

        # The splitter for splitting up the records coming from the memory.
        self.standardize_advantages = standardize_advantages
        self.gae_function = GeneralizedAdvantageEstimation(
            gae_lambda=gae_lambda,
            discount=self.discount,
            clip_rewards=clip_rewards)
        self.loss_function = PPOLossFunction(
            clip_ratio=clip_ratio,
            value_function_clipping=value_function_clipping,
            weight_entropy=weight_entropy)

        self.iterations = self.update_spec["num_iterations"]
        self.sample_size = self.update_spec["sample_size"]
        self.batch_size = self.update_spec["batch_size"]

        # Add all our sub-components to the core.
        self.root_component.add_components(
            self.preprocessor, self.merger, self.memory, self.policy,
            self.exploration, self.loss_function, self.optimizer,
            self.value_function, self.value_function_optimizer,
            self.vars_merger, self.vars_splitter, self.gae_function)
        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph(
                [self.root_component],
                self.input_spaces,
                optimizer=self.optimizer,
                # Important: Use sample-size, not batch-size as the sub-samples (from a batch) are the ones that get
                # multi-gpu-split.
                batch_size=self.update_spec["sample_size"],
                build_options=self.build_options)
            self.graph_built = True
Example #8
    def __init__(self,
                 clip_ratio=0.2,
                 gae_lambda=1.0,
                 clip_rewards=0.0,
                 standardize_advantages=False,
                 sample_episodes=True,
                 weight_entropy=None,
                 memory_spec=None,
                 **kwargs):
        """
        Args:
            clip_ratio (float): Clipping parameter for likelihood ratio.
            gae_lambda (float): Lambda for generalized advantage estimation.
            clip_rewards (float): Reward clip value. If not 0, rewards will be clipped within a +/- `clip_rewards`
                range.
            standardize_advantages (bool): If true, standardize advantage values in update.

            sample_episodes (bool): If True, the update method interprets the batch_size as the number of
                episodes to fetch from the memory. If False, batch_size will refer to the number of time-steps. This
                is especially relevant for environments where episode lengths may vastly differ throughout training. For
                example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.

            weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).

            memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be
                a ring-buffer.
        """
        if "policy_spec" in kwargs:
            policy_spec = kwargs.pop("policy_spec")
            policy_spec["deterministic"] = False
        else:
            policy_spec = dict(deterministic=False)
        super(PPOAgent, self).__init__(
            policy_spec=policy_spec,  # Set policy to stochastic.
            name=kwargs.pop("name", "ppo-agent"),
            **kwargs)
        self.sample_episodes = sample_episodes

        # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
        # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
        # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

        # Extend input Space definitions to this Agent's specific API-methods.
        preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
        reward_space = FloatBox(add_batch_rank=True)
        terminal_space = BoolBox(add_batch_rank=True)

        self.input_spaces.update(
            dict(actions=self.action_space.with_batch_rank(),
                 policy_weights="variables:policy",
                 value_function_weights="variables:value-function",
                 deterministic=bool,
                 preprocessed_states=preprocessed_state_space,
                 rewards=reward_space,
                 terminals=terminal_space,
                 sequence_indices=BoolBox(add_batch_rank=True),
                 apply_postprocessing=bool))

        # The merger to merge inputs into one record Dict going into the memory.
        self.merger = ContainerMerger("states", "actions", "rewards",
                                      "terminals")
        self.memory = Memory.from_spec(memory_spec)
        assert isinstance(
            self.memory, RingBuffer
        ), "ERROR: PPO memory must be ring-buffer for episode-handling!"

        # Make sure the python buffer is not larger than our memory capacity.
        assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
            "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
                format(self.observe_spec["buffer_size"], self.memory.capacity)

        # The splitter for splitting up the records coming from the memory.
        self.splitter = ContainerSplitter("states", "actions", "rewards",
                                          "terminals")
        self.gae_function = GeneralizedAdvantageEstimation(
            gae_lambda=gae_lambda,
            discount=self.discount,
            clip_rewards=clip_rewards)
        self.loss_function = PPOLossFunction(
            clip_ratio=clip_ratio,
            standardize_advantages=standardize_advantages,
            weight_entropy=weight_entropy)

        self.iterations = self.update_spec["num_iterations"]
        self.sample_size = self.update_spec["sample_size"]
        self.batch_size = self.update_spec["batch_size"]

        # Add all our sub-components to the core.
        self.root_component.add_components(
            self.preprocessor, self.merger, self.memory, self.splitter,
            self.policy, self.exploration, self.loss_function, self.optimizer,
            self.value_function, self.value_function_optimizer,
            self.vars_merger, self.vars_splitter, self.gae_function)
        # Define the Agent's (root-Component's) API.
        self.define_graph_api()
        self.build_options = dict(vf_optimizer=self.value_function_optimizer)

        if self.auto_build:
            self._build_graph(
                [self.root_component],
                self.input_spaces,
                optimizer=self.optimizer,
                # Important: Use sample-size, not batch-size as the sub-samples (from a batch) are the ones that get
                # multi-gpu-split.
                batch_size=self.update_spec["sample_size"],
                build_options=self.build_options)
            self.graph_built = True

class PPOLossFunction(LossFunction):
    """
    Loss function for proximal policy optimization:

    https://arxiv.org/abs/1707.06347
    """
    def __init__(self, discount=0.99, gae_lambda=1.0, clip_ratio=0.2, standardize_advantages=False, weight_entropy=None,
                 scope="ppo-loss-function", **kwargs):
        """
        Args:
            discount (float): The discount factor (gamma) to use.
            gae_lambda (float): Lambda for generalized advantage estimation.
            clip_ratio (float): How much to clip the likelihood ratio between old and new policy when updating.
            standardize_advantages (bool): If true, normalize advantage values in update.
            weight_entropy (Optional[float]): The coefficient used for the entropy regularization term.
                Default: 0.00025.
            **kwargs: Additional keyword args passed to the parent LossFunction constructor.
        """
        self.clip_ratio = clip_ratio
        self.standardize_advantages = standardize_advantages
        self.weight_entropy = weight_entropy if weight_entropy is not None else 0.00025

        super(PPOLossFunction, self).__init__(scope=scope, **kwargs)

        self.gae_function = GeneralizedAdvantageEstimation(gae_lambda=gae_lambda, discount=discount)
        self.add_components(self.gae_function)

    @rlgraph_api
    def loss(self, log_probs, baseline_values, actions, rewards, terminals, sequence_indices, logits):
        """
        API-method that calculates the total loss (average over per-batch-item loss) from the original input to
        per-item-loss.

        Args: see `self._graph_fn_loss_per_item`.

        Returns:
            Total loss, loss per item, total baseline loss, baseline loss per item.
        """
        loss_per_item, baseline_loss_per_item = self.loss_per_item(
            log_probs, baseline_values, actions, rewards, terminals, sequence_indices, logits
        )
        total_loss = self.loss_average(loss_per_item)
        total_baseline_loss = self.loss_average(baseline_loss_per_item)

        return total_loss, loss_per_item, total_baseline_loss, baseline_loss_per_item

    @rlgraph_api
    def _graph_fn_loss_per_item(self, log_probs, baseline_values, actions, rewards, terminals,
                                sequence_indices, entropy):
        """
        Args:
            log_probs (SingleDataOp): Log-likelihoods of the actions under the current policy.
            baseline_values (SingleDataOp): State-value (baseline) estimates for the batch.
            actions (SingleDataOp): The batch of actions that were actually taken in states s (from a memory).
            rewards (SingleDataOp): The batch of rewards that we received after having taken a in s (from a memory).

            terminals (SingleDataOp): The batch of terminal signals that we received after having taken a in s
                (from a memory).

            sequence_indices (DataOp): Bool indices denoting sequence boundaries (sequences may be non-terminal
                episode fragments from multiple environments).

            entropy (SingleDataOp): Policy entropy.

        Returns:
            SingleDataOp: The loss values vector (one single value for each batch item).
        """
        if get_backend() == "tf":
            # N.b.: Many implementations do the following:
            # Sample action -> return policy log probs with action -> feed both back in from memory/via placeholders.
            # This creates the same effect as just stopping the gradients on the log-probs.
            prev_log_probs = tf.stop_gradient(log_probs)
            baseline_values = tf.squeeze(input=baseline_values, axis=-1)

            # Compute advantages.
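            # Sequence indices mark sequence boundaries, so GAE is not accumulated across
            # episode fragments that come from different environments.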
            pg_advantages = self.gae_function.calc_gae_values(baseline_values, rewards, terminals, sequence_indices)

            if self.standardize_advantages:
                mean, variance = tf.nn.moments(x=pg_advantages, axes=[0])
                # tf.nn.moments returns (mean, variance), so standardize with the square root.
                pg_advantages = (pg_advantages - mean) / tf.sqrt(variance)

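            # Value targets (advantages + baseline = empirical returns) serve as regression
            # targets for the value function; gradients are stopped so the policy update
            # does not backpropagate through them.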
            v_targets = pg_advantages + baseline_values
            v_targets = tf.stop_gradient(input=v_targets)

            # Likelihood ratio and clipped objective.
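            # For A > 0, min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A) reduces to
            # min(ratio * A, (1 + eps) * A); for A < 0 it reduces to min(ratio * A, (1 - eps) * A).
            # The tf.where below implements exactly this sign-based form.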
            ratio = tf.exp(x=log_probs - prev_log_probs)
            clipped_advantages = tf.where(
                condition=pg_advantages > 0,
                x=(1 + self.clip_ratio) * pg_advantages,
                y=(1 - self.clip_ratio) * pg_advantages
            )

            loss = -tf.minimum(x=ratio * pg_advantages, y=clipped_advantages)
            # Entropy bonus: subtract so that higher policy entropy lowers the loss.
            loss -= self.weight_entropy * entropy

            baseline_loss = (v_targets - baseline_values) ** 2

            return loss, baseline_loss

        elif get_backend() == "pytorch":
            # Detach grads.
            prev_log_probs = log_probs.detach()
            baseline_values = torch.squeeze(baseline_values, dim=-1)

            # Compute advantages.
            pg_advantages = self.gae_function.calc_gae_values(baseline_values, rewards, terminals, sequence_indices)

            if self.standardize_advantages:
                pg_advantages = (pg_advantages - torch.mean(pg_advantages)) / torch.std(pg_advantages)

            v_targets = pg_advantages + baseline_values
            v_targets = v_targets.detach()

            # Likelihood ratio and clipped objective.
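            # Same sign-based rewrite of the clipped objective as in the TF branch above.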
            ratio = torch.exp(log_probs - prev_log_probs)
            clipped_advantages = torch.where(
                pg_advantages > 0,
                (1 + self.clip_ratio) * pg_advantages,
                (1 - self.clip_ratio) * pg_advantages
            )

            loss = -torch.min(ratio * pg_advantages, clipped_advantages)
            # Entropy bonus: subtract so that higher policy entropy lowers the loss.
            loss -= self.weight_entropy * entropy

            baseline_loss = (v_targets - baseline_values) ** 2
            return loss, baseline_loss
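
A quick standalone check (not part of the RLgraph source; values are illustrative) that the
sign-based where() rewrite used above matches the textbook clipped PPO surrogate
min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A):

import numpy as np

eps = 0.2
ratio = np.array([0.5, 0.9, 1.1, 1.5])
adv = np.array([1.0, -1.0, 2.0, -2.0])

# Textbook formulation with an explicit clip of the likelihood ratio.
textbook = np.minimum(ratio * adv, np.clip(ratio, 1.0 - eps, 1.0 + eps) * adv)
# Sign-based formulation as used in the loss function's where() call.
sign_based = np.minimum(ratio * adv, np.where(adv > 0, (1.0 + eps) * adv, (1.0 - eps) * adv))
assert np.allclose(textbook, sign_based)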