def __init__(self, clip_ratio, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the PPO algorithm.
    """
    super(PPOAgent, self).__init__(name=kwargs.pop("name", "ppo-agent"), **kwargs)

    self.train_time_steps = 0

    # PPO uses a ring buffer.
    self.memory = Memory.from_spec(memory_spec)
    self.record_space = Dict(
        states=self.state_space,
        actions=self.action_space,
        rewards=float,
        terminals=BoolBox(),
        add_batch_rank=False
    )
    self.policy = Policy(network_spec=self.neural_network, action_adapter_spec=None)

    self.merger = DictMerger(output_space=self.record_space)
    splitter_input_space = copy.deepcopy(self.record_space)
    self.splitter = ContainerSplitter(input_space=splitter_input_space)
    self.loss_function = PPOLossFunction(clip_ratio=clip_ratio, discount=self.discount)

    self.define_graph_api()
    if self.auto_build:
        self._build_graph()
        self.graph_built = True
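# Hedged illustration (not part of the original source): the record_space above defines the
# layout of a single transition stored in the ring-buffer memory. The sketch below builds an
# analogous record space for assumed CartPole-like state/action spaces and draws one sample,
# just to show what such a stored record looks like.
def _example_ppo_record_space():
    from rlgraph.spaces import BoolBox, Dict, FloatBox, IntBox

    # Assumed spaces: a 4-dim float observation and a discrete action with 2 choices.
    record_space = Dict(
        states=FloatBox(shape=(4,)),
        actions=IntBox(2),
        rewards=float,
        terminals=BoolBox(),
        add_batch_rank=False
    )
    # sample() returns a dict with one random value per key, matching the space layout.
    return record_space.sample()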
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None,
             execution_spec=None, optimizer_spec=None, value_function_optimizer_spec=None,
             observe_spec=None, update_spec=None, summary_spec=None, saver_spec=None, auto_build=True,
             name="ppo-agent", clip_ratio=0.2, gae_lambda=1.0, clip_rewards=0.0,
             value_function_clipping=None, standardize_advantages=False, sample_episodes=True,
             weight_entropy=None, memory_spec=None):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
            object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list, dict, ValueFunction): Neural network specification for the baseline or an
            instance of ValueFunction.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the
            optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph
            builder. If False, users must separately call agent.build(). Useful for debugging or analyzing
            components before building.
        name (str): Some name for this Agent object.
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a
            +/- `clip_rewards` range.
        value_function_clipping (Optional[float]): If not None, uses the clipped value function objective.
            If None, uses the simple value function objective.
        standardize_advantages (bool): If True, standardize advantage values in the update.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes
            to fetch from the memory. If False, batch_size refers to the number of time-steps. This is especially
            relevant for environments where episode lengths may vastly differ throughout training. For example,
            in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
""" if policy_spec is not None: policy_spec["deterministic"] = False else: policy_spec = dict(deterministic=False) super(PPOAgent, self).__init__( state_space=state_space, action_space=action_space, discount=discount, preprocessing_spec=preprocessing_spec, network_spec=network_spec, internal_states_space=internal_states_space, policy_spec=policy_spec, value_function_spec=value_function_spec, execution_spec=execution_spec, optimizer_spec=optimizer_spec, value_function_optimizer_spec=value_function_optimizer_spec, observe_spec=observe_spec, update_spec=update_spec, summary_spec=summary_spec, saver_spec=saver_spec, name=name, auto_build=auto_build) self.sample_episodes = sample_episodes # TODO: Have to manually set it here for multi-GPU synchronizer to know its number # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch. # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4 # Extend input Space definitions to this Agent's specific API-methods. preprocessed_state_space = self.preprocessed_state_space.with_batch_rank( ) reward_space = FloatBox(add_batch_rank=True) terminal_space = BoolBox(add_batch_rank=True) self.input_spaces.update( dict(actions=self.action_space.with_batch_rank(), policy_weights="variables:policy", value_function_weights="variables:value-function", deterministic=bool, preprocessed_states=preprocessed_state_space, rewards=reward_space, terminals=terminal_space, sequence_indices=BoolBox(add_batch_rank=True), apply_postprocessing=bool)) # The merger to merge inputs into one record Dict going into the memory. self.merger = ContainerMerger("states", "actions", "rewards", "terminals") self.memory = Memory.from_spec(memory_spec) assert isinstance( self.memory, RingBuffer ), "ERROR: PPO memory must be ring-buffer for episode-handling!" # Make sure the python buffer is not larger than our memory capacity. assert self.observe_spec["buffer_size"] <= self.memory.capacity, \ "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \ format(self.observe_spec["buffer_size"], self.memory.capacity) # The splitter for splitting up the records coming from the memory. self.standardize_advantages = standardize_advantages self.gae_function = GeneralizedAdvantageEstimation( gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards) self.loss_function = PPOLossFunction( clip_ratio=clip_ratio, value_function_clipping=value_function_clipping, weight_entropy=weight_entropy) self.iterations = self.update_spec["num_iterations"] self.sample_size = self.update_spec["sample_size"] self.batch_size = self.update_spec["batch_size"] # Add all our sub-components to the core. self.root_component.add_components( self.preprocessor, self.merger, self.memory, self.policy, self.exploration, self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer, self.vars_merger, self.vars_splitter, self.gae_function) # Define the Agent's (root-Component's) API. self.define_graph_api() self.build_options = dict(vf_optimizer=self.value_function_optimizer) if self.auto_build: self._build_graph( [self.root_component], self.input_spaces, optimizer=self.optimizer, # Important: Use sample-size, not batch-size as the sub-samples (from a batch) are the ones that get # multi-gpu-split. batch_size=self.update_spec["sample_size"], build_options=self.build_options) self.graph_built = True
def __init__(self, clip_ratio=0.2, gae_lambda=1.0, clip_rewards=0.0, standardize_advantages=False,
             sample_episodes=True, weight_entropy=None, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a
            +/- `clip_rewards` range.
        standardize_advantages (bool): If True, standardize advantage values in the update.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes
            to fetch from the memory. If False, batch_size refers to the number of time-steps. This is especially
            relevant for environments where episode lengths may vastly differ throughout training. For example,
            in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    if "policy_spec" in kwargs:
        policy_spec = kwargs.pop("policy_spec")
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(PPOAgent, self).__init__(
        policy_spec=policy_spec,  # Set policy to stochastic.
        name=kwargs.pop("name", "ppo-agent"),
        **kwargs
    )
    self.sample_episodes = sample_episodes

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:policy",
        value_function_weights="variables:value-function",
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True),
        apply_postprocessing=bool
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")

    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), "ERROR: PPO memory must be ring-buffer for episode-handling!"

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = PPOLossFunction(
        clip_ratio=clip_ratio, standardize_advantages=standardize_advantages, weight_entropy=weight_entropy
    )

    self.iterations = self.update_spec["num_iterations"]
    self.sample_size = self.update_spec["sample_size"]
    self.batch_size = self.update_spec["batch_size"]

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy, self.exploration,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.vars_merger, self.vars_splitter, self.gae_function
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            # Important: Use sample-size, not batch-size, as the sub-samples (from a batch) are the ones
            # that get multi-gpu-split.
            batch_size=self.update_spec["sample_size"],
            build_options=self.build_options
        )
        self.graph_built = True
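# Hedged usage sketch (not part of the original source): because this variant forwards most
# settings via **kwargs, it lends itself to config-driven construction through Agent.from_spec().
# The agent type-string ("ppo"), the memory type-string, and the spec values below are
# assumptions for illustration only.
def _example_ppo_agent_from_spec():
    from rlgraph.agents import Agent
    from rlgraph.environments import OpenAIGymEnv

    env = OpenAIGymEnv("CartPole-v0")
    agent_config = dict(
        type="ppo",
        network_spec=[{"type": "dense", "units": 32}],
        memory_spec={"type": "ring-buffer", "capacity": 1000},
        update_spec=dict(num_iterations=10, sample_size=32, batch_size=128),
        clip_ratio=0.2
    )
    # from_spec() resolves the type-string to the agent class and passes the remaining keys as kwargs.
    agent = Agent.from_spec(agent_config, state_space=env.state_space, action_space=env.action_space)
    return agent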