def __init__(self, clip_ratio, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the PPO algorithm.
    """
    super(PPOAgent, self).__init__(name=kwargs.pop("name", "ppo-agent"), **kwargs)
    self.train_time_steps = 0

    # PPO uses a ring buffer.
    self.memory = Memory.from_spec(memory_spec)
    self.record_space = Dict(
        states=self.state_space,
        actions=self.action_space,
        rewards=float,
        terminals=BoolBox(),
        add_batch_rank=False
    )
    self.policy = Policy(network_spec=self.neural_network, action_adapter_spec=None)

    self.merger = DictMerger(output_space=self.record_space)
    splitter_input_space = copy.deepcopy(self.record_space)
    self.splitter = ContainerSplitter(input_space=splitter_input_space)
    self.loss_function = PPOLossFunction(clip_ratio=clip_ratio, discount=self.discount)

    self.define_graph_api()
    if self.auto_build:
        self._build_graph()
        self.graph_built = True
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, execution_spec=None,
             optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None, update_spec=None,
             summary_spec=None, saver_spec=None, auto_build=True, name="sac-agent", double_q=True,
             initial_alpha=1.0, gumbel_softmax_temperature=1.0, target_entropy=None, memory_spec=None,
             value_function_sync_spec=None):
    """
    This is an implementation of the Soft Actor-Critic algorithm.

    Paper: http://arxiv.org/abs/1801.01290

    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance of ValueFunction.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings. May contain `sync_interval` or `sync_tau` (for the value network update).
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing components before building.
        name (str): Some name for this Agent object.
        double_q (bool): Whether to train two Q-networks independently.
        initial_alpha (float): "The temperature parameter α determines the relative importance of the entropy term against the reward".
        gumbel_softmax_temperature (float): Temperature parameter for the Gumbel-Softmax distribution used for discrete actions.
        target_entropy (Optional[float]): Target entropy for automatically adjusting the temperature alpha. If None, no alpha optimizer is created and alpha stays fixed at `initial_alpha`.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the SAC algorithm.
        value_function_sync_spec (Optional[SyncSpecification]): Synchronization spec for the target Q-functions. If None, one is derived from `update_spec`.
    """
    # If the VF spec is a network spec, wrap it with the SAC VF type. The VF must concatenate actions and
    # states, which can require splitting the network in the case of e.g. conv-inputs.
    if isinstance(value_function_spec, list):
        value_function_spec = dict(type="sac_value_function", network_spec=value_function_spec)
        self.logger.info("Using default SAC value function.")
    elif isinstance(value_function_spec, ValueFunction):
        self.logger.info("Using value function object {}".format(value_function_spec))

    if policy_spec is None:
        # Continuous action space: Use squashed normal.
        # Discrete: Gumbel-softmax.
        policy_spec = dict(
            deterministic=False,
            distributions_spec=dict(
                bounded_distribution_type="squashed",
                discrete_distribution_type="gumbel_softmax",
                gumbel_softmax_temperature=gumbel_softmax_temperature
            )
        )

    super(SACAgent, self).__init__(
        state_space=state_space, action_space=action_space, discount=discount,
        preprocessing_spec=preprocessing_spec, network_spec=network_spec,
        internal_states_space=internal_states_space, policy_spec=policy_spec,
        value_function_spec=value_function_spec, execution_spec=execution_spec, optimizer_spec=optimizer_spec,
        value_function_optimizer_spec=value_function_optimizer_spec, observe_spec=observe_spec,
        update_spec=update_spec, summary_spec=summary_spec, saver_spec=saver_spec, auto_build=auto_build,
        name=name
    )

    self.double_q = double_q
    self.target_entropy = target_entropy
    self.initial_alpha = initial_alpha

    # Assert that the sync interval is a multiple of the update_interval.
    if "sync_interval" in self.update_spec:
        if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
                self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
            raise RLGraphError(
                "ERROR: sync_interval ({}) must be multiple of update_interval ({})!".format(
                    self.update_spec["sync_interval"], self.update_spec["update_interval"]
                )
            )
    elif "sync_tau" in self.update_spec:
        if self.update_spec["sync_tau"] <= 0 or self.update_spec["sync_tau"] > 1.0:
            raise RLGraphError(
                "sync_tau ({}) must be in interval (0.0, 1.0]!".format(self.update_spec["sync_tau"])
            )
    else:
        self.update_spec["sync_tau"] = 0.005  # The value mentioned in the paper.

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)

    # self.iterations = self.update_spec["num_iterations"]
    self.batch_size = self.update_spec["batch_size"]

    float_action_space = self.action_space.with_batch_rank().map(
        mapping=lambda flat_key, space: space.as_one_hot_float_space() if isinstance(space, IntBox) else space
    )

    self.input_spaces.update(dict(
        env_actions=self.action_space.with_batch_rank(),
        actions=float_action_space,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        states=self.state_space.with_batch_rank(add_batch_rank=True),
        batch_size=int,
        importance_weights=FloatBox(add_batch_rank=True),
        deterministic=bool,
        weights="variables:{}".format(self.policy.scope)
    ))

    if value_function_sync_spec is None:
        value_function_sync_spec = SyncSpecification(
            sync_interval=self.update_spec["sync_interval"] // self.update_spec["update_interval"],
            sync_tau=self.update_spec["sync_tau"] if "sync_tau" in self.update_spec else 5e-3
        )

    self.memory = Memory.from_spec(memory_spec)
    self.alpha_optimizer = self.optimizer.copy(scope="alpha-" + self.optimizer.scope) \
        if self.target_entropy is not None else None

    self.root_component = SACAgentComponent(
        agent=self,
        policy=self.policy,
        q_function=self.value_function,
        preprocessor=self.preprocessor,
        memory=self.memory,
        discount=self.discount,
        initial_alpha=self.initial_alpha,
        target_entropy=target_entropy,
        optimizer=self.optimizer,
        vf_optimizer=self.value_function_optimizer,
        alpha_optimizer=self.alpha_optimizer,
        q_sync_spec=value_function_sync_spec,
        num_q_functions=2 if self.double_q is True else 1
    )

    extra_optimizers = [self.value_function_optimizer]
    if self.alpha_optimizer is not None:
        extra_optimizers.append(self.alpha_optimizer)
    self.build_options = dict(optimizers=extra_optimizers)

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"], build_options=self.build_options
        )
        self.graph_built = True
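# A usage sketch for the constructor above, showing the two target-sync modes read from `update_spec`
# (either `sync_interval` or `sync_tau`). All concrete values and spec layouts here are illustrative
# assumptions, not defaults taken from this file:
#
#     # Hard sync every 1000 updates (sync_interval must be a multiple of update_interval):
#     agent = SACAgent(state_space=FloatBox(shape=(3,)), action_space=FloatBox(shape=(1,), low=-1.0, high=1.0),
#                      update_spec={"batch_size": 256, "update_interval": 1, "sync_interval": 1000},
#                      memory_spec={"type": "replay", "capacity": 100000})
#
#     # Soft (Polyak-style) sync with coefficient tau (falls back to 0.005 above if neither key is given):
#     agent = SACAgent(state_space=FloatBox(shape=(3,)), action_space=FloatBox(shape=(1,), low=-1.0, high=1.0),
#                      update_spec={"batch_size": 256, "update_interval": 1, "sync_tau": 0.005},
#                      memory_spec={"type": "replay", "capacity": 100000})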
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, execution_spec=None,
             optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None, update_spec=None,
             summary_spec=None, saver_spec=None, auto_build=True, name="actor-critic-agent", gae_lambda=1.0,
             clip_rewards=0.0, sample_episodes=False, weight_entropy=None, memory_spec=None):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance of ValueFunction.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing components before building.
        name (str): Some name for this Agent object.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a +/- `clip_rewards` range.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes to fetch from the memory. If False, batch_size will refer to the number of time-steps. This is especially relevant for environments where episode lengths may vastly differ throughout training. For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    # Set policy to stochastic.
    if policy_spec is not None:
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(ActorCriticAgent, self).__init__(
        state_space=state_space, action_space=action_space, discount=discount,
        preprocessing_spec=preprocessing_spec, network_spec=network_spec,
        internal_states_space=internal_states_space, policy_spec=policy_spec,
        value_function_spec=value_function_spec, execution_spec=execution_spec, optimizer_spec=optimizer_spec,
        value_function_optimizer_spec=value_function_optimizer_spec, observe_spec=observe_spec,
        update_spec=update_spec, summary_spec=summary_spec, saver_spec=saver_spec, name=name,
        auto_build=auto_build
    )

    self.sample_episodes = sample_episodes

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True)
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")

    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: Actor-critic memory must be ring-buffer for episode-handling."

    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")

    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = ActorCriticLossFunction(weight_entropy=weight_entropy)

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.gae_function
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    self.build_options = dict(vf_optimizer=self.value_function_optimizer)

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"], build_options=self.build_options
        )
        self.graph_built = True
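# A minimal usage sketch for the constructor above. The memory must be a RingBuffer (asserted in the
# body); the concrete spec values and layer sizes below are illustrative assumptions:
#
#     agent = ActorCriticAgent(
#         state_space=FloatBox(shape=(4,)),
#         action_space=IntBox(2),
#         network_spec=[{"type": "dense", "units": 64}],
#         value_function_spec=[{"type": "dense", "units": 64}],
#         memory_spec={"type": "ring_buffer", "capacity": 1000},
#         gae_lambda=0.97,
#         sample_episodes=True
#     )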
def __init__(self, expert_margin=0.5, supervised_weight=1.0, double_q=True, dueling_q=True, huber_loss=False,
             n_step=1, shared_container_action_target=True, memory_spec=None, demo_memory_spec=None,
             demo_sample_ratio=0.2, store_last_memory_batch=False, store_last_q_table=False, **kwargs):
    # TODO Most of this is DQN duplicate but the way the loss function is instantiated, inheriting
    # from DQN does not work well.
    """
    Args:
        expert_margin (float): The expert margin enforces a distance in Q-values between expert action and all other actions.
        supervised_weight (float): Indicates weight of the expert loss.
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
        demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
        demo_sample_ratio (float): Ratio of demo samples in each combined update batch (used to derive `demo_batch_size` from `update_spec["batch_size"]`).
        store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in `self.last_memory_batch` for debugging purposes. Default: False.
        store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch (memory or external) in `self.last_q_table` for debugging purposes. Default: False.
    """
    # Fix the policy-spec before passing it to the super constructor.
    policy_spec = kwargs.pop("policy_spec", dict())
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQFDAgent, self).__init__(
        policy_spec=policy_spec, name=kwargs.pop("name", "dqfd-agent"), **kwargs
    )

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.demo_batch_size = int(demo_sample_ratio * self.update_spec["batch_size"] / (1.0 - demo_sample_ratio))
    self.shared_container_action_target = shared_container_action_target

    # Debugging tools.
    self.store_last_memory_batch = store_last_memory_batch
    self.last_memory_batch = None
    self.store_last_q_table = store_last_q_table
    self.last_q_table = None

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        time_step=int,
        use_exploration=bool,
        demo_batch_size=int,
        apply_demo_loss=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")

    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # Cannot have the same default scope as the replay memory.
    demo_memory_spec["scope"] = "demo-memory"
    self.demo_memory = Memory.from_spec(demo_memory_spec)

    # The splitter for splitting up the records from the memories.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQFDLossFunction(
        expert_margin=expert_margin, supervised_weight=supervised_weight,
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=use_importance_weights, n_step=n_step
    )

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
        self.target_policy, self.exploration, self.loss_function, self.optimizer
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
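# Worked example for the demo-batch sizing used above (values are illustrative): with
# demo_sample_ratio=0.2 and update_spec["batch_size"]=32,
#     demo_batch_size = int(0.2 * 32 / (1.0 - 0.2)) = int(8.0) = 8,
# so each update can combine 32 replay samples with 8 demo samples, and demos then make up
# 8 / (32 + 8) = 0.2 of the combined batch, matching the requested ratio.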
def __init__(self, double_q=True, dueling_q=True, huber_loss=False, n_step=1, memory_spec=None,
             store_last_memory_batch=False, store_last_q_table=False, **kwargs):
    """
    Args:
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
        store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in `self.last_memory_batch` for debugging purposes. Default: False.
        store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch (memory or external) in `self.last_q_table` for debugging purposes. Default: False.
    """
    # Fix action-adapter before passing it to the super constructor.
    action_adapter_spec = kwargs.pop("action_adapter_spec", dict())
    # Use a DuelingActionAdapter (instead of a basic ActionAdapter) if option is set.
    if dueling_q is True:
        action_adapter_spec["type"] = "dueling-action-adapter"
        assert "units_state_value_stream" in action_adapter_spec
        assert "units_advantage_stream" in action_adapter_spec

    super(DQNAgent, self).__init__(
        action_adapter_spec=action_adapter_spec, name=kwargs.pop("name", "dqn-agent"), **kwargs
    )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss

    # Debugging tools.
    self.store_last_memory_batch = store_last_memory_batch
    self.last_memory_batch = None
    self.store_last_q_table = store_last_q_table
    self.last_q_table = None

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        weights="variables:policy",
        time_step=int,
        use_exploration=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space,
        # TODO: This is currently necessary for multi-GPU handling (as the update_from_external_batch
        # TODO: gets overridden by a generic function with args=*inputs)
        # inputs=[preprocessed_state_space, self.action_space.with_batch_rank(), reward_space, terminal_space,
        #         preprocessed_state_space, weight_space]
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = DictMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    self.target_policy.add_components(Synchronizable(), expose_apis="sync")
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQNLossFunction(
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        importance_weights=use_importance_weights, n_step=n_step
    )

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.target_policy, self.exploration, self.loss_function, self.optimizer
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api("policy", "preprocessor-stack", self.optimizer.scope, *sub_components)

    # markup = get_graph_markup(self.graph_builder.root_component)
    # print(markup)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, execution_spec=None,
             optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None, update_spec=None,
             summary_spec=None, saver_spec=None, auto_build=True, name="ppo-agent", clip_ratio=0.2,
             gae_lambda=1.0, clip_rewards=0.0, value_function_clipping=None, standardize_advantages=False,
             sample_episodes=True, weight_entropy=None, memory_spec=None):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list, dict, ValueFunction): Neural network specification for baseline or instance of ValueFunction.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing components before building.
        name (str): Some name for this Agent object.
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a +/- `clip_rewards` range.
        value_function_clipping (Optional[float]): If not None, uses the clipped value function objective. If None, uses the simple value function objective.
        standardize_advantages (bool): If True, standardize advantage values in the update.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes to fetch from the memory. If False, batch_size will refer to the number of time-steps. This is especially relevant for environments where episode lengths may vastly differ throughout training. For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    if policy_spec is not None:
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(PPOAgent, self).__init__(
        state_space=state_space, action_space=action_space, discount=discount,
        preprocessing_spec=preprocessing_spec, network_spec=network_spec,
        internal_states_space=internal_states_space, policy_spec=policy_spec,
        value_function_spec=value_function_spec, execution_spec=execution_spec, optimizer_spec=optimizer_spec,
        value_function_optimizer_spec=value_function_optimizer_spec, observe_spec=observe_spec,
        update_spec=update_spec, summary_spec=summary_spec, saver_spec=saver_spec, name=name,
        auto_build=auto_build
    )

    self.sample_episodes = sample_episodes

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:policy",
        value_function_weights="variables:value-function",
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True),
        apply_postprocessing=bool
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")

    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: PPO memory must be ring-buffer for episode-handling!"

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    self.standardize_advantages = standardize_advantages
    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = PPOLossFunction(
        clip_ratio=clip_ratio, value_function_clipping=value_function_clipping, weight_entropy=weight_entropy
    )

    self.iterations = self.update_spec["num_iterations"]
    self.sample_size = self.update_spec["sample_size"]
    self.batch_size = self.update_spec["batch_size"]

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.policy, self.exploration,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.vars_merger, self.vars_splitter, self.gae_function
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()
    self.build_options = dict(vf_optimizer=self.value_function_optimizer)

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            # Important: Use sample-size, not batch-size, as the sub-samples (from a batch) are the ones
            # that get multi-gpu-split.
            batch_size=self.update_spec["sample_size"],
            build_options=self.build_options
        )
        self.graph_built = True
def __init__(self, gae_lambda=1.0, sample_episodes=False, weight_entropy=None, memory_spec=None, **kwargs):
    """
    Args:
        gae_lambda (float): Lambda for generalized advantage estimation.
        sample_episodes (bool): If true, the update method interprets the batch_size as the number of episodes to fetch from the memory. If false, batch_size will refer to the number of time-steps. This is especially relevant for environments where episode lengths may vastly differ throughout training. For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    super(ActorCriticAgent, self).__init__(
        policy_spec=dict(deterministic=False),  # Set policy to stochastic.
        name=kwargs.pop("name", "actor-critic-agent"),
        **kwargs
    )

    self.sample_episodes = sample_episodes

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True)
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = DictMerger("states", "actions", "rewards", "terminals")

    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: Actor-critic memory must be ring-buffer for episode-handling."

    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")

    self.loss_function = ActorCriticLossFunction(
        discount=self.discount, gae_lambda=gae_lambda, weight_entropy=weight_entropy
    )

    # Add all our sub-components to the core.
    sub_components = [
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer
    ]
    self.root_component.add_components(*sub_components)

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"],
            build_options=dict(vf_optimizer=self.value_function_optimizer)
        )
        self.graph_built = True
def __init__(self, double_q=True, dueling_q=True, huber_loss=False, n_step=1,
             shared_container_action_target=True, memory_spec=None, store_last_memory_batch=False,
             store_last_q_table=False, **kwargs):
    """
    Args:
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
        store_last_memory_batch (bool): Whether to store the last pulled batch from the memory in `self.last_memory_batch` for debugging purposes. Default: False.
        store_last_q_table (bool): Whether to store the Q(s,a) values for the last received batch (memory or external) in `self.last_q_table` for debugging purposes. Default: False.
    """
    # Fix the policy-spec before passing it to the super constructor.
    policy_spec = kwargs.pop("policy_spec", dict())
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQNAgent, self).__init__(
        policy_spec=policy_spec, name=kwargs.pop("name", "dqn-agent"), **kwargs
    )

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.shared_container_action_target = shared_container_action_target

    # Debugging tools.
    self.store_last_memory_batch = store_last_memory_batch
    self.last_memory_batch = None
    self.store_last_q_table = store_last_q_table
    self.last_q_table = None

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        # Weights will have a Space derived from the vars of policy.
        policy_weights="variables:{}".format(self.policy.scope),
        time_step=int,
        use_exploration=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space,
    ))
    if self.value_function is not None:
        self.input_spaces["value_function_weights"] = "variables:{}".format(self.value_function.scope)

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = DictMerger("states", "actions", "rewards", "next_states", "terminals")
    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQNLossFunction(
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=use_importance_weights, n_step=n_step
    )

    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy,
        self.target_policy, self.value_function, self.value_function_optimizer,  # <- should both be None for DQN
        self.exploration, self.loss_function, self.optimizer,
        self.vars_merger, self.vars_splitter
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    # markup = get_graph_markup(self.graph_builder.root_component)
    # print(markup)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
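# Illustration of the dueling fix-up above: with dueling_q=True and no user-supplied policy_spec,
# the spec handed to the super constructor effectively becomes
#
#     policy_spec = {"type": "dueling-policy", "units_state_value_stream": 128}
#
# A user-supplied policy_spec keeps its own "units_state_value_stream" value if present.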
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, exploration_spec=None, execution_spec=None,
             optimizer_spec=None, observe_spec=None, update_spec=None, summary_spec=None, saver_spec=None,
             auto_build=True, name="dqfd-agent", expert_margin=0.5, supervised_weight=1.0, double_q=True,
             dueling_q=True, huber_loss=False, n_step=1, shared_container_action_target=False,
             memory_spec=None, demo_memory_spec=None, demo_sample_ratio=0.2):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing components before building.
        name (str): Some name for this Agent object.
        expert_margin (float): The expert margin enforces a distance in Q-values between expert action and all other actions.
        supervised_weight (float): Indicates weight of the expert loss.
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use.
        demo_memory_spec (Optional[dict,Memory]): The spec for the Demo-Memory to use.
        demo_sample_ratio (float): Ratio of demo samples in each combined update batch (used to derive `demo_batch_size` from `update_spec["batch_size"]`).
    """
    # Fix the policy-spec before passing it to the super constructor.
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        if policy_spec is None:
            policy_spec = {}
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQFDAgent, self).__init__(
        state_space=state_space, action_space=action_space, discount=discount,
        preprocessing_spec=preprocessing_spec, network_spec=network_spec,
        internal_states_space=internal_states_space, policy_spec=policy_spec, exploration_spec=exploration_spec,
        execution_spec=execution_spec, optimizer_spec=optimizer_spec, observe_spec=observe_spec,
        update_spec=update_spec, summary_spec=summary_spec, saver_spec=saver_spec, auto_build=auto_build,
        name=name
    )

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.expert_margin = expert_margin

    self.batch_size = self.update_spec["batch_size"]
    self.default_margins = np.asarray([self.expert_margin] * self.batch_size)

    self.demo_batch_size = int(demo_sample_ratio * self.update_spec["batch_size"] / (1.0 - demo_sample_ratio))
    self.demo_margins = np.asarray([self.expert_margin] * self.demo_batch_size)
    self.shared_container_action_target = shared_container_action_target

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:{}".format(self.policy.scope),
        time_step=int,
        use_exploration=bool,
        demo_batch_size=int,
        apply_demo_loss=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        expert_margins=FloatBox(add_batch_rank=True),
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "next_states", "terminals")

    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # Cannot have the same default scope as the replay memory.
    demo_memory_spec["scope"] = "demo-memory"
    self.demo_memory = Memory.from_spec(demo_memory_spec)

    # The splitter for splitting up the records from the memories.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    self.use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQFDLossFunction(
        supervised_weight=supervised_weight, discount=self.discount, double_q=self.double_q,
        huber_loss=self.huber_loss, shared_container_action_target=shared_container_action_target,
        importance_weights=self.use_importance_weights, n_step=n_step
    )

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.demo_memory, self.splitter, self.policy,
        self.target_policy, self.exploration, self.loss_function, self.optimizer
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
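# A minimal usage sketch for the constructor above. DQFD needs both a regular memory and a demo memory;
# all concrete spec values below are illustrative assumptions:
#
#     agent = DQFDAgent(
#         state_space=FloatBox(shape=(4,)),
#         action_space=IntBox(2),
#         network_spec=[{"type": "dense", "units": 64}],
#         memory_spec={"type": "prioritized_replay", "capacity": 50000},
#         demo_memory_spec={"type": "prioritized_replay", "capacity": 10000},
#         demo_sample_ratio=0.2, expert_margin=0.5, supervised_weight=1.0,
#         update_spec={"batch_size": 32, "update_interval": 4, "sync_interval": 1000}
#     )
#
#     # Demonstration data would then be inserted into agent.demo_memory before training
#     # (the exact insertion API is not shown in this file).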
def __init__(self, double_q=True, initial_alpha=1.0, gumbel_softmax_temperature=1.0, target_entropy=None,
             memory_spec=None, value_function_sync_spec=None, **kwargs):
    """
    This is an implementation of the Soft Actor-Critic algorithm.

    Paper: http://arxiv.org/abs/1801.01290

    Args:
        double_q (bool): Whether to train two Q-networks independently.
        initial_alpha (float): "The temperature parameter α determines the relative importance of the entropy term against the reward".
        gumbel_softmax_temperature (float): Temperature parameter for the Gumbel-Softmax distribution used for discrete actions.
        target_entropy (Optional[float]): Target entropy for automatically adjusting the temperature alpha. If None, no alpha optimizer is created and alpha stays fixed at `initial_alpha`.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the SAC algorithm.
        value_function_sync_spec (Optional[SyncSpecification]): Synchronization spec for the target Q-functions. If None, one is derived from `update_spec`.
        update_spec (dict): Here we can have sync_interval or sync_tau (for the value network update).
    """
    value_function_spec = kwargs.pop("value_function_spec")
    value_function_spec = dict(type="sac_value_function", network_spec=value_function_spec)

    super(SACAgent, self).__init__(
        # Continuous action space: Use squashed normal.
        # Discrete: Gumbel-softmax.
        policy_spec=dict(
            deterministic=False,
            distributions_spec=dict(
                bounded_distribution_type="squashed",
                discrete_distribution_type="gumbel_softmax",
                gumbel_softmax_temperature=gumbel_softmax_temperature
            )
        ),
        name=kwargs.pop("name", "sac-agent"),
        value_function_spec=value_function_spec,
        **kwargs
    )

    self.double_q = double_q
    self.target_entropy = target_entropy
    self.initial_alpha = initial_alpha

    # Assert that the sync interval is a multiple of the update_interval.
    if "sync_interval" in self.update_spec:
        if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
                self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
            raise RLGraphError(
                "ERROR: sync_interval ({}) must be multiple of update_interval ({})!".format(
                    self.update_spec["sync_interval"], self.update_spec["update_interval"]
                )
            )
    elif "sync_tau" in self.update_spec:
        if self.update_spec["sync_tau"] <= 0 or self.update_spec["sync_tau"] > 1.0:
            raise RLGraphError(
                "sync_tau ({}) must be in interval (0.0, 1.0]!".format(self.update_spec["sync_tau"])
            )
    else:
        self.update_spec["sync_tau"] = 0.005  # The value mentioned in the paper.

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)

    self.iterations = self.update_spec["num_iterations"]
    self.batch_size = self.update_spec["batch_size"]

    float_action_space = self.action_space.with_batch_rank()
    if isinstance(self.action_space, Dict):
        for name, space in float_action_space.flatten(scope_separator_at_start=False).items():
            if isinstance(space, IntBox):
                float_action_space[name] = space.as_one_hot_float_space()
    elif isinstance(self.action_space, IntBox):
        float_action_space = float_action_space.as_one_hot_float_space()

    self.input_spaces.update(dict(
        env_actions=self.action_space.with_batch_rank(),
        actions=float_action_space,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        states=self.state_space.with_batch_rank(add_batch_rank=True),
        batch_size=int,
        importance_weights=FloatBox(add_batch_rank=True),
        deterministic=bool,
        weights="variables:{}".format(self.policy.scope)
    ))

    if value_function_sync_spec is None:
        value_function_sync_spec = SyncSpecification(
            sync_interval=self.update_spec["sync_interval"] // self.update_spec["update_interval"],
            sync_tau=self.update_spec["sync_tau"] if "sync_tau" in self.update_spec else 5e-3
        )

    self.memory = Memory.from_spec(memory_spec)
    self.alpha_optimizer = self.optimizer.copy(scope="alpha-" + self.optimizer.scope) \
        if self.target_entropy is not None else None

    self.root_component = SACAgentComponent(
        agent=self,
        policy=self.policy,
        q_function=self.value_function,
        preprocessor=self.preprocessor,
        memory=self.memory,
        discount=self.discount,
        initial_alpha=self.initial_alpha,
        target_entropy=target_entropy,
        optimizer=self.optimizer,
        vf_optimizer=self.value_function_optimizer,
        alpha_optimizer=self.alpha_optimizer,
        q_sync_spec=value_function_sync_spec,
        num_q_functions=2 if self.double_q is True else 1
    )

    extra_optimizers = [self.value_function_optimizer]
    if self.alpha_optimizer is not None:
        extra_optimizers.append(self.alpha_optimizer)
    self.build_options = dict(optimizers=extra_optimizers)

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"], build_options=self.build_options
        )
        self.graph_built = True
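# Example of the action-space conversion above (hedged): discrete IntBox actions are mapped to their
# one-hot float equivalent so that SAC's continuous machinery (and the Gumbel-Softmax distribution)
# can operate on them. For instance, assuming a 3-action space:
#
#     space = IntBox(3, add_batch_rank=True)
#     float_space = space.as_one_hot_float_space()
#     # float_space is now a float Space with one slot per discrete action (3 here), while
#     # `env_actions` keeps the original integer Space for interacting with the environment.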
def __init__(self, clip_ratio=0.2, gae_lambda=1.0, clip_rewards=0.0, standardize_advantages=False,
             sample_episodes=True, weight_entropy=None, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): Clipping parameter for the likelihood ratio.
        gae_lambda (float): Lambda for generalized advantage estimation.
        clip_rewards (float): Reward clipping value. If not 0, rewards will be clipped within a +/- `clip_rewards` range.
        standardize_advantages (bool): If True, standardize advantage values in the update.
        sample_episodes (bool): If True, the update method interprets the batch_size as the number of episodes to fetch from the memory. If False, batch_size will refer to the number of time-steps. This is especially relevant for environments where episode lengths may vastly differ throughout training. For example, in CartPole, a losing episode is typically 10 steps, and a winning episode 200 steps.
        weight_entropy (float): The coefficient used for the entropy regularization term (L[E]).
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use. Should typically be a ring-buffer.
    """
    if "policy_spec" in kwargs:
        policy_spec = kwargs.pop("policy_spec")
        policy_spec["deterministic"] = False
    else:
        policy_spec = dict(deterministic=False)

    super(PPOAgent, self).__init__(
        policy_spec=policy_spec,  # Set policy to stochastic.
        name=kwargs.pop("name", "ppo-agent"),
        **kwargs
    )

    self.sample_episodes = sample_episodes

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        policy_weights="variables:policy",
        value_function_weights="variables:value-function",
        deterministic=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        sequence_indices=BoolBox(add_batch_rank=True),
        apply_postprocessing=bool
    ))

    # The merger to merge inputs into one record Dict going into the memory.
    self.merger = ContainerMerger("states", "actions", "rewards", "terminals")

    self.memory = Memory.from_spec(memory_spec)
    assert isinstance(self.memory, RingBuffer), \
        "ERROR: PPO memory must be ring-buffer for episode-handling!"

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals")

    self.gae_function = GeneralizedAdvantageEstimation(
        gae_lambda=gae_lambda, discount=self.discount, clip_rewards=clip_rewards
    )
    self.loss_function = PPOLossFunction(
        clip_ratio=clip_ratio, standardize_advantages=standardize_advantages, weight_entropy=weight_entropy
    )

    self.iterations = self.update_spec["num_iterations"]
    self.sample_size = self.update_spec["sample_size"]
    self.batch_size = self.update_spec["batch_size"]

    # Add all our sub-components to the core.
    self.root_component.add_components(
        self.preprocessor, self.merger, self.memory, self.splitter, self.policy, self.exploration,
        self.loss_function, self.optimizer, self.value_function, self.value_function_optimizer,
        self.vars_merger, self.vars_splitter, self.gae_function
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()
    self.build_options = dict(vf_optimizer=self.value_function_optimizer)

    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            # Important: Use sample-size, not batch-size, as the sub-samples (from a batch) are the ones
            # that get multi-gpu-split.
            batch_size=self.update_spec["sample_size"],
            build_options=self.build_options
        )
        self.graph_built = True
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, exploration_spec=None, execution_spec=None,
             optimizer_spec=None, observe_spec=None, update_spec=None, summary_spec=None, saver_spec=None,
             auto_build=True, name="dqn-agent", double_q=True, dueling_q=True, huber_loss=False, n_step=1,
             shared_container_action_target=True, memory_spec=None):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's graph builder. If False, users must separately call agent.build(). Useful for debugging or analyzing components before building.
        name (str): Some name for this Agent object.
        double_q (bool): Whether to use the double DQN loss function (see [2]).
        dueling_q (bool): Whether to use a dueling layer in the ActionAdapter (see [3]).
        huber_loss (bool): Whether to apply a Huber loss (see [4]).
        n_step (Optional[int]): n-step adjustment to discounting.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the DQN algorithm.
    """
    # Fix the policy-spec before passing it to the super constructor.
    # Use a DuelingPolicy (instead of a basic Policy) if option is set.
    if dueling_q is True:
        if policy_spec is None:
            policy_spec = {}
        policy_spec["type"] = "dueling-policy"
        # Give us some default state-value nodes.
        if "units_state_value_stream" not in policy_spec:
            policy_spec["units_state_value_stream"] = 128

    super(DQNAgent, self).__init__(
        state_space=state_space, action_space=action_space, discount=discount,
        preprocessing_spec=preprocessing_spec, network_spec=network_spec,
        internal_states_space=internal_states_space, policy_spec=policy_spec, exploration_spec=exploration_spec,
        execution_spec=execution_spec, optimizer_spec=optimizer_spec, observe_spec=observe_spec,
        update_spec=update_spec, summary_spec=summary_spec, saver_spec=saver_spec, auto_build=auto_build,
        name=name
    )

    # TODO: Have to manually set it here for multi-GPU synchronizer to know its number
    # TODO: of return values when calling _graph_fn_calculate_update_from_external_batch.
    # self.root_component.graph_fn_num_outputs["_graph_fn_update_from_external_batch"] = 4

    # Assert that the sync interval is a multiple of the update_interval.
    if self.update_spec["sync_interval"] / self.update_spec["update_interval"] != \
            self.update_spec["sync_interval"] // self.update_spec["update_interval"]:
        raise RLGraphError(
            "ERROR: sync_interval ({}) must be multiple of update_interval "
            "({})!".format(self.update_spec["sync_interval"], self.update_spec["update_interval"])
        )

    self.double_q = double_q
    self.dueling_q = dueling_q
    self.huber_loss = huber_loss
    self.shared_container_action_target = shared_container_action_target

    # Extend input Space definitions to this Agent's specific API-methods.
    preprocessed_state_space = self.preprocessed_state_space.with_batch_rank()
    reward_space = FloatBox(add_batch_rank=True)
    terminal_space = BoolBox(add_batch_rank=True)
    weight_space = FloatBox(add_batch_rank=True)

    self.input_spaces.update(dict(
        actions=self.action_space.with_batch_rank(),
        # Weights will have a Space derived from the vars of policy.
        policy_weights="variables:{}".format(self.policy.scope),
        use_exploration=bool,
        preprocessed_states=preprocessed_state_space,
        rewards=reward_space,
        terminals=terminal_space,
        next_states=preprocessed_state_space,
        preprocessed_next_states=preprocessed_state_space,
        importance_weights=weight_space,
        apply_postprocessing=bool
    ))
    if self.value_function is not None:
        self.input_spaces["value_function_weights"] = "variables:{}".format(self.value_function.scope)

    # The replay memory.
    self.memory = Memory.from_spec(memory_spec)
    # The splitter for splitting up the records coming from the memory.
    self.splitter = ContainerSplitter("states", "actions", "rewards", "terminals", "next_states")

    # Make sure the python buffer is not larger than our memory capacity.
    assert self.observe_spec["buffer_size"] <= self.memory.capacity, \
        "ERROR: Buffer's size ({}) in `observe_spec` must be smaller or equal to the memory's capacity ({})!". \
        format(self.observe_spec["buffer_size"], self.memory.capacity)

    # Copy our Policy (target-net), make target-net synchronizable.
    self.target_policy = self.policy.copy(scope="target-policy", trainable=False)
    # Number of steps since the last target-net synching from the main policy.
    self.steps_since_target_net_sync = 0

    use_importance_weights = isinstance(self.memory, PrioritizedReplay)
    self.loss_function = DQNLossFunction(
        discount=self.discount, double_q=self.double_q, huber_loss=self.huber_loss,
        shared_container_action_target=shared_container_action_target,
        importance_weights=use_importance_weights, n_step=n_step
    )

    self.root_component.add_components(
        self.preprocessor, self.memory, self.splitter, self.policy, self.target_policy,
        self.value_function, self.value_function_optimizer,  # <- should both be None for DQN
        self.exploration, self.loss_function, self.optimizer,
        self.vars_merger, self.vars_splitter
    )

    # Define the Agent's (root-Component's) API.
    self.define_graph_api()

    # markup = get_graph_markup(self.graph_builder.root_component)
    # print(markup)
    if self.auto_build:
        self._build_graph(
            [self.root_component], self.input_spaces, optimizer=self.optimizer,
            batch_size=self.update_spec["batch_size"]
        )
        self.graph_built = True
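# A minimal usage sketch for the constructor above. All concrete spec values are illustrative
# assumptions, not defaults from this file:
#
#     agent = DQNAgent(
#         state_space=FloatBox(shape=(4,)),
#         action_space=IntBox(2),
#         network_spec=[{"type": "dense", "units": 64}],
#         memory_spec={"type": "prioritized_replay", "capacity": 50000},
#         update_spec={"batch_size": 32, "update_interval": 4, "sync_interval": 1000},
#         double_q=True, dueling_q=True, huber_loss=True
#     )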