def __init__(self, clip_ratio, memory_spec=None, **kwargs):
    """
    Args:
        clip_ratio (float): The clipping parameter for the likelihood-ratio term of the PPO loss.
        memory_spec (Optional[dict,Memory]): The spec for the Memory to use for the PPO algorithm.
    """
    super(PPOAgent, self).__init__(name=kwargs.pop("name", "ppo-agent"), **kwargs)
    self.train_time_steps = 0

    # PPO uses a ring buffer.
    self.memory = Memory.from_spec(memory_spec)
    self.record_space = Dict(
        states=self.state_space,
        actions=self.action_space,
        rewards=float,
        terminals=BoolBox(),
        add_batch_rank=False
    )
    self.policy = Policy(network_spec=self.neural_network, action_adapter_spec=None)

    self.merger = DictMerger(output_space=self.record_space)
    splitter_input_space = copy.deepcopy(self.record_space)
    self.splitter = ContainerSplitter(input_space=splitter_input_space)
    self.loss_function = PPOLossFunction(clip_ratio=clip_ratio, discount=self.discount)

    self.define_graph_api()
    if self.auto_build:
        self._build_graph()
        self.graph_built = True
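# Minimal usage sketch (not part of the original source): constructing a PPOAgent. The import
# paths follow the usual RLGraph layout, and the spec values below (layer spec, memory type
# string, capacity) are hypothetical placeholders, not taken from this file.
from rlgraph.agents import PPOAgent
from rlgraph.spaces import FloatBox, IntBox

agent = PPOAgent(
    clip_ratio=0.2,  # PPO surrogate-objective clipping parameter
    memory_spec={"type": "ring-buffer", "capacity": 1000},  # PPO uses a ring buffer (see above)
    state_space=FloatBox(shape=(4,)),
    action_space=IntBox(2),
    network_spec=[{"type": "dense", "units": 32}]
)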
def test_policy_for_discrete_action_space(self):
    # state_space (NN is a simple single fc-layer relu network (2 units), random biases, random weights).
    state_space = FloatBox(shape=(4,), add_batch_rank=True)

    # action_space (5 possible actions).
    action_space = IntBox(5, add_batch_rank=True)

    policy = Policy(network_spec=config_from_path("configs/test_simple_nn.json"), action_space=action_space)
    test = ComponentTest(
        component=policy,
        input_spaces=dict(nn_input=state_space),
        action_space=action_space
    )
    policy_params = test.read_variable_values(policy.variable_registry)

    # Some NN inputs (4 input nodes, batch size=2).
    states = np.array([[-0.08, 0.4, -0.05, -0.55], [13.0, -14.0, 10.0, -16.0]])

    # Raw NN-output.
    expected_nn_output = np.matmul(states, policy_params["policy/test-network/hidden-layer/dense/kernel"])
    test.test(("get_nn_output", states), expected_outputs=dict(output=expected_nn_output), decimals=6)

    # Raw action-layer output; expected shape=(2,5): 2=batch, 5=action categories.
    expected_action_layer_output = np.matmul(
        expected_nn_output, policy_params["policy/action-adapter/action-layer/dense/kernel"]
    )
    expected_action_layer_output = np.reshape(expected_action_layer_output, newshape=(2, 5))
    test.test(("get_action_layer_output", states), expected_outputs=dict(output=expected_action_layer_output),
              decimals=5)

    expected_actions = np.argmax(expected_action_layer_output, axis=-1)
    test.test(("get_action", states), expected_outputs=dict(action=expected_actions, last_internal_states=None))

    # Logits, parameters (probs) and log-probs (note: log-probs can be numerically unstable for very small probs).
    expected_probabilities_output = softmax(expected_action_layer_output, axis=-1)
    test.test(("get_logits_parameters_log_probs", states, [0, 1, 2]), expected_outputs=dict(
        logits=expected_action_layer_output,
        parameters=expected_probabilities_output,
        log_probs=np.log(expected_probabilities_output)
    ), decimals=5)

    print("Probs: {}".format(expected_probabilities_output))

    # Deterministic sample.
    out = test.test(("get_deterministic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2,))

    # Stochastic sample.
    out = test.test(("get_stochastic_action", states), expected_outputs=None)
    self.assertTrue(out["action"].dtype == np.int32)
    self.assertTrue(out["action"].shape == (2,))

    # Distribution's entropy.
    out = test.test(("get_entropy", states), expected_outputs=None)
    self.assertTrue(out["entropy"].dtype == np.float32)
    self.assertTrue(out["entropy"].shape == (2,))
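# Aside (illustration, not part of the test): the expected "parameters" and "log_probs" above are
# just a softmax / log-softmax over the action-layer logits. Below is a self-contained numpy sketch
# of that computation; the `softmax` helper used in the test is assumed to behave like this reference.
import numpy as np

def softmax_reference(logits, axis=-1):
    # Shift by the max for numerical stability before exponentiating.
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=axis, keepdims=True)

logits = np.array([[0.5, -1.0, 2.0, 0.0, 0.1]])  # one batch row, 5 action categories
probs = softmax_reference(logits)                # the "parameters" checked in the test
log_probs = np.log(probs)                        # the "log_probs" checked in the test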
def test_sac_agent_component_on_fake_env(self):
    config = config_from_path("configs/sac_component_for_fake_env_test.json")

    # Arbitrary state space, state should not be used in this example.
    state_space = FloatBox(shape=(2,))
    continuous_action_space = FloatBox(low=-1.0, high=1.0)
    terminal_space = BoolBox(add_batch_rank=True)

    policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
    policy.add_components(Synchronizable(), expose_apis="sync")
    q_function = ValueFunction.from_spec(config["value_function"])

    agent_component = SACAgentComponent(
        agent=None,
        policy=policy,
        q_function=q_function,
        preprocessor=PreprocessorStack.from_spec([]),
        memory=ReplayMemory.from_spec(config["memory"]),
        discount=config["discount"],
        initial_alpha=config["initial_alpha"],
        target_entropy=None,
        optimizer=AdamOptimizer.from_spec(config["optimizer"]),
        vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
        alpha_optimizer=None,
        q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
        num_q_functions=2
    )

    test = ComponentTest(
        component=agent_component,
        input_spaces=dict(
            states=state_space.with_batch_rank(),
            preprocessed_states=state_space.with_batch_rank(),
            actions=continuous_action_space.with_batch_rank(),
            rewards=FloatBox(add_batch_rank=True),
            next_states=state_space.with_batch_rank(),
            terminals=terminal_space,
            batch_size=int,
            preprocessed_s_prime=state_space.with_batch_rank(),
            importance_weights=FloatBox(add_batch_rank=True),
            preprocessed_next_states=state_space.with_batch_rank(),
            deterministic=bool,
            weights="variables:{}".format(policy.scope),
            # TODO: how to provide the space for multiple component variables?
            # q_weights=Dict(
            #     q_0="variables:{}".format(q_function.scope),
            #     q_1="variables:{}".format(agent_component._q_functions[1].scope),
            # )
        ),
        action_space=continuous_action_space,
        build_kwargs=dict(
            optimizer=agent_component._optimizer,
            build_options=dict(
                vf_optimizer=agent_component.vf_optimizer,
            ),
        )
    )

    policy_loss = []
    vf_loss = []

    # This test simulates an env that always requires actions to be close to the max-pdf
    # value of a loc=0.5, scale=0.2 normal, regardless of any state inputs.
    # The component should learn to produce actions like that (close to 0.5).
    true_mean = 0.5
    target_dist = stats.norm(loc=true_mean, scale=0.2)

    batch_size = 100
    for _ in range(5000):
        action_sample = continuous_action_space.sample(batch_size)
        rewards = target_dist.pdf(action_sample)
        result = test.test(("update_from_external_batch", [
            state_space.sample(batch_size),
            action_sample,
            rewards,
            [True] * batch_size,
            state_space.sample(batch_size),
            [1.0] * batch_size  # importance
        ]))
        policy_loss.append(result["actor_loss"])
        vf_loss.append(result["critic_loss"])

    self.assertTrue(np.mean(policy_loss[:100]) > np.mean(policy_loss[-100:]))
    self.assertTrue(np.mean(vf_loss[:100]) > np.mean(vf_loss[-100:]))

    action_sample = np.linspace(-1, 1, batch_size)
    q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
    for q_val in q_values:
        q_val = q_val.flatten()
        np.testing.assert_allclose(q_val, target_dist.pdf(action_sample), atol=0.2)

    action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
    action_sample = action_sample.flatten()
    np.testing.assert_allclose(np.mean(action_sample), true_mean, atol=0.1)
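# Recap of the fake environment above (plain numpy/scipy, no RLGraph involved): the "env" rewards
# an action with the density of a Normal(loc=0.5, scale=0.2), independent of the state, so the
# reward-maximizing action the agent should converge to is 0.5.
import numpy as np
from scipy import stats

target_dist = stats.norm(loc=0.5, scale=0.2)
actions = np.linspace(-1.0, 1.0, 5)
rewards = target_dist.pdf(actions)  # largest for the action closest to 0.5
print(actions)
print(rewards)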
def __init__(self, state_space, action_space, discount=0.98, preprocessing_spec=None, network_spec=None,
             internal_states_space=None, policy_spec=None, value_function_spec=None, exploration_spec=None,
             execution_spec=None, optimizer_spec=None, value_function_optimizer_spec=None, observe_spec=None,
             update_spec=None, summary_spec=None, saver_spec=None, auto_build=True, name="agent"):
    """
    Args:
        state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
        action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
        preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
            preprocessing steps or a PreprocessorStack object itself.
        discount (float): The discount factor (gamma).
        network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the NeuralNetwork
            object itself.
        internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a direct
            Space object for the Space(s) of the internal (RNN) states.
        policy_spec (Optional[dict]): An optional dict for further kwargs passing into the Policy c'tor.
        value_function_spec (list): Neural network specification for the baseline (value function).
        exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
        execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
        optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
        value_function_optimizer_spec (dict): Optimizer config for the value function optimizer. If None, the
            optimizer spec for the policy is used (same learning rate and optimizer type).
        observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
        update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
        summary_spec (Optional[dict]): Spec-dict to specify summary settings.
        saver_spec (Optional[dict]): Spec-dict to specify saver settings.
        auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
            graph builder. If False, users must separately call `agent.build()`. Useful for debugging or
            analyzing components before building.
        name (str): Some name for this Agent object.
    """
    super(Agent, self).__init__()

    self.name = name
    self.auto_build = auto_build
    self.graph_built = False
    self.logger = logging.getLogger(__name__)

    self.state_space = Space.from_spec(state_space).with_batch_rank(False)
    self.flat_state_space = self.state_space.flatten() if isinstance(self.state_space, ContainerSpace) else None
    self.logger.info("Parsed state space definition: {}".format(self.state_space))
    self.action_space = Space.from_spec(action_space).with_batch_rank(False)
    self.flat_action_space = self.action_space.flatten() if isinstance(self.action_space, ContainerSpace) else None
    self.logger.info("Parsed action space definition: {}".format(self.action_space))

    self.discount = discount
    self.build_options = {}

    # The agent's root-Component.
    self.root_component = Component(name=self.name, nesting_level=0)

    # Define the input-Spaces:
    # Tag the input-Space to `self.set_weights` as equal to whatever the variables-Space will be for
    # the Agent's policy Component.
    self.input_spaces = dict(
        states=self.state_space.with_batch_rank()
    )

    # Construct the Preprocessor.
    self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
    self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(self.state_space)
    self.preprocessing_required = preprocessing_spec is not None and len(preprocessing_spec) > 0
    if self.preprocessing_required:
        self.logger.info("Preprocessing required.")
        self.logger.info("Parsed preprocessed-state space definition: {}".format(self.preprocessed_state_space))
    else:
        self.logger.info("No preprocessing required.")

    # Construct the Policy network.
    policy_spec = policy_spec or dict()
    if "network_spec" not in policy_spec:
        policy_spec["network_spec"] = network_spec
    if "action_space" not in policy_spec:
        policy_spec["action_space"] = self.action_space
    self.policy_spec = policy_spec
    # The behavioral policy of the algorithm. Also the one that gets updated.
    self.policy = Policy.from_spec(self.policy_spec)
    # Done by default.
    self.policy.add_components(Synchronizable(), expose_apis="sync")

    # Create non-shared baseline network.
    self.value_function = None
    if value_function_spec is not None:
        self.value_function = ValueFunction(network_spec=value_function_spec)
        self.value_function.add_components(Synchronizable(), expose_apis="sync")
        self.vars_merger = ContainerMerger("policy", "vf", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", "vf", scope="variable-container-splitter")
    else:
        self.vars_merger = ContainerMerger("policy", scope="variable-dict-merger")
        self.vars_splitter = ContainerSplitter("policy", scope="variable-container-splitter")

    self.internal_states_space = Space.from_spec(internal_states_space)

    # An object implementing the loss function interface is only strictly needed
    # if automatic device strategies like multi-gpu are enabled. This is because
    # the device strategy needs to know the name of the loss function to infer the appropriate
    # operations.
    self.loss_function = None

    self.exploration = Exploration.from_spec(exploration_spec)
    self.execution_spec = parse_execution_spec(execution_spec)

    # Python-side experience buffer for better performance (may be disabled).
    self.default_env = "env_0"

    def factory_(i):
        if i < 2:
            return []
        return tuple([[] for _ in range(i)])

    self.states_buffer = defaultdict(list)  # partial(fact_, len(self.flat_state_space))
    self.actions_buffer = defaultdict(partial(factory_, len(self.flat_action_space or [])))
    self.internals_buffer = defaultdict(list)
    self.rewards_buffer = defaultdict(list)
    self.next_states_buffer = defaultdict(list)  # partial(fact_, len(self.flat_state_space))
    self.terminals_buffer = defaultdict(list)

    self.observe_spec = parse_observe_spec(observe_spec)

    # Global time step counter.
    self.timesteps = 0

    # Create the Agent's optimizer based on optimizer_spec and execution strategy.
    self.optimizer = None
    if optimizer_spec is not None:
        # Save the spec in case the agent needs to create more optimizers, e.g. for the baseline.
        self.optimizer_spec = optimizer_spec
        self.optimizer = Optimizer.from_spec(optimizer_spec)

    self.value_function_optimizer = None
    if self.value_function is not None:
        if value_function_optimizer_spec is None:
            vf_optimizer_spec = self.optimizer_spec
        else:
            vf_optimizer_spec = value_function_optimizer_spec
        vf_optimizer_spec["scope"] = "value-function-optimizer"
        self.value_function_optimizer = Optimizer.from_spec(vf_optimizer_spec)

    # Update-spec dict tells the Agent how to update (e.g. memory batch size).
    self.update_spec = parse_update_spec(update_spec)

    # Create our GraphBuilder and -Executor.
    self.graph_builder = GraphBuilder(action_space=self.action_space, summary_spec=summary_spec)
    self.graph_executor = GraphExecutor.from_spec(
        get_backend(),
        graph_builder=self.graph_builder,
        execution_spec=self.execution_spec,
        saver_spec=saver_spec
    )  # type: GraphExecutor
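# Example spec fragment (hedged illustration; the key names follow the common RLGraph optimizer-spec
# style and are not taken from this file): when value_function_optimizer_spec is None, the __init__
# above re-uses the policy's optimizer spec for the baseline and only re-scopes it to
# "value-function-optimizer".
example_optimizer_spec = {"type": "adam", "learning_rate": 0.001}
example_value_function_optimizer_spec = None  # -> falls back to example_optimizer_spec (re-scoped)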
def test_sac_agent_component_functionality(self):
    config = config_from_path("configs/sac_component_for_fake_env_test.json")

    # Arbitrary state space, state should not be used in this example.
    state_space = FloatBox(shape=(8,))
    continuous_action_space = FloatBox(shape=(1,), low=-2.0, high=2.0)
    terminal_space = BoolBox(add_batch_rank=True)
    rewards_space = FloatBox(add_batch_rank=True)

    policy = Policy.from_spec(config["policy"], action_space=continuous_action_space)
    policy.add_components(Synchronizable(), expose_apis="sync")
    q_function = ValueFunction.from_spec(config["value_function"])

    agent_component = SACAgentComponent(
        agent=None,
        policy=policy,
        q_function=q_function,
        preprocessor=PreprocessorStack.from_spec([]),
        memory=ReplayMemory.from_spec(config["memory"]),
        discount=config["discount"],
        initial_alpha=config["initial_alpha"],
        target_entropy=None,
        optimizer=AdamOptimizer.from_spec(config["optimizer"]),
        vf_optimizer=AdamOptimizer.from_spec(config["value_function_optimizer"], scope="vf-optimizer"),
        alpha_optimizer=None,
        q_sync_spec=SyncSpecification(sync_interval=10, sync_tau=1.0),
        num_q_functions=2
    )

    test = ComponentTest(
        component=agent_component,
        input_spaces=dict(
            states=state_space.with_batch_rank(),
            preprocessed_states=state_space.with_batch_rank(),
            env_actions=continuous_action_space.with_batch_rank(),
            actions=continuous_action_space.with_batch_rank(),
            rewards=rewards_space,
            next_states=state_space.with_batch_rank(),
            terminals=terminal_space,
            batch_size=int,
            preprocessed_s_prime=state_space.with_batch_rank(),
            importance_weights=FloatBox(add_batch_rank=True),
            preprocessed_next_states=state_space.with_batch_rank(),
            deterministic=bool,
            weights="variables:{}".format(policy.scope),
            # TODO: how to provide the space for multiple component variables?
            # q_weights=Dict(
            #     q_0="variables:{}".format(q_function.scope),
            #     q_1="variables:{}".format(agent_component._q_functions[1].scope),
            # )
        ),
        action_space=continuous_action_space,
        build_kwargs=dict(
            optimizer=agent_component._optimizer,
            build_options=dict(
                vf_optimizer=agent_component.vf_optimizer,
            ),
        )
    )

    batch_size = 10
    action_sample = continuous_action_space.with_batch_rank().sample(batch_size)
    rewards = rewards_space.sample(batch_size)

    # Check whether an update runs ok.
    result = test.test(("update_from_external_batch", [
        state_space.sample(batch_size),
        action_sample,
        rewards,
        [True] * batch_size,
        state_space.sample(batch_size),
        [1.0] * batch_size  # importance
    ]))
    self.assertTrue(result["actor_loss"].dtype == np.float32)
    self.assertTrue(result["critic_loss"].dtype == np.float32)

    action_sample = np.linspace(-1, 1, batch_size).reshape((batch_size, 1))
    q_values = test.test(("get_q_values", [state_space.sample(batch_size), action_sample]))
    for q_val in q_values:
        self.assertTrue(q_val.dtype == np.float32)
        self.assertTrue(q_val.shape == (batch_size, 1))

    action_sample, _ = test.test(("action_from_preprocessed_state", [state_space.sample(batch_size), False]))
    self.assertTrue(action_sample.dtype == np.float32)
    self.assertTrue(action_sample.shape == (batch_size, 1))
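# Aside (generic illustration, not RLGraph's actual Synchronizable implementation): the
# SyncSpecification(sync_interval=10, sync_tau=1.0) used in both SAC tests is read here as
# "every 10 updates, soft-update the target Q-networks with tau=1.0", where tau=1.0 reduces the
# usual Polyak average to a hard copy. A plain-numpy sketch of that update rule:
import numpy as np

def soft_sync(target_weights, source_weights, tau):
    # target <- tau * source + (1 - tau) * target; tau=1.0 is a full (hard) copy.
    return [tau * s + (1.0 - tau) * t for t, s in zip(target_weights, source_weights)]

target = [np.zeros((2, 2))]
source = [np.ones((2, 2))]
print(soft_sync(target, source, tau=1.0))    # equals source: hard sync
print(soft_sync(target, source, tau=0.005))  # slowly tracking target update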
class Agent(Specifiable):
    """
    Generic agent that defines RLGraph-API operations and parses and sanitizes configuration specs.
    """
    def __init__(
        self,
        state_space,
        action_space,
        discount=0.98,
        preprocessing_spec=None,
        network_spec=None,
        internal_states_space=None,
        action_adapter_spec=None,
        exploration_spec=None,
        execution_spec=None,
        optimizer_spec=None,
        observe_spec=None,
        update_spec=None,
        summary_spec=None,
        saver_spec=None,
        auto_build=True,
        name="agent"
    ):
        """
        Args:
            state_space (Union[dict,Space]): Spec dict for the state Space or a direct Space object.
            action_space (Union[dict,Space]): Spec dict for the action Space or a direct Space object.
            preprocessing_spec (Optional[list,PreprocessorStack]): The spec list for the different necessary states
                preprocessing steps or a PreprocessorStack object itself.
            discount (float): The discount factor (gamma).
            network_spec (Optional[list,NeuralNetwork]): Spec list for a NeuralNetwork Component or the
                NeuralNetwork object itself.
            internal_states_space (Optional[Union[dict,Space]]): Spec dict for the internal-states Space or a
                direct Space object for the Space(s) of the internal (RNN) states.
            action_adapter_spec (Optional[dict,ActionAdapter]): The spec-dict for the ActionAdapter Component or
                the ActionAdapter object itself.
            exploration_spec (Optional[dict]): The spec-dict to create the Exploration Component.
            execution_spec (Optional[dict,Execution]): The spec-dict specifying execution settings.
            optimizer_spec (Optional[dict,Optimizer]): The spec-dict to create the Optimizer for this Agent.
            observe_spec (Optional[dict]): Spec-dict to specify `Agent.observe()` settings.
            update_spec (Optional[dict]): Spec-dict to specify `Agent.update()` settings.
            summary_spec (Optional[dict]): Spec-dict to specify summary settings.
            saver_spec (Optional[dict]): Spec-dict to specify saver settings.
            auto_build (Optional[bool]): If True (default), immediately builds the graph using the agent's
                graph builder. If False, users must separately call `agent.build()`. Useful for debugging or
                analyzing components before building.
            name (str): Some name for this Agent object.
        """
        super(Agent, self).__init__()

        self.name = name
        self.auto_build = auto_build
        self.graph_built = False
        self.logger = logging.getLogger(__name__)

        self.state_space = Space.from_spec(state_space).with_batch_rank(False)
        self.logger.info("Parsed state space definition: {}".format(self.state_space))
        self.action_space = Space.from_spec(action_space).with_batch_rank(False)
        self.logger.info("Parsed action space definition: {}".format(self.action_space))

        self.discount = discount

        # The agent's root-Component.
        self.root_component = Component(name=self.name)

        # Define the input-Spaces:
        # Tag the input-Space to `self.set_policy_weights` as equal to whatever the variables-Space will be for
        # the Agent's policy Component.
        self.input_spaces = dict(
            states=self.state_space.with_batch_rank()
        )

        # Construct the Preprocessor.
        self.preprocessor = PreprocessorStack.from_spec(preprocessing_spec)
        self.preprocessed_state_space = self.preprocessor.get_preprocessed_space(self.state_space)
        self.preprocessing_required = preprocessing_spec is not None and len(preprocessing_spec) > 0
        if self.preprocessing_required:
            self.logger.info("Preprocessing required.")
            self.logger.info("Parsed preprocessed-state space definition: {}".format(self.preprocessed_state_space))
        else:
            self.logger.info("No preprocessing required.")

        # Construct the Policy network.
        self.neural_network = None
        if network_spec is not None:
            self.neural_network = NeuralNetwork.from_spec(network_spec)
        self.action_adapter_spec = action_adapter_spec
        self.internal_states_space = internal_states_space

        # An object implementing the loss function interface is only strictly needed
        # if automatic device strategies like multi-gpu are enabled. This is because
        # the device strategy needs to know the name of the loss function to infer the appropriate
        # operations.
        self.loss_function = None

        # The action adapter mapping raw NN output to (shaped) actions.
        action_adapter_dict = dict(action_space=self.action_space)
        if self.action_adapter_spec is None:
            self.action_adapter_spec = action_adapter_dict
        else:
            self.action_adapter_spec.update(action_adapter_dict)

        # The behavioral policy of the algorithm. Also the one that gets updated.
        self.policy = Policy(
            network_spec=self.neural_network,
            action_adapter_spec=self.action_adapter_spec
        )

        self.exploration = Exploration.from_spec(exploration_spec)
        self.execution_spec = parse_execution_spec(execution_spec)

        # Python-side experience buffer for better performance (may be disabled).
        self.default_env = "env_0"
        self.states_buffer = defaultdict(list)
        self.actions_buffer = defaultdict(list)
        self.internals_buffer = defaultdict(list)
        self.rewards_buffer = defaultdict(list)
        self.next_states_buffer = defaultdict(list)
        self.terminals_buffer = defaultdict(list)

        self.observe_spec = parse_observe_spec(observe_spec)
        if self.observe_spec["buffer_enabled"]:
            self.reset_env_buffers()

        # Global time step counter.
        self.timesteps = 0

        # Create the Agent's optimizer based on optimizer_spec and execution strategy.
        self.optimizer = None
        if optimizer_spec is not None:
            self.optimizer = Optimizer.from_spec(optimizer_spec)
            # get_optimizer_from_device_strategy(
            #     optimizer_spec, self.execution_spec.get("device_strategy", 'default'))

        # Update-spec dict tells the Agent how to update (e.g. memory batch size).
        self.update_spec = parse_update_spec(update_spec)

        # Create our GraphBuilder and -Executor.
        self.graph_builder = GraphBuilder(action_space=self.action_space, summary_spec=summary_spec)
        self.graph_executor = GraphExecutor.from_spec(
            get_backend(),
            graph_builder=self.graph_builder,
            execution_spec=self.execution_spec,
            saver_spec=saver_spec
        )  # type: GraphExecutor

    def reset_env_buffers(self, env_id=None):
        """
        Resets an environment buffer for buffered `observe` calls.

        Args:
            env_id (Optional[str]): Environment id to reset. Defaults to a default environment if None provided.
        """
        if env_id is None:
            env_id = self.default_env
        self.states_buffer[env_id] = []
        self.actions_buffer[env_id] = []
        self.internals_buffer[env_id] = []
        self.rewards_buffer[env_id] = []
        self.next_states_buffer[env_id] = []
        self.terminals_buffer[env_id] = []

    # TODO: optimizer scope missing?
    def define_graph_api(self, policy_scope, pre_processor_scope, *params):
        """
        Can be used to specify and then `self.define_api_method` the Agent's CoreComponent's API methods.
        Each agent implements this to build its algorithm logic.

        Args:
            policy_scope (str): The global scope of the Policy within the Agent.
            pre_processor_scope (str): The global scope of the PreprocessorStack within the Agent.
            params (any): Params to be used freely by child Agent implementations.
        """
        # Done by default.
        # TODO: Move this to the ctor as this belongs to the init phase and doesn't really have to do
        # with API-methods.
        self.policy.add_components(Synchronizable(), expose_apis="sync")

        # Add api methods for syncing.
        @rlgraph_api(component=self.root_component)
        def get_policy_weights(self):
            policy = self.get_sub_component_by_name(policy_scope)
            return policy._variables()

        @rlgraph_api(component=self.root_component, must_be_complete=False)
        def set_policy_weights(self, weights):
            policy = self.get_sub_component_by_name(policy_scope)
            return policy.sync(weights)

        # To pre-process external data if needed.
        @rlgraph_api(component=self.root_component)
        def preprocess_states(self, states):
            preprocessor_stack = self.get_sub_component_by_name(pre_processor_scope)
            preprocessed_states = preprocessor_stack.preprocess(states)
            return preprocessed_states

    def _build_graph(self, root_components, input_spaces, **kwargs):
        """
        Builds the internal graph from the RLGraph meta-graph via the graph executor.
        """
        return self.graph_executor.build(root_components, input_spaces, **kwargs)

    def build(self, build_options=None):
        """
        Builds this agent. This method can only be called if the agent parameter "auto_build"
        was set to False.

        Args:
            build_options (Optional[dict]): Optional build options, see build doc.
        """
        assert not self.graph_built, \
            "ERROR: Attempting to build agent which has already been built. Ensure auto_build parameter is set " \
            "to False (was {}), and method has not been called twice.".format(self.auto_build)

        # TODO: Let the agent have a list of root-components.
        return self._build_graph(
            [self.root_component], self.input_spaces,
            optimizer=self.optimizer, build_options=build_options, batch_size=self.update_spec["batch_size"]
        )

    def preprocess_states(self, states):
        """
        Applies the agent's preprocessor to one or more states, e.g. to preprocess external data
        before inserting it into memory without acting. Returns the identity if no preprocessor is defined.

        Args:
            states (np.array): State(s) to preprocess.

        Returns:
            np.array: Preprocessed states.
        """
        if self.preprocessing_required:
            return self.call_api_method("preprocess_states", states)
        else:
            # Return identity.
            return states

    def get_action(self, states, internals=None, use_exploration=True, apply_preprocessing=True, extra_returns=None):
        """
        Returns action(s) for the passed state(s). If `states` is a single state, returns a single action,
        otherwise, returns a batch of actions, where batch-size = number of states passed in.

        Args:
            states (Union[dict,np.ndarray]): States dict/tuple or numpy array.
            internals (Union[dict,np.ndarray]): Internal states dict/tuple or numpy array.
            use_exploration (bool): If False, no exploration or sampling may be applied when retrieving an action.
            apply_preprocessing (bool): If True, apply any configured state preprocessors to the states before
                computing the action. Set to False if all pre-processing is handled externally, both for acting
                and updating.
            extra_returns (Optional[Set[str]]): Optional set of Agent-specific strings for additional return
                values (besides the actions). All Agents must support "preprocessed_states".

        Returns:
            any: Action(s) as dict/tuple/np.ndarray (depending on `self.action_space`).
                Optional: The preprocessed states as a 2nd return value.
        """
        raise NotImplementedError

    def observe(self, preprocessed_states, actions, internals, rewards, next_states, terminals, env_id=None):
        """
        Observes an experience tuple or a batch of experience tuples. Note: If configured, first uses buffers and
        then internally calls `_observe_graph()` to actually run the computation graph. If buffering is disabled,
        this just routes the call to the respective `_observe_graph()` method of the child Agent.

        Args:
            preprocessed_states (Union[dict, ndarray]): Preprocessed states dict or array.
            actions (Union[dict, ndarray]): Actions dict or array containing actions performed for the
                given state(s).
            internals (Union[list]): Internal state(s) returned by agent for the given states. Must be an empty
                list if no internals are available.
            rewards (float): Scalar reward(s) observed.
            terminals (bool): Boolean indicating terminal.
            next_states (Union[dict, ndarray]): Preprocessed next states dict or array.
            env_id (Optional[str]): Environment id to observe for. When using vectorized execution and buffering,
                using environment ids is necessary to ensure correct trajectories are inserted.
                See `SingleThreadedWorker` for example usage.
        """
        batched_states = self.preprocessed_state_space.force_batch(preprocessed_states)

        # Check for illegal internals.
        if internals is None:
            internals = []

        # Add batch rank?
        if batched_states.ndim == np.asarray(preprocessed_states).ndim + 1:
            preprocessed_states = np.asarray([preprocessed_states])
            actions = np.asarray([actions])
            internals = np.asarray([internals])
            rewards = np.asarray([rewards])
            terminals = np.asarray([terminals])
            # Also batch next_states (or already done?).
            if next_states.ndim == preprocessed_states.ndim - 1:
                next_states = np.asarray([next_states])

        if self.observe_spec["buffer_enabled"] is True:
            if env_id is None:
                env_id = self.default_env
            self.states_buffer[env_id].extend(preprocessed_states)
            self.actions_buffer[env_id].extend(actions)
            self.internals_buffer[env_id].extend(internals)
            self.rewards_buffer[env_id].extend(rewards)
            self.next_states_buffer[env_id].extend(next_states)
            self.terminals_buffer[env_id].extend(terminals)

            buffer_is_full = len(self.rewards_buffer[env_id]) >= self.observe_spec["buffer_size"]

            # If the buffer (per environment) is full OR the episode was aborted:
            # Change terminal of last record artificially to True, insert and flush the buffer.
            if buffer_is_full or self.terminals_buffer[env_id][-1]:
                self.terminals_buffer[env_id][-1] = True

                # TODO: Apply n-step post-processing if necessary.
                # if self.observe_spec["n_step"] > 1:
                #     pass

                self._observe_graph(
                    preprocessed_states=np.asarray(self.states_buffer[env_id]),
                    actions=np.asarray(self.actions_buffer[env_id]),
                    internals=np.asarray(self.internals_buffer[env_id]),
                    rewards=np.asarray(self.rewards_buffer[env_id]),
                    next_states=np.asarray(self.next_states_buffer[env_id]),
                    terminals=np.asarray(self.terminals_buffer[env_id])
                )
                self.reset_env_buffers(env_id)
        else:
            self._observe_graph(preprocessed_states, actions, internals, rewards, next_states, terminals)

    def _observe_graph(self, preprocessed_states, actions, internals, rewards, next_states, terminals):
        """
        This method defines the actual call to the computational graph by executing the respective graph op
        via the graph executor. Since this may use varied underlying components and api_methods, every agent
        defines which ops it may want to call. The buffered observer calls this method to move data into
        the graph.

        Args:
            preprocessed_states (Union[dict,ndarray]): Preprocessed states dict or array.
            actions (Union[dict,ndarray]): Actions dict or array containing actions performed for the
                given state(s).
            internals (Union[list]): Internal state(s) returned by agent for the given states. Must be an empty
                list if no internals are available.
            rewards (Union[ndarray,list,float]): Scalar reward(s) observed.
            next_states (Union[dict, ndarray]): Preprocessed next states dict or array.
            terminals (Union[list,bool]): Boolean indicating terminal.
""" raise NotImplementedError def update(self, batch=None): """ Performs an update on the computation graph either via externally experience or by sampling from an internal memory. Args: batch (Optional[dict]): Optional external data batch to use for update. If None, the agent should be configured to sample internally. Returns: float: The loss value calculated in this update. """ raise NotImplementedError def import_observations(self, observations): """ Bulk imports observations, potentially using device pre-fetching. Can be optionally implemented by agents requiring pre-training. Args: observations (dict): Dict or list of observation data. """ pass def reset(self): """ Must be implemented to define some reset behavior (before starting a new episode). This could include resetting the preprocessor and other Components. """ pass # optional def terminate(self): """ Terminates the Agent, so it will no longer be usable. Things that need to be cleaned up should be placed into this function, e.g. closing sessions and other open connections. """ self.graph_executor.terminate() def call_api_method(self, op, inputs=None, return_ops=None): """ Utility method to call any desired api method on the graph, identified via output socket. Delegate this call to the RLGraph graph executor. Args: op (str): Name of the api method. inputs (Optional[dict,np.array]): Dict specifying the provided api_methods for (key=input space name, values=the values that should go into this space (e.g. numpy arrays)). Returns: any: Result of the op call. """ return self.graph_executor.execute((op, inputs, return_ops)) def export_graph(self, filename=None): """ Any algorithm defined as a full-graph, as opposed to mixed (mixed Python and graph control flow) should be able to export its graph for deployment. Args: filename (str): Export path. Depending on the backend, different filetypes may be required. """ self.graph_executor.export_graph_definition(filename) def store_model(self, path=None, add_timestep=True): """ Store model using the backend's check-pointing mechanism. Args: path (str): Path to model directory. add_timestep (bool): Indiciates if current training step should be appended to exported model. If false, may override previous checkpoints. """ self.graph_executor.store_model(path=path, add_timestep=add_timestep) def load_model(self, path=None): """ Load model from serialized format. Args: path (str): Path to checkpoint directory. """ self.graph_executor.load_model(path=path) def get_policy_weights(self): """ Returns all weights relevant for the agent's policy for syncing purposes. Returns: any: Weights and optionally weight meta data for this model. """ return dict(self.graph_executor.execute("get_policy_weights")) def set_policy_weights(self, weights): """ Sets policy weights of this agent, e.g. for external syncing purporses. Args: weights (any): Weights and optionally meta data to update depending on the backend. Raises: ValueError if weights do not match graph weights in shapes and types. """ return self.graph_executor.execute(("set_policy_weights", weights))