def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""

  # Build environment, model, network.
  environment = self._environment_factory()
  network = self._network_factory(self._env_spec.actions)
  model = self._model_factory(self._env_spec)

  # Create variable client for communicating with the learner.
  tf2_utils.create_variables(network, [self._env_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': network.trainable_variables},
      update_period=self._variable_update_period)

  # Create the agent.
  actor = acting.MCTSActor(
      environment_spec=self._env_spec,
      model=model,
      network=network,
      discount=self._discount,
      variable_client=variable_client,
      num_simulations=self._num_simulations,
  )

  # Create the run loop and return it.
  logger = loggers.make_default_logger('evaluator')
  return acme.EnvironmentLoop(
      environment, actor, counter=counter, logger=logger)

def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  environment = self._environment_factory(True)
  network = self._network_factory(self._environment_spec.actions)
  tf2_utils.create_variables(network, [self._obs_spec])

  # Act greedily with respect to the Q-values at evaluation time.
  policy_network = snt.DeepRNN([
      network,
      lambda qs: tf.cast(tf.argmax(qs, axis=-1), tf.int32),
  ])

  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': policy_network.variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  actor = actors.RecurrentActor(
      policy_network=policy_network, variable_client=variable_client)

  # Create the run loop and return it.
  logger = loggers.make_default_logger(
      'evaluator', save_data=True, steps_key='evaluator_steps')
  counter = counting.Counter(counter, 'evaluator')
  return acme.EnvironmentLoop(environment, actor, counter, logger)

def test_update(self):
  # Create two instances of the same model.
  actor_model = snt.nets.MLP([50, 30])
  learner_model = snt.nets.MLP([50, 30])

  # Create variables first.
  input_spec = tf.TensorSpec(shape=(28,), dtype=tf.float32)
  tf2_utils.create_variables(actor_model, [input_spec])
  tf2_utils.create_variables(learner_model, [input_spec])

  # Register them as client and source variables, respectively.
  actor_variables = actor_model.variables
  np_learner_variables = [
      tf2_utils.to_numpy(v) for v in learner_model.variables
  ]
  variable_source = fakes.VariableSource(np_learner_variables)
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, {'policy': actor_variables})

  # Now, given some random batch of test input:
  x = tf.random.normal(shape=(8, 28))

  # Before copying variables, the models have different outputs.
  actor_output = actor_model(x).numpy()
  learner_output = learner_model(x).numpy()
  self.assertFalse(np.allclose(actor_output, learner_output))

  # Update the variable client.
  variable_client.update_and_wait()

  # After copying variables (by updating the client), the models are the same.
  actor_output = actor_model(x).numpy()
  learner_output = learner_model(x).numpy()
  self.assertTrue(np.allclose(actor_output, learner_output))

def evaluator(self, variable_source: acme.VariableSource,
              counter: counting.Counter):
  """The evaluation process."""
  environment = self._environment_factory(True)
  network = self._network_factory(self._environment_spec.actions)

  tf2_utils.create_variables(network, [self._environment_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': network.variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  actor = acting.IMPALAActor(
      network=network, variable_client=variable_client)

  # Create the run loop and return it.
  logger = loggers.make_default_logger(
      'evaluator', steps_key='evaluator_steps')
  counter = counting.Counter(counter, 'evaluator')
  return acme.EnvironmentLoop(environment, actor, counter, logger)

def make_actor(
    self,
    policy_network: snt.Module,
    adder: Optional[adders.Adder] = None,
    variable_source: Optional[core.VariableSource] = None,
    deterministic_policy: Optional[bool] = False,
):
  """Create an actor instance."""
  if variable_source:
    # Create the variable client responsible for keeping the actor
    # up-to-date.
    variable_client = variable_utils.VariableClient(
        client=variable_source,
        variables={'policy': policy_network.variables},
        update_period=1000,
    )

    # Make sure not to use a random policy after checkpoint restoration by
    # assigning variables before running the environment loop.
    variable_client.update_and_wait()
  else:
    variable_client = None

  # Create the actor which defines how we take actions.
  return acting.SVG0Actor(
      policy_network=policy_network,
      adder=adder,
      variable_client=variable_client,
      deterministic_policy=deterministic_policy)

def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
) -> acme.EnvironmentLoop:
  """The actor process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and target networks to act with.
  environment = self._environment_factory(False)
  agent_networks = self._network_factory(action_spec, self._num_critic_heads)

  # Make sure observation network is defined.
  observation_network = agent_networks.get('observation', tf.identity)

  # Create a stochastic behavior policy.
  behavior_network = snt.Sequential([
      observation_network,
      agent_networks['policy'],
      networks.StochasticSamplingHead(),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(behavior_network, [observation_spec])
  policy_variables = {'policy': behavior_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, policy_variables, update_period=1000)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      max_in_flight_items=self._max_in_flight_items,
      discount=self._additional_discount)

  # Create the agent.
  actor = actors.FeedForwardActor(
      policy_network=behavior_network,
      adder=adder,
      variable_client=variable_client)

  # Create logger and counter; actors will not spam bigtable.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor',
      save_data=False,
      time_delta=self._log_every,
      steps_key='actor_steps')

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, actor, counter, logger)

def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and target networks to act with.
  environment = self._environment_factory(True)
  agent_networks = self._network_factory(action_spec)

  # Make sure observation network is defined.
  observation_network = agent_networks.get('observation', tf.identity)

  # Create a deterministic evaluation policy by taking the mean of the
  # stochastic policy.
  evaluator_network = snt.Sequential([
      observation_network,
      agent_networks['policy'],
      networks.StochasticMeanHead(),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(evaluator_network, [observation_spec])
  policy_variables = {'policy': evaluator_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source,
      policy_variables,
      update_period=self._variable_update_period)

  # Make sure not to evaluate a random actor by assigning variables before
  # running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  evaluator = actors.FeedForwardActor(
      policy_network=evaluator_network, variable_client=variable_client)

  # Create logger and counter.
  counter = counting.Counter(counter, 'evaluator')
  logger = loggers.make_default_logger(
      'evaluator', time_delta=self._log_every, steps_key='evaluator_steps')
  observers = self._make_observers() if self._make_observers else ()

  # Create the run loop and return it.
  return acme.EnvironmentLoop(
      environment, evaluator, counter, logger, observers=observers)

def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The actor process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and behavior networks.
  environment = self._environment_factory(False)
  agent_networks = self._network_factory(action_spec)

  # Create behavior network by adding some random dithering.
  behavior_network = snt.Sequential([
      agent_networks.get('observation', tf.identity),
      agent_networks.get('policy'),
      networks.ClippedGaussian(self._sigma),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(behavior_network, [observation_spec])
  variables = {'policy': behavior_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, variables, update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay, n_step=self._n_step, discount=self._discount)

  # Create the agent.
  actor = actors.FeedForwardActor(
      behavior_network, adder=adder, variable_client=variable_client)

  # Create logger and counter; actors will not spam bigtable.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor',
      save_data=False,
      time_delta=self._log_every,
      steps_key='actor_steps')

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter, logger)

def make_executor(
    self,
    policy_networks: Dict[str, snt.Module],
    adder: Optional[adders.ParallelAdder] = None,
    variable_source: Optional[core.VariableSource] = None,
) -> core.Executor:
    """Create an executor instance.

    Args:
        policy_networks (Dict[str, snt.Module]): policy networks for each
            agent in the system.
        adder (Optional[adders.ParallelAdder], optional): adder to send data
            to a replay buffer. Defaults to None.
        variable_source (Optional[core.VariableSource], optional): variables
            server. Defaults to None.

    Returns:
        core.Executor: system executor, a collection of agents making up the
            part of the system that generates data by interacting with the
            environment.
    """
    shared_weights = self._config.shared_weights

    variable_client = None
    if variable_source:
        agent_keys = self._agent_types if shared_weights else self._agents

        # Create policy variables.
        variables = {}
        for agent in agent_keys:
            variables[agent] = policy_networks[agent].variables

        # Get new policy variables.
        variable_client = variable_utils.VariableClient(
            client=variable_source,
            variables={"policy": variables},
            update_period=self._config.executor_variable_update_period,
        )

        # Make sure not to use a random policy after checkpoint restoration by
        # assigning variables before running the environment loop.
        variable_client.update_and_wait()

    # Create the actor which defines how we take actions.
    return self._executor_fn(
        policy_networks=policy_networks,
        agent_specs=self._config.environment_spec.get_agent_specs(),
        shared_weights=shared_weights,
        variable_client=variable_client,
        adder=adder,
    )

def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
    epsilon: float,
) -> acme.EnvironmentLoop:
  """The actor process."""
  environment = self._environment_factory(False)
  network = self._network_factory(self._environment_spec.actions)
  tf2_utils.create_variables(network, [self._obs_spec])

  # Epsilon-greedy behavior policy on top of the recurrent Q-network.
  policy_network = snt.DeepRNN([
      network,
      lambda qs: tf.cast(trfl.epsilon_greedy(qs, epsilon).sample(), tf.int32),
  ])

  # Component to add things into replay.
  sequence_length = self._burn_in_length + self._trace_length + 1
  adder = adders.SequenceAdder(
      client=replay,
      period=self._replay_period,
      sequence_length=sequence_length,
      delta_encoded=True,
  )

  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': policy_network.variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  actor = actors.RecurrentActor(
      policy_network=policy_network,
      variable_client=variable_client,
      adder=adder)

  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor', save_data=False, steps_key='actor_steps')

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter, logger)

def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  action_spec = self._environment_spec.actions
  observation_spec = self._environment_spec.observations

  # Create environment and evaluator networks.
  environment = self._environment_factory(True)
  agent_networks = self._network_factory(action_spec)

  # Create evaluator network.
  evaluator_network = snt.Sequential([
      agent_networks.get('observation', tf.identity),
      agent_networks.get('policy'),
  ])

  # Ensure network variables are created.
  tf2_utils.create_variables(evaluator_network, [observation_spec])
  variables = {'policy': evaluator_network.variables}

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, variables, update_period=self._variable_update_period)

  # Make sure not to evaluate a random actor by assigning variables before
  # running the environment loop.
  variable_client.update_and_wait()

  # Create the evaluator; note it will not add experience to replay.
  evaluator = actors.FeedForwardActor(
      evaluator_network, variable_client=variable_client)

  # Create logger and counter.
  counter = counting.Counter(counter, 'evaluator')
  logger = loggers.make_default_logger(
      'evaluator', time_delta=self._log_every, steps_key='evaluator_steps')

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, evaluator, counter, logger)

def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
    epsilon: float,
) -> acme.EnvironmentLoop:
  """The actor process."""
  environment = self._environment_factory(False)
  network = self._network_factory(self._env_spec.actions)

  # Just inline the policy network here.
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, epsilon=epsilon).sample(),
  ])

  tf2_utils.create_variables(policy_network, [self._env_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': policy_network.trainable_variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      discount=self._discount,
  )

  # Create the agent.
  actor = actors.FeedForwardActor(policy_network, adder, variable_client)

  # Create the loop to connect environment and agent.
  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor', save_data=False, steps_key='actor_steps')
  return acme.EnvironmentLoop(environment, actor, counter, logger)

def test_update_and_wait(self):
  # Create a variable source (emulating the learner).
  np_learner_variables = tf2_utils.to_numpy(self._learner_model.variables)
  variable_source = fakes.VariableSource(np_learner_variables)

  # Create a variable client (emulating the actor).
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, {'policy': self._actor_model.variables})

  # Create some random batch of test input:
  x = tf.random.normal(shape=(_BATCH_SIZE, _INPUT_SIZE))

  # Before copying variables, the models have different outputs.
  self.assertNotAllClose(self._actor_model(x), self._learner_model(x))

  # Update the variable client.
  variable_client.update_and_wait()

  # After copying variables (by updating the client), the models are the same.
  self.assertAllClose(self._actor_model(x), self._learner_model(x))

def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
) -> acme.EnvironmentLoop:
  """The actor process."""

  # Build environment, model, network.
  environment = self._environment_factory()
  network = self._network_factory(self._env_spec.actions)
  model = self._model_factory(self._env_spec)

  # Create variable client for communicating with the learner.
  tf2_utils.create_variables(network, [self._env_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'network': network.trainable_variables},
      update_period=self._variable_update_period)

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      discount=self._discount,
  )

  # Create the agent.
  actor = acting.MCTSActor(
      environment_spec=self._env_spec,
      model=model,
      network=network,
      discount=self._discount,
      adder=adder,
      variable_client=variable_client,
      num_simulations=self._num_simulations,
  )

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter)

def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""

  # Create environment and target networks to act with.
  environment = self._environment_factory(True)
  agent_networks = self._network_factory(self._environment_spec)

  # Create a deterministic evaluation policy by taking the mean of the
  # stochastic policy.
  evaluator_network = snt.Sequential([
      agent_networks['observation'],
      agent_networks['policy'],
      networks.StochasticMeanHead(),
  ])

  # Create the variable client responsible for keeping the actor up-to-date.
  variable_client = tf2_variable_utils.VariableClient(
      variable_source,
      variables={'policy': evaluator_network.variables},
      update_period=1000)

  # Make sure not to evaluate a random actor by assigning variables before
  # running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  evaluator = actors.FeedForwardActor(
      policy_network=evaluator_network, variable_client=variable_client)

  # Create logger and counter.
  counter = counting.Counter(counter, 'evaluator')
  logger = loggers.make_default_logger('evaluator', time_delta=self._log_every)

  # Create the run loop and return it.
  return acme.EnvironmentLoop(environment, evaluator, counter, logger)

def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""
  environment = self._environment_factory(True)
  network = self._network_factory(self._env_spec.actions)

  # Just inline the policy network here.
  policy_network = snt.Sequential([
      network,
      lambda q: trfl.epsilon_greedy(q, self._evaluator_epsilon).sample(),
  ])

  tf2_utils.create_variables(policy_network, [self._env_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': policy_network.trainable_variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  actor = actors.FeedForwardActor(
      policy_network, variable_client=variable_client)

  # Create the run loop and return it.
  logger = loggers.make_default_logger(
      'evaluator', steps_key='evaluator_steps')
  counter = counting.Counter(counter, 'evaluator')
  return acme.EnvironmentLoop(
      environment, actor, counter=counter, logger=logger)

def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
) -> acme.EnvironmentLoop:
  """The actor process."""
  environment = self._environment_factory(False)
  network = self._network_factory(self._environment_spec.actions)
  tf2_utils.create_variables(network, [self._environment_spec.observations])

  # Component to add things into the queue.
  adder = adders.SequenceAdder(
      client=replay,
      period=self._sequence_period,
      sequence_length=self._sequence_length)

  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': network.variables},
      update_period=self._variable_update_period)

  # Make sure not to use a random policy after checkpoint restoration by
  # assigning variables before running the environment loop.
  variable_client.update_and_wait()

  # Create the agent.
  actor = acting.IMPALAActor(
      network=network, variable_client=variable_client, adder=adder)

  counter = counting.Counter(counter, 'actor')
  logger = loggers.make_default_logger(
      'actor', save_data=False, steps_key='actor_steps')

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter, logger)

def make_executor(
    self,
    q_networks: Dict[str, snt.Module],
    action_selectors: Dict[str, Any],
    communication_module: BaseCommunicationModule,
    adder: Optional[adders.ParallelAdder] = None,
    variable_source: Optional[core.VariableSource] = None,
    trainer: Optional[training.MADQNRecurrentCommTrainer] = None,
    evaluator: bool = False,
) -> core.Executor:
    """Create an executor instance.

    Args:
        q_networks (Dict[str, snt.Module]): q-value networks for each agent in
            the system.
        action_selectors (Dict[str, Any]): policy action selector method, e.g.
            epsilon greedy.
        communication_module (BaseCommunicationModule): module for enabling
            communication protocols between agents.
        adder (Optional[adders.ParallelAdder], optional): adder to send data
            to a replay buffer. Defaults to None.
        variable_source (Optional[core.VariableSource], optional): variables
            server. Defaults to None.
        trainer (Optional[training.MADQNRecurrentCommTrainer], optional):
            system trainer. Defaults to None.
        evaluator (bool, optional): boolean indicator if the executor is used
            for evaluation only. Defaults to False.

    Returns:
        core.Executor: system executor, a collection of agents making up the
            part of the system that generates data by interacting with the
            environment.
    """
    shared_weights = self._config.shared_weights

    variable_client = None
    if variable_source:
        agent_keys = self._agent_types if shared_weights else self._agents

        # Create policy variables.
        variables = {agent: q_networks[agent].variables for agent in agent_keys}

        # Get new policy variables.
        variable_client = variable_utils.VariableClient(
            client=variable_source,
            variables={"q_network": variables},
            update_period=self._config.executor_variable_update_period,
        )

        # Make sure not to use a random policy after checkpoint restoration by
        # assigning variables before running the environment loop.
        variable_client.update_and_wait()

    # Check if we should use fingerprints.
    fingerprint = self._replay_stabiliser_fn is not None

    # Create the executor which coordinates the actors.
    return self._executor_fn(
        q_networks=q_networks,
        action_selectors=action_selectors,
        shared_weights=shared_weights,
        variable_client=variable_client,
        adder=adder,
        trainer=trainer,
        communication_module=communication_module,
        evaluator=evaluator,
        fingerprint=fingerprint,
    )

def test_update(self):
  # Create a barrier to be shared between the test body and the variable
  # source. The barrier will block until, in this case, two threads call
  # wait(). Note that the (fake) variable source will call it within its
  # get_variables() call.
  barrier = threading.Barrier(2)

  # Create a variable source (emulating the learner).
  np_learner_variables = tf2_utils.to_numpy(self._learner_model.variables)
  variable_source = fakes.VariableSource(np_learner_variables, barrier)

  # Create a variable client (emulating the actor).
  variable_client = tf2_variable_utils.VariableClient(
      variable_source, {'policy': self._actor_model.variables},
      update_period=_UPDATE_PERIOD)

  # Create some random batch of test input:
  x = tf.random.normal(shape=(_BATCH_SIZE, _INPUT_SIZE))

  # Create variables by doing the computation once.
  learner_output = self._learner_model(x)
  actor_output = self._actor_model(x)
  del learner_output, actor_output

  for _ in range(_UPDATE_PERIOD):
    # Before the update period is reached, the models have different outputs.
    self.assertNotAllClose(self._actor_model.variables,
                           self._learner_model.variables)

    # Before the update period is reached, the variable client should not
    # make any requests for variables.
    self.assertIsNone(variable_client._future)

    variable_client.update()

  # Make sure the last call created a request for variables and reset the
  # internal call counter.
  self.assertIsNotNone(variable_client._future)
  self.assertEqual(variable_client._call_counter, 0)
  future = variable_client._future

  for _ in range(_UPDATE_PERIOD):
    # Before the barrier allows the variables to be released, the models have
    # different outputs.
    self.assertNotAllClose(self._actor_model.variables,
                           self._learner_model.variables)

    variable_client.update()

    # Make sure no new requests are made.
    self.assertEqual(variable_client._future, future)

  # Calling wait() on the barrier will now allow the variables to be copied
  # over from source to client.
  barrier.wait()

  # Update once more to ensure the variables are copied over.
  while variable_client._future is not None:
    variable_client.update()

  # After a number of update calls, the variables should be the same.
  self.assertAllClose(self._actor_model.variables,
                      self._learner_model.variables)