def evaluator(
    self,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
):
  """The evaluation process."""

  # Build environment, model, network.
  environment = self._environment_factory()
  network = self._network_factory(self._env_spec.actions)
  model = self._model_factory(self._env_spec)

  # Create variable client for communicating with the learner.
  tf2_utils.create_variables(network, [self._env_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'policy': network.trainable_variables},
      update_period=self._variable_update_period)

  # Create the agent.
  actor = acting.MCTSActor(
      environment_spec=self._env_spec,
      model=model,
      network=network,
      discount=self._discount,
      variable_client=variable_client,
      num_simulations=self._num_simulations,
  )

  # Create the run loop and return it.
  logger = loggers.make_default_logger('evaluator')
  return acme.EnvironmentLoop(
      environment, actor, counter=counter, logger=logger)

def actor(
    self,
    replay: reverb.Client,
    variable_source: acme.VariableSource,
    counter: counting.Counter,
) -> acme.EnvironmentLoop:
  """The actor process."""

  # Build environment, model, network.
  environment = self._environment_factory()
  network = self._network_factory(self._env_spec.actions)
  model = self._model_factory(self._env_spec)

  # Create variable client for communicating with the learner.
  tf2_utils.create_variables(network, [self._env_spec.observations])
  variable_client = tf2_variable_utils.VariableClient(
      client=variable_source,
      variables={'network': network.trainable_variables},
      update_period=self._variable_update_period)

  # Component to add things into replay.
  adder = adders.NStepTransitionAdder(
      client=replay,
      n_step=self._n_step,
      discount=self._discount,
  )

  # Create the agent.
  actor = acting.MCTSActor(
      environment_spec=self._env_spec,
      model=model,
      network=network,
      discount=self._discount,
      adder=adder,
      variable_client=variable_client,
      num_simulations=self._num_simulations,
  )

  # Create the loop to connect environment and agent.
  return acme.EnvironmentLoop(environment, actor, counter)

def __init__(
    self,
    network: snt.Module,
    model: models.Model,
    optimizer: snt.Optimizer,
    n_step: int,
    discount: float,
    replay_capacity: int,
    num_simulations: int,
    environment_spec: specs.EnvironmentSpec,
    batch_size: int,
):
  # Create a replay server for storing transitions.
  replay_table = reverb.Table(
      name=adders.DEFAULT_PRIORITY_TABLE,
      sampler=reverb.selectors.Uniform(),
      remover=reverb.selectors.Fifo(),
      max_size=replay_capacity,
      rate_limiter=reverb.rate_limiters.MinSize(1))
  self._server = reverb.Server([replay_table], port=None)

  # The adder is used to insert observations into replay.
  address = f'localhost:{self._server.port}'
  adder = adders.NStepTransitionAdder(
      client=reverb.Client(address),
      n_step=n_step,
      discount=discount)

  # The dataset provides an interface to sample from replay.
  replay_client = reverb.TFClient(address)
  action_spec: specs.DiscreteArray = environment_spec.actions
  dataset = datasets.make_reverb_dataset(
      client=replay_client,
      environment_spec=environment_spec,
      extra_spec={
          'pi': specs.Array(
              shape=(action_spec.num_values,), dtype=np.float32)
      },
      transition_adder=True)
  dataset = dataset.batch(batch_size, drop_remainder=True)

  tf2_utils.create_variables(network, [environment_spec.observations])

  # Now create the agent components: actor & learner.
  actor = acting.MCTSActor(
      environment_spec=environment_spec,
      model=model,
      network=network,
      discount=discount,
      adder=adder,
      num_simulations=num_simulations,
  )
  learner = learning.AZLearner(
      network=network,
      optimizer=optimizer,
      dataset=dataset,
      discount=discount,
  )

  # The parent class combines these together into one 'agent'.
  super().__init__(
      actor=actor,
      learner=learner,
      min_observations=10,
      observations_per_step=1,
  )
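
# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the module above). It assumes the class
# whose constructor is shown above is exported as `mcts.MCTS`, that
# `simulator.Simulator` and `networks.PolicyValueHead` are suitable model and
# network-head choices, and that the environment and hyperparameters below are
# illustrative placeholders rather than recommended settings.

import acme
from acme import wrappers
from acme.agents.tf import mcts
from acme.agents.tf.mcts.models import simulator
from acme.tf import networks
import gym
import sonnet as snt

# Build a single-precision dm_env environment from a Gym task.
environment = wrappers.SinglePrecisionWrapper(
    wrappers.GymWrapper(gym.make('CartPole-v0')))
env_spec = acme.make_environment_spec(environment)

# Policy/value network: flatten observations, then an MLP with a joint
# policy-logits / value head.
network = snt.Sequential([
    snt.Flatten(),
    snt.nets.MLP([64, 64]),
    networks.PolicyValueHead(env_spec.actions.num_values),
])

# The simulator model plans by stepping a copy of the real environment.
agent = mcts.MCTS(
    network=network,
    model=simulator.Simulator(environment),
    optimizer=snt.optimizers.Adam(1e-3),
    n_step=5,
    discount=0.99,
    replay_capacity=10000,
    num_simulations=50,
    environment_spec=env_spec,
    batch_size=16,
)

# Run the single-process agent in an environment loop.
acme.EnvironmentLoop(environment, agent).run(num_episodes=100)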