def __init__(self,
             env: TFEnvironment,
             algorithm: OffPolicyAlgorithm,
             unroll_length=8,
             exp_replayer="uniform",
             observers=[],
             metrics=[]):
    """Create a SyncOffPolicyDriver.

    Args:
        env (TFEnvironment): A TFEnvironment
        algorithm (OffPolicyAlgorithm): The algorithm for training
        unroll_length (int): number of time steps each environment proceeds
            before sending the steps to the learner queue
        exp_replayer (str): a string that indicates which ExperienceReplayer
            to use.
        observers (list[Callable]): An optional list of observers that are
            updated after every step in the environment. Each observer is a
            callable(time_step.Trajectory).
        metrics (list[TFStepMetric]): An optional list of metrics.
    """
    # training=False because training info is always obtained from
    # replayed exps instead of current time_step prediction. So _step() in
    # policy_driver.py has nothing to do with training for off-policy
    # algorithms.
    super(SyncOffPolicyDriver, self).__init__(
        env=env,
        algorithm=algorithm,
        exp_replayer=exp_replayer,
        observers=observers,
        metrics=metrics,
        unroll_length=unroll_length,
        learn_queue_cap=1)
    algorithm.set_metrics(self.get_metrics())
    self._prepare_specs(algorithm)
def __init__(self,
             env: TFEnvironment,
             algorithm: OffPolicyAlgorithm,
             exp_replayer="uniform",
             observers=[],
             use_rollout_state=False,
             metrics=[]):
    """Create a SyncOffPolicyDriver.

    Args:
        env (TFEnvironment): A TFEnvironment
        algorithm (OffPolicyAlgorithm): The algorithm for training
        exp_replayer (str): a string that indicates which ExperienceReplayer
            to use.
        observers (list[Callable]): An optional list of observers that are
            updated after every step in the environment. Each observer is a
            callable(time_step.Trajectory).
        use_rollout_state (bool): include the RNN state in the experiences
            used for off-policy training
        metrics (list[TFStepMetric]): An optional list of metrics.
    """
    # training=False because training info is always obtained from
    # replayed exps instead of current time_step prediction. So _step() in
    # policy_driver.py has nothing to do with training for off-policy
    # algorithms.
    super(SyncOffPolicyDriver, self).__init__(
        env=env,
        algorithm=algorithm,
        exp_replayer=exp_replayer,
        observers=observers,
        use_rollout_state=use_rollout_state,
        metrics=metrics)
    algorithm.set_metrics(self.get_metrics())
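# For context, a minimal construction sketch for the driver above. The
# `make_env` and `make_algorithm` factories are hypothetical stand-ins for
# whatever TFEnvironment and OffPolicyAlgorithm the caller actually builds;
# only the SyncOffPolicyDriver signature is taken from the code above.
env = make_env()                  # hypothetical: returns a TFEnvironment
algorithm = make_algorithm(env)   # hypothetical: returns an OffPolicyAlgorithm

driver = SyncOffPolicyDriver(
    env=env,
    algorithm=algorithm,
    exp_replayer="uniform",    # replay stored experiences uniformly at random
    use_rollout_state=False)   # do not carry rollout RNN state into training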
def __init__(self,
             env: TFEnvironment,
             algorithm: OffPolicyAlgorithm,
             exp_replayer: str,
             observers=[],
             use_rollout_state=False,
             metrics=[]):
    """Create an OffPolicyDriver.

    Args:
        env (TFEnvironment): A TFEnvironment
        algorithm (OffPolicyAlgorithm): The algorithm for training
        exp_replayer (str): a string that indicates which ExperienceReplayer
            to use. Either "one_time" or "uniform".
        observers (list[Callable]): An optional list of observers that are
            updated after every step in the environment. Each observer is a
            callable(time_step.Trajectory).
        use_rollout_state (bool): include the RNN state in the experiences
            used for off-policy training
        metrics (list[TFStepMetric]): An optional list of metrics.
    """
    super(OffPolicyDriver, self).__init__(
        env=env,
        algorithm=algorithm,
        observers=observers,
        use_rollout_state=use_rollout_state,
        metrics=metrics,
        training=True,
        greedy_predict=False)  # always use OnPolicyDriver for play/eval!
    self._prepare_specs(algorithm)
    algorithm.set_exp_replayer(exp_replayer)
def __init__(self, env: TFEnvironment, algorithm: OffPolicyAlgorithm, exp_replayer: str, num_envs=1, observers=[], metrics=[]): """Create an OffPolicyDriver. Args: env (TFEnvironment): A TFEnvironment algorithm (OffPolicyAlgorithm): The algorithm for training exp_replayer (str): a string that indicates which ExperienceReplayer to use. Either "one_time" or "uniform". num_envs (int): the number of batched environments. The total number of single environment is `num_envs * env.batch_size` observers (list[Callable]): An optional list of observers that are updated after every step in the environment. Each observer is a callable(time_step.Trajectory). metrics (list[TFStepMetric]): An optional list of metrics. """ super(OffPolicyDriver, self).__init__( env=env, algorithm=algorithm, observers=observers, metrics=metrics, mode=self.OFF_POLICY_TRAINING) algorithm.set_exp_replayer(exp_replayer, num_envs * env.batch_size)
def __init__(self,
             env: TFEnvironment,
             algorithm: OffPolicyAlgorithm,
             exp_replayer: str,
             num_envs=1,
             observers=[],
             metrics=[],
             unroll_length=8,
             learn_queue_cap=1):
    """Create an OffPolicyDriver.

    Args:
        env (TFEnvironment): A TFEnvironment
        algorithm (OffPolicyAlgorithm): The algorithm for training
        exp_replayer (str): a string that indicates which ExperienceReplayer
            to use. One of "one_time", "uniform" or "cycle_one_time".
        num_envs (int): the number of batched environments. The total number
            of single environments is `num_envs * env.batch_size`.
        observers (list[Callable]): An optional list of observers that are
            updated after every step in the environment. Each observer is a
            callable(time_step.Trajectory).
        metrics (list[TFStepMetric]): An optional list of metrics.
        unroll_length (int): the "cycle_one_time" replayer uses
            `max_length == unroll_length + 1`, so that all timesteps are used
            in training.
        learn_queue_cap (int): the number of actors contributing to one
            mini-batch of training. This needs to be passed along to the
            experience replayer.
    """
    super(OffPolicyDriver, self).__init__(
        env=env,
        algorithm=algorithm,
        observers=observers,
        metrics=metrics,
        mode=self.OFF_POLICY_TRAINING)
    algorithm.set_exp_replayer(exp_replayer, num_envs * env.batch_size,
                               num_envs, unroll_length, learn_queue_cap)
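# A short sketch of the sizing arithmetic documented above. The values are
# illustrative, not defaults from any particular config.
num_envs = 4          # number of batched environments
env_batch_size = 16   # single environments inside each batched environment
unroll_length = 8

# Total number of single environments feeding the experience replayer:
total_single_envs = num_envs * env_batch_size   # 64
# "cycle_one_time" replayer length: one extra slot beyond the unroll so that
# all timesteps of each unroll are used in training:
replayer_max_length = unroll_length + 1         # 9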
def __init__(self,
             envs,
             algorithm: OffPolicyAlgorithm,
             num_actor_queues=1,
             unroll_length=8,
             learn_queue_cap=1,
             actor_queue_cap=1,
             observers=[],
             metrics=[],
             exp_replayer="one_time"):
    """Create an AsyncOffPolicyDriver.

    Args:
        envs (list[TFEnvironment]): list of TFEnvironment
        algorithm (OffPolicyAlgorithm): the algorithm for training
        num_actor_queues (int): number of actor queues. Each queue is
            exclusively owned by just one actor thread.
        unroll_length (int): number of time steps each environment proceeds
            before sending the steps to the learner queue
        learn_queue_cap (int): the learner queue capacity determines how many
            environments contribute to the training data for each training
            iteration
        actor_queue_cap (int): the actor queue capacity determines how many
            environments contribute to the data for each prediction forward
            in an `ActorThread`. To prevent deadlock, it's required that
            `actor_queue_cap * num_actor_queues <= num_envs`.
        observers (list[Callable]): An optional list of observers that are
            updated after every step in the environment. Each observer is a
            callable(time_step.Trajectory).
        metrics (list[TFStepMetric]): An optional list of metrics.
        exp_replayer (str): a string that indicates which ExperienceReplayer
            to use.
    """
    super(AsyncOffPolicyDriver, self).__init__(
        env=envs[0],
        num_envs=len(envs),
        algorithm=algorithm,
        exp_replayer=exp_replayer,
        observers=observers,
        metrics=metrics)

    # create threads
    self._coord = tf.train.Coordinator()
    num_envs = len(envs)
    policy_step_spec = PolicyStep(
        action=algorithm.action_spec,
        state=algorithm.train_state_spec,
        info=algorithm.rollout_info_spec)
    self._tfq = TFQueues(
        num_envs,
        self._env.batch_size,
        learn_queue_cap,
        actor_queue_cap,
        time_step_spec=algorithm.time_step_spec,
        policy_step_spec=policy_step_spec,
        unroll_length=unroll_length,
        num_actor_queues=num_actor_queues)
    actor_threads = [
        ActorThread(
            name="actor{}".format(i),
            coord=self._coord,
            algorithm=self._algorithm,
            tf_queues=self._tfq,
            id=i) for i in range(num_actor_queues)
    ]
    env_threads = [
        EnvThread(
            name="env{}".format(i),
            coord=self._coord,
            env=envs[i],
            tf_queues=self._tfq,
            unroll_length=unroll_length,
            id=i,
            actor_id=i % num_actor_queues) for i in range(num_envs)
    ]
    self._log_thread = LogThread(
        name="logging",
        num_envs=num_envs,
        env_batch_size=self._env.batch_size,
        observers=observers,
        metrics=metrics,
        coord=self._coord,
        queue=self._tfq.log_queue)
    self._threads = actor_threads + env_threads + [self._log_thread]
    algorithm.set_metrics(self.get_metrics())
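# A construction sketch for the asynchronous driver above, including the
# deadlock constraint from the docstring. `envs` and `algorithm` are assumed
# to have been created elsewhere; only the AsyncOffPolicyDriver signature is
# taken from the code above.
num_actor_queues = 2
actor_queue_cap = 3
# Each actor waits for `actor_queue_cap` environments before one prediction
# forward, so the available environments must be able to fill every queue:
assert actor_queue_cap * num_actor_queues <= len(envs)

driver = AsyncOffPolicyDriver(
    envs=envs,
    algorithm=algorithm,
    num_actor_queues=num_actor_queues,
    actor_queue_cap=actor_queue_cap,
    learn_queue_cap=2,        # 2 environments feed each training iteration
    unroll_length=8,          # steps unrolled before queueing for learning
    exp_replayer="one_time")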
def greedy_predict(self, time_step: ActionTimeStep, state=None):
    # Explicitly dispatch to OffPolicyAlgorithm's implementation, bypassing
    # any override earlier in the method resolution order.
    return OffPolicyAlgorithm.greedy_predict(self, time_step, state)
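# A self-contained sketch of the dispatch pattern used above: calling
# `Base.method(self, ...)` pins execution to a specific class's
# implementation instead of following Python's normal MRO lookup. The
# classes below are illustrative only.
class Base:
    def predict(self, x):
        return "base"

class Mixin:
    def predict(self, x):
        return "mixin"

class Child(Mixin, Base):
    def predict(self, x):
        # super().predict(x) would resolve to Mixin; this picks Base instead.
        return Base.predict(self, x)

assert Child().predict(0) == "base"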