def _jit_predict_fn(model_predict, metric_fn, n_devices, jit=True):
  """Returns a JIT-compiled predict function (unless jit=False)."""
  model = tl.Serial(model_predict, metric_fn)
  if not jit:
    return model.pure_fn
  return tl.jit_forward(model.pure_fn, n_devices)
def _jit_predict_fn(model_predict, metric_fn, n_devices, jit=True):
  """Returns a JIT-compiled predict function (unless jit=False)."""
  model = tl.Serial(model_predict, metric_fn)
  model_predict = model._forward_internal  # pylint: disable=protected-access
  if not jit:
    return model_predict
  return tl.jit_forward(model_predict, n_devices)
def _accelerate_model_with_metrics(model_with_metrics, n_devices,
                                   accelerate=True, do_mean=True):
  if not accelerate:
    return model_with_metrics.pure_fn
  return tl.jit_forward(model_with_metrics.pure_fn, n_devices, do_mean=do_mean)
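# Illustrative usage sketch: how an accelerated "model + metrics" forward like
# the helpers above can be built and called. The tiny Dense/L2Loss stack and
# the toy batch are assumptions for illustration, not taken from this file.
import numpy as np

from trax import fastmath
from trax import layers as tl
from trax import shapes

# A model followed by a metric layer, mirroring tl.Serial(model, metric_fn).
model_with_metrics = tl.Serial(tl.Dense(4), tl.L2Loss())

batch = (np.zeros((8, 16), dtype=np.float32),   # inputs
         np.zeros((8, 4), dtype=np.float32),    # targets
         np.ones((8, 4), dtype=np.float32))     # per-element weights
model_with_metrics.init(shapes.signature(batch))

# pure_fn is the functional forward pass: (x, weights, state, rng) -> (y, state).
# tl.jit_forward JIT-compiles it and, on multi-device setups, shards the batch.
forward = tl.jit_forward(
    model_with_metrics.pure_fn, fastmath.device_count(), do_mean=True)
rng = fastmath.random.get_prng(0)
loss, _ = forward(batch, model_with_metrics.weights,
                  model_with_metrics.state, rng)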
def __init__(self, task,
             value_model=None,
             value_optimizer=None,
             value_lr_schedule=lr.multifactor,
             value_batch_size=64,
             value_train_steps_per_epoch=500,
             value_evals_per_epoch=1,
             value_eval_steps=1,
             n_shared_layers=0,
             added_policy_slice_length=0,
             n_replay_epochs=1,
             scale_value_targets=False,
             q_value=False,
             q_value_aggregate='max',
             q_value_temperature=1.0,
             q_value_n_samples=1,
             q_value_normalization=False,
             **kwargs):  # Arguments of PolicyAgent come here.
  """Configures the actor-critic trainer.

  Args:
    task: `RLTask` instance to use.
    value_model: Model to use for the value function.
    value_optimizer: Optimizer to train the value model.
    value_lr_schedule: Learning rate schedule for value model training.
    value_batch_size: Batch size for value model training.
    value_train_steps_per_epoch: Number of steps we run in each epoch to train
      the value model.
    value_evals_per_epoch: Number of value trainer evaluations per RL epoch.
      At every evaluation, we also synchronize the weights of the target
      network.
    value_eval_steps: Number of value trainer steps per evaluation; only
      affects metric reporting.
    n_shared_layers: Number of layers to share between value and policy
      models.
    added_policy_slice_length: How much longer slices of trajectories should
      be for policy training than for value training; this is useful for TD
      calculations and only affects the length of elements produced for
      policy batches; value batches have maximum length set by
      `max_slice_length` in `**kwargs`.
    n_replay_epochs: Number of last epochs to take into the replay buffer;
      only makes sense for off-policy algorithms.
    scale_value_targets: If `True`, scale value function targets by
      `1 / (1 - gamma)`.
    q_value: If `True`, use Q-values as baselines.
    q_value_aggregate: How to aggregate Q-values. Options: 'mean', 'max',
      'softmax', 'logsumexp'.
    q_value_temperature: Temperature parameter for the 'softmax' and
      'logsumexp' aggregation methods.
    q_value_n_samples: Number of samples to average over when calculating
      baselines based on Q-values.
    q_value_normalization: How to normalize Q-values before aggregation.
      Allowed values: 'std', 'abs', `None`. If `None`, don't normalize.
    **kwargs: Arguments for the `PolicyAgent` superclass.
  """
  self._n_shared_layers = n_shared_layers
  self._value_batch_size = value_batch_size
  self._value_train_steps_per_epoch = value_train_steps_per_epoch
  self._value_evals_per_epoch = value_evals_per_epoch
  self._value_eval_steps = value_eval_steps

  # The two attributes below will be initialized in super().__init__ anyway,
  # but they are needed to construct value batches, which are needed before
  # PolicyAgent init since policy input creation calls the value model --
  # hence this code.
  self._task = task
  self._max_slice_length = kwargs.get('max_slice_length', 1)

  self._added_policy_slice_length = added_policy_slice_length
  self._n_replay_epochs = n_replay_epochs
  task.set_n_replay_epochs(n_replay_epochs)

  if scale_value_targets:
    self._value_network_scale = 1 / (1 - self._task.gamma)
  else:
    self._value_network_scale = 1

  self._q_value = q_value
  self._q_value_aggregate = q_value_aggregate
  self._q_value_temperature = q_value_temperature
  self._q_value_n_samples = q_value_n_samples
  self._q_value_normalization = q_value_normalization

  is_discrete = isinstance(self._task.action_space, gym.spaces.Discrete)
  self._is_discrete = is_discrete
  self._vocab_size = None
  self._sample_all_discrete_actions = False
  if q_value and is_discrete:
    self._vocab_size = self.task.action_space.n
    # TODO(lukaszkaiser): the code below is specific to AWR, move it.
    # If n_samples = n_actions, we'll take them all in actor and reweight.
    if self._q_value_n_samples == self._vocab_size:
      # TODO(lukaszkaiser): set this explicitly once it's in AWR Trainer.
      self._sample_all_discrete_actions = True

  if q_value:
    value_model = functools.partial(value_model,
                                    inject_actions=True,
                                    is_discrete=is_discrete,
                                    vocab_size=self._vocab_size)
  self._value_eval_model = value_model(mode='eval')
  self._value_eval_model.init(self._value_model_signature)
  self._value_eval_jit = tl.jit_forward(
      self._value_eval_model.pure_fn, fastmath.device_count(), do_mean=False)

  # Initialize policy training.
  super().__init__(task, **kwargs)

  # Initialize training of the value function.
  value_output_dir = kwargs.get('output_dir', None)
  if value_output_dir is not None:
    value_output_dir = os.path.join(value_output_dir, 'value')
    # If needed, create value_output_dir and missing parent directories.
    if not tf.io.gfile.isdir(value_output_dir):
      tf.io.gfile.makedirs(value_output_dir)
  self._value_inputs = data.inputs.Inputs(
      train_stream=lambda _: self.value_batches_stream())
  self._value_trainer = supervised.Trainer(
      model=value_model,
      optimizer=value_optimizer,
      lr_schedule=value_lr_schedule(),
      loss_fn=tl.L2Loss(),
      inputs=self._value_inputs,
      output_dir=value_output_dir,
      metrics={'value_loss': tl.L2Loss(),
               'value_mean': self.value_mean})
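# Illustrative sketch of the Q-value aggregation and normalization semantics
# described in the docstring above ('mean'/'max'/'softmax'/'logsumexp' with a
# temperature, and optional 'std'/'abs' normalization). This is a plain-numpy
# rendering of those options under assumed shapes, not the trainer's own code.
import numpy as np


def aggregate_q_values(q_values, aggregate='max', temperature=1.0,
                       normalization=None):
  """Aggregates sampled Q-values of shape (n_samples, batch) into baselines."""
  if normalization == 'std':
    q_values = q_values / (np.std(q_values) + 1e-6)
  elif normalization == 'abs':
    q_values = q_values / (np.mean(np.abs(q_values)) + 1e-6)

  if aggregate == 'mean':
    return np.mean(q_values, axis=0)
  if aggregate == 'max':
    return np.max(q_values, axis=0)
  if aggregate == 'softmax':
    # Softmax-weighted average over the sample axis, stabilized by max-shift.
    weights = np.exp((q_values - np.max(q_values, axis=0)) / temperature)
    weights /= np.sum(weights, axis=0, keepdims=True)
    return np.sum(weights * q_values, axis=0)
  if aggregate == 'logsumexp':
    # Smooth maximum: temperature * (logsumexp(q / T) - log(n_samples)).
    m = np.max(q_values / temperature, axis=0)
    return temperature * (
        m + np.log(np.mean(np.exp(q_values / temperature - m), axis=0)))
  raise ValueError('Unknown aggregation: %s' % aggregate)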
def __init__(self, task,
             value_model=None,
             value_optimizer=None,
             value_lr_schedule=lr.MultifactorSchedule,
             value_batch_size=64,
             value_train_steps_per_epoch=500,
             value_evals_per_epoch=1,
             value_eval_steps=1,
             n_shared_layers=0,
             added_policy_slice_length=0,
             n_replay_epochs=1,
             scale_value_targets=False,
             q_value=False,
             q_value_aggregate_max=True,
             q_value_n_samples=1,
             vocab_size=2,
             **kwargs):  # Arguments of PolicyTrainer come here.
  """Configures the actor-critic trainer.

  Args:
    task: `RLTask` instance to use.
    value_model: Model to use for the value function.
    value_optimizer: Optimizer to train the value model.
    value_lr_schedule: Learning rate schedule for value model training.
    value_batch_size: Batch size for value model training.
    value_train_steps_per_epoch: Number of steps we run in each epoch to train
      the value model.
    value_evals_per_epoch: Number of value trainer evaluations per RL epoch;
      only affects metric reporting.
    value_eval_steps: Number of value trainer steps per evaluation; only
      affects metric reporting.
    n_shared_layers: Number of layers to share between value and policy
      models.
    added_policy_slice_length: How much longer slices of trajectories should
      be for policy training than for value training; this is useful for TD
      calculations and only affects the length of elements produced for
      policy batches; value batches have maximum length set by
      `max_slice_length` in `**kwargs`.
    n_replay_epochs: Number of last epochs to take into the replay buffer;
      only makes sense for off-policy algorithms.
    scale_value_targets: If `True`, scale value function targets by
      `1 / (1 - gamma)`.
    q_value: If `True`, use Q-values as baselines.
    q_value_aggregate_max: If `True`, aggregate Q-values with max; otherwise
      with mean.
    q_value_n_samples: Number of samples to average over when calculating
      baselines based on Q-values.
    vocab_size: Embedding vocabulary size (passed to `tl.Embedding`); used
      only with discrete actions and when `q_value` is `True`.
    **kwargs: Arguments for the `PolicyTrainer` superclass.
  """
  self._n_shared_layers = n_shared_layers
  self._value_batch_size = value_batch_size
  self._value_train_steps_per_epoch = value_train_steps_per_epoch
  self._value_evals_per_epoch = value_evals_per_epoch
  self._value_eval_steps = value_eval_steps

  # The two attributes below will be initialized in super().__init__ anyway,
  # but they are needed to construct value batches, which are needed before
  # PolicyTrainer init since policy input creation calls the value model --
  # hence this code.
  self._task = task
  self._max_slice_length = kwargs.get('max_slice_length', 1)

  self._added_policy_slice_length = added_policy_slice_length
  self._n_replay_epochs = n_replay_epochs
  task.set_n_replay_epochs(n_replay_epochs)

  if scale_value_targets:
    self._value_network_scale = 1 / (1 - self._task.gamma)
  else:
    self._value_network_scale = 1

  self._q_value = q_value
  self._q_value_aggregate_max = q_value_aggregate_max
  self._q_value_n_samples = q_value_n_samples
  self._vocab_size = vocab_size

  is_discrete = isinstance(self._task.action_space, gym.spaces.Discrete)
  # TODO(henrykm): handle cases other than Discrete/Gaussian.
  if q_value:
    value_model = functools.partial(value_model,
                                    inject_actions=True,
                                    is_discrete=is_discrete,
                                    vocab_size=self._vocab_size)
  self._value_eval_model = value_model(mode='eval')
  self._value_eval_model.init(self._value_model_signature)
  self._value_eval_jit = tl.jit_forward(self._value_eval_model.pure_fn,
                                        math.device_count(), do_mean=False)

  # Initialize policy training.
  super(ActorCriticTrainer, self).__init__(task, **kwargs)

  # Initialize training of the value function.
  value_output_dir = kwargs.get('output_dir', None)
  if value_output_dir is not None:
    value_output_dir = os.path.join(value_output_dir, 'value')
    # If needed, create value_output_dir and missing parent directories.
    if not tf.io.gfile.isdir(value_output_dir):
      tf.io.gfile.makedirs(value_output_dir)
  self._value_inputs = supervised.Inputs(
      train_stream=lambda _: self.value_batches_stream())
  self._value_trainer = supervised.Trainer(
      model=value_model,
      optimizer=value_optimizer,
      lr_schedule=value_lr_schedule,
      loss_fn=tl.L2Loss(),
      inputs=self._value_inputs,
      output_dir=value_output_dir,
      metrics={'value_loss': tl.L2Loss()})
def __init__(self, task,
             value_body=None,
             value_optimizer=None,
             value_lr_schedule=lr.multifactor,
             value_batch_size=64,
             value_train_steps_per_epoch=500,
             value_evals_per_epoch=1,
             value_eval_steps=1,
             exploration_rate=functools.partial(
                 lr.multifactor,
                 factors='constant * decay_every',
                 constant=1.,  # pylint: disable=redefined-outer-name
                 decay_factor=0.99,
                 steps_per_decay=1,
                 minimum=0.1),
             n_eval_episodes=0,
             only_eval=False,
             n_replay_epochs=1,
             max_slice_length=1,
             sync_freq=1000,
             scale_value_targets=True,
             output_dir=None,
             **kwargs):
  """Configures the value trainer.

  Args:
    task: `RLTask` instance, which defines the environment to train on.
    value_body: Trax layer, representing the body of the value model.
      Loss functions and eval functions (a.k.a. metrics) are considered to be
      outside the core model, taking core model output and data labels as
      their two inputs.
    value_optimizer: The optimizer to use to train the value model.
    value_lr_schedule: Learning rate schedule to use to train the value model.
    value_batch_size: Batch size used to train the value model.
    value_train_steps_per_epoch: How long to train the value model in each
      RL epoch.
    value_evals_per_epoch: Number of value trainer evaluations per RL epoch;
      only affects metric reporting.
    value_eval_steps: Number of value trainer steps per evaluation; only
      affects metric reporting.
    exploration_rate: Exploration rate schedule, used in the `policy` method.
    n_eval_episodes: Number of episodes to play with the policy at
      temperature 0 in each epoch; used for evaluation only.
    only_eval: If `True`, trajectories are collected only for evaluation
      purposes and are not recorded.
    n_replay_epochs: Number of last epochs to take into the replay buffer;
      only makes sense for off-policy algorithms.
    max_slice_length: The maximum length of trajectory slices to use; it is
      the second dimension of the value network output:
      (batch, max_slice_length, number of actions). A higher
      `max_slice_length` implies that the network has to predict more values
      into the future.
    sync_freq: Frequency at which to synchronize the target network with the
      trained network. This is necessary for training the network on
      bootstrapped targets, e.g. using n-step returns.
    scale_value_targets: If `True`, scale value function targets by
      `1 / (1 - gamma)`. We are trying to fix the problem of very large
      returns in some games in a way which does not introduce additional
      hyperparameters.
    output_dir: Path telling where to save outputs (evals and checkpoints).
    **kwargs: Arguments for the `RLTrainer` superclass.
  """
  super(ValueAgent, self).__init__(
      task,
      n_eval_episodes=n_eval_episodes,
      output_dir=output_dir,
      **kwargs
  )
  self._value_batch_size = value_batch_size
  self._value_train_steps_per_epoch = value_train_steps_per_epoch
  self._value_evals_per_epoch = value_evals_per_epoch
  self._value_eval_steps = value_eval_steps
  self._only_eval = only_eval
  self._max_slice_length = max_slice_length
  self._policy_dist = distributions.create_distribution(task.action_space)
  self._n_replay_epochs = n_replay_epochs

  self._exploration_rate = exploration_rate()
  self._sync_at = (lambda step: step % sync_freq == 0)

  if scale_value_targets:
    self._value_network_scale = 1 / (1 - self._task.gamma)
  else:
    self._value_network_scale = 1

  value_model = functools.partial(
      models.Quality,
      body=value_body,
      n_actions=self.task.action_space.n)

  self._value_eval_model = value_model(mode='eval')
  self._value_eval_model.init(self._value_model_signature)
  self._value_eval_jit = tl.jit_forward(
      self._value_eval_model.pure_fn, fastmath.device_count(), do_mean=False)

  # Inputs to the value model are produced by self.value_batches_stream.
  self._inputs = data.inputs.Inputs(
      train_stream=lambda _: self.value_batches_stream())

  # This is the value Trainer that will be used to train the value model.
  # * inputs to the trainer come from self.value_batches_stream
  # * outputs, targets and weights are passed to self.value_loss
  self._value_trainer = supervised.Trainer(
      model=value_model,
      optimizer=value_optimizer,
      lr_schedule=value_lr_schedule(),
      loss_fn=self.value_loss,
      inputs=self._inputs,
      output_dir=output_dir,
      metrics={'value_loss': self.value_loss,
               'value_mean': self.value_mean,
               'returns_mean': self.returns_mean}
  )
  value_batch = next(self.value_batches_stream())
  self._eval_model = tl.Accelerate(
      value_model(mode='collect'), n_devices=1)
  self._eval_model.init(shapes.signature(value_batch))
  if self._task._initial_trajectories == 0:  # pylint: disable=protected-access
    self._task.remove_epoch(0)
    self._collect_trajectories()
def __init__(self, task,
             value_model=None,
             value_optimizer=None,
             value_lr_schedule=lr.MultifactorSchedule,
             value_batch_size=64,
             value_train_steps_per_epoch=500,
             value_evals_per_epoch=1,
             value_eval_steps=1,
             n_shared_layers=0,
             added_policy_slice_length=0,
             n_replay_epochs=1,
             scale_value_targets=False,
             q_value=False,
             q_value_aggregate_max=True,
             q_value_n_samples=1,
             **kwargs):  # Arguments of PolicyTrainer come here.
  """Configures the actor-critic Trainer.

  Args:
    task: `RLTask` instance to use.
    value_model: The model to use for the value function.
    value_optimizer: The optimizer to train the value model.
    value_lr_schedule: Learning rate schedule for value model training.
    value_batch_size: Batch size for value model training.
    value_train_steps_per_epoch: How many steps we run in each epoch to train
      the value model.
    value_evals_per_epoch: Number of value trainer evaluations per RL epoch;
      only affects metric reporting.
    value_eval_steps: Number of value trainer steps per evaluation; only
      affects metric reporting.
    n_shared_layers: How many layers to share between value and policy models.
    added_policy_slice_length: How much longer slices of trajectories should
      be for policy training than for value training; this is useful for TD
      calculations and only affects the length of elements produced for
      policy batches; value batches have maximum length set by
      `max_slice_length` in `**kwargs`.
    n_replay_epochs: How many last epochs to take into the replay buffer;
      only makes sense for off-policy algorithms.
    scale_value_targets: Whether to scale targets for the value function by
      `1 / (1 - gamma)`.
    q_value: Whether to use Q-values as baselines.
    q_value_aggregate_max: Whether to aggregate Q-values with max (or mean).
    q_value_n_samples: Number of samples to average over when calculating
      baselines based on Q-values.
    **kwargs: Arguments for the `PolicyTrainer` superclass.
  """
  self._n_shared_layers = n_shared_layers
  self._value_batch_size = value_batch_size
  self._value_train_steps_per_epoch = value_train_steps_per_epoch
  self._value_evals_per_epoch = value_evals_per_epoch
  self._value_eval_steps = value_eval_steps

  # The two attributes below will be initialized in super().__init__ anyway,
  # but they are needed to construct value batches, which are needed before
  # PolicyTrainer init since policy input creation calls the value model --
  # hence this code.
  self._task = task
  self._max_slice_length = kwargs.get('max_slice_length', 1)

  self._added_policy_slice_length = added_policy_slice_length
  self._n_replay_epochs = n_replay_epochs
  task.set_n_replay_epochs(n_replay_epochs)

  if scale_value_targets:
    self._value_network_scale = 1 / (1 - self._task.gamma)
  else:
    self._value_network_scale = 1

  self._q_value = q_value
  self._q_value_aggregate_max = q_value_aggregate_max
  self._q_value_n_samples = q_value_n_samples
  if q_value:
    value_model = functools.partial(value_model, inject_actions=True)
  self._value_eval_model = value_model(mode='eval')
  self._value_eval_model.init(self._value_model_signature)
  self._value_eval_jit = tl.jit_forward(self._value_eval_model.pure_fn,
                                        math.device_count(), do_mean=False)

  # Initialize policy training.
  super(ActorCriticTrainer, self).__init__(task, **kwargs)

  # Initialize training of the value function.
  value_output_dir = kwargs.get('output_dir', None)
  if value_output_dir is not None:
    value_output_dir = os.path.join(value_output_dir, 'value')
    # If needed, create value_output_dir and missing parent directories.
    if not tf.io.gfile.isdir(value_output_dir):
      tf.io.gfile.makedirs(value_output_dir)
  self._value_inputs = supervised.Inputs(
      train_stream=lambda _: self.value_batches_stream())
  self._value_trainer = supervised.Trainer(
      model=value_model,
      optimizer=value_optimizer,
      lr_schedule=value_lr_schedule,
      loss_fn=tl.L2Loss(),
      inputs=self._value_inputs,
      output_dir=value_output_dir,
      metrics={'value_loss': tl.L2Loss()})