class ActionScheduler:
    def _setup_action_schedule(self, env):
        # eval action epsilon and temperature
        self._eval_act_eps = tf.convert_to_tensor(
            getattr(self, '_eval_act_eps', 0), tf.float32)
        self._eval_act_temp = tf.convert_to_tensor(
            getattr(self, '_eval_act_temp', .5), tf.float32)

        self._schedule_act_eps = getattr(self, '_schedule_act_eps', False)
        self._schedule_act_temp = getattr(self, '_schedule_act_temp', False)

        self._schedule_act_epsilon(env)
        self._schedule_act_temperature(env)

    def _schedule_act_epsilon(self, env):
        """ Schedules action epsilon """
        if self._schedule_act_eps:
            if isinstance(self._act_eps, (list, tuple)):
                logger.info(f'Schedule action epsilon: {self._act_eps}')
                self._act_eps = PiecewiseSchedule(self._act_eps)
            else:
                self._act_eps = compute_act_eps(
                    self._act_eps_type, self._act_eps,
                    getattr(self, '_id', None),
                    getattr(self, '_n_workers', getattr(env, 'n_workers', 1)),
                    env.n_envs)
                if env.action_shape != ():
                    self._act_eps = self._act_eps.reshape(-1, 1)
                self._schedule_act_eps = False  # not run-time scheduling
        print('Action epsilon:', np.reshape(self._act_eps, -1))
        if not isinstance(getattr(self, '_act_eps', None), PiecewiseSchedule):
            self._act_eps = tf.convert_to_tensor(self._act_eps, tf.float32)

    def _schedule_act_temperature(self, env):
        """ Schedules action temperature """
        if self._schedule_act_temp:
            self._act_temp = compute_act_temp(
                self._min_temp, self._max_temp,
                getattr(self, '_n_exploit_envs', 0),
                getattr(self, '_id', None),
                getattr(self, '_n_workers', getattr(env, 'n_workers', 1)),
                env.n_envs)
            self._act_temp = self._act_temp.reshape(-1, 1)
            self._schedule_act_temp = False  # not run-time scheduling
        else:
            self._act_temp = getattr(self, '_act_temp', 1)
        print('Action temperature:', np.reshape(self._act_temp, -1))
        self._act_temp = tf.convert_to_tensor(self._act_temp, tf.float32)

    def _get_eps(self, evaluation):
        """ Gets action epsilon """
        if evaluation:
            eps = self._eval_act_eps
        else:
            if self._schedule_act_eps:
                eps = self._act_eps.value(self.env_step)
                self.store(act_eps=eps)
                eps = tf.convert_to_tensor(eps, tf.float32)
            else:
                eps = self._act_eps
        return eps

    def _get_temp(self, evaluation):
        """ Gets action temperature """
        return self._eval_act_temp if evaluation else self._act_temp
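# Every snippet in this file builds a PiecewiseSchedule from (timestep, value)
# endpoints, optionally with an `outside_value`, and queries it via .value(t).
# The class itself is not defined here; the sketch below is a minimal stand-in,
# NOT the project's implementation, and linear interpolation between endpoints
# is an assumption (it matches the OpenAI Baselines schedule of the same name).
class PiecewiseScheduleSketch:
    def __init__(self, endpoints, outside_value=None):
        self._endpoints = sorted(endpoints)      # [(t0, v0), (t1, v1), ...]
        self._outside_value = outside_value

    def value(self, t):
        for (lt, lv), (rt, rv) in zip(self._endpoints[:-1], self._endpoints[1:]):
            if lt <= t < rt:
                frac = (t - lt) / (rt - lt)      # linear interpolation
                return lv + frac * (rv - lv)
        # t falls outside every segment
        if self._outside_value is not None:
            return self._outside_value
        return self._endpoints[0][1] if t < self._endpoints[0][0] else self._endpoints[-1][1]

# e.g. the kind of list _schedule_act_epsilon above wraps in a PiecewiseSchedule
# (the point values here are hypothetical):
act_eps = PiecewiseScheduleSketch([(0, .5), (100000, .1), (1000000, .01)])
print(act_eps.value(0), act_eps.value(50000), act_eps.value(2000000))  # ~0.5 0.3 0.01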
class Learner(Agent):
    def __init__(self, name, args, env_args, sess_config=None,
                 save=True, log=False, log_tensorboard=False,
                 log_params=False, log_stats=False, device=None):
        super().__init__(name, args, env_args, sess_config=sess_config,
                         save=save, log=log, log_tensorboard=log_tensorboard,
                         log_params=log_params, log_stats=log_stats,
                         device=device)
        del self.buffer

        outside_value = float(args['ac']['policy_end_lr'])
        points = [(0, float(args['ac']['policy_lr'])),
                  (args['ac']['policy_decay_steps'], outside_value)]
        self.policy_lr_scheduler = PiecewiseSchedule(
            points, outside_value=outside_value)

        outside_value = float(args['ac']['value_end_lr'])
        points = [(0, float(args['ac']['value_lr'])),
                  (args['ac']['value_decay_steps'], outside_value)]
        self.value_lr_scheduler = PiecewiseSchedule(
            points, outside_value=outside_value)

    def apply_gradients(self, timestep, *grads):
        policy_lr = self.policy_lr_scheduler.value(timestep)
        val_lr = self.value_lr_scheduler.value(timestep)
        print('policy learning rate:', policy_lr)
        print('value learning rate:', val_lr)
        grads = np.mean(grads, axis=0)
        feed_dict = {g_var: g for g_var, g in zip(self.ac.grads, grads)}
        feed_dict.update({self.ac.policy_lr: policy_lr, self.ac.v_lr: val_lr})
        fetches = [self.ac.opt_step]
        fetches.append([self.ac.policy_optop, self.ac.v_optop])
        # do not log_tensorboard, use record_stats if required
        learn_step, _ = self.sess.run(fetches, feed_dict=feed_dict)
        if hasattr(self, 'saver') and learn_step % 100 == 0:
            self.save()
        return self.get_weights()

    def get_weights(self):
        return self.variables.get_flat()

    def record_stats(self, score_mean, score_std, epslen_mean,
                     entropy, approx_kl, clip_frac):
        log_info = dict(score_mean=score_mean, score_std=score_std,
                        epslen_mean=epslen_mean, entropy=entropy,
                        approx_kl=approx_kl, clip_frac=clip_frac)
        # a wrapper since ray does not support (*args)
        super().record_stats(**log_info)

    def print_construction_complete(self):
        pwc('Learner has been constructed.', color='cyan')
class Agent(OffPolicyOperation):
    """ Interface """
    def __init__(self, name, args, env_args, buffer_args, sess_config=None,
                 save=False, log=False, log_tensorboard=False,
                 log_params=False, log_stats=False, device=None):
        self.critic_loss_type = args['critic']['loss_type']
        self.polyak = args['polyak'] if 'polyak' in args else .995

        # learning rate schedule
        self.schedule_lr = 'schedule_lr' in args and args['schedule_lr']
        if self.schedule_lr:
            self.actor_lr_scheduler = PiecewiseSchedule(
                [(0, 1e-4), (150000, 1e-4), (300000, 5e-5)], outside_value=5e-5)
            self.critic_lr_scheduler = PiecewiseSchedule(
                [(0, 3e-4), (150000, 3e-4), (300000, 5e-5)], outside_value=5e-5)

        super().__init__(name, args, env_args, buffer_args,
                         sess_config=sess_config, save=save, log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params, log_stats=log_stats,
                         device=device)

    @property
    def main_variables(self):
        return self.actor.trainable_variables + self.critic.trainable_variables

    @property
    def target_variables(self):
        return self.target_actor.trainable_variables + self.target_critic.trainable_variables

    """ Implementation """
    def _build_graph(self):
        if self.device and 'GPU' in self.device:
            with tf.device('/CPU:0'):
                self.data = self._prepare_data(self.buffer)
        else:
            self.data = self._prepare_data(self.buffer)

        self.actor, self.critic, self.target_actor, self.target_critic = \
            self._create_main_target_actor_critic()
        self.action_det = self.action = self.actor.action

        self._compute_loss()

        _, self.actor_lr, self.opt_step, _, self.actor_opt_op = self.actor._optimization_op(
            self.actor_loss, opt_step=True, schedule_lr=self.schedule_lr)
        _, self.critic_lr, _, _, self.critic_opt_op = self.critic._optimization_op(
            self.critic_loss, schedule_lr=self.schedule_lr)
        self.opt_op = tf.group(self.actor_opt_op, self.critic_opt_op)

        # target net operations
        self.init_target_op, self.update_target_op = self._target_net_ops()

        self._log_loss()

    def _create_main_target_actor_critic(self):
        # main actor-critic
        actor, critic = self._create_actor_critic(is_target=False)
        # target actor-critic
        target_actor, target_critic = self._create_actor_critic(is_target=True)
        return actor, critic, target_actor, target_critic

    def _create_actor_critic(self, is_target):
        log_tensorboard = False if is_target else self.log_tensorboard
        log_params = False if is_target else self.log_params

        scope_name = 'target' if is_target else 'main'
        state = self.data['next_state'] if is_target else self.data['state']
        scope_prefix = self.name + '/' + scope_name
        self.args['actor']['max_action_repetitions'] = self.max_action_repetitions

        with tf.variable_scope(scope_name):
            actor = Actor('actor', self.args['actor'], self.graph,
                          state, self.action_dim,
                          scope_prefix=scope_prefix,
                          log_tensorboard=log_tensorboard,
                          log_params=log_params)
            critic = DoubleCritic('critic', self.args['critic'], self.graph,
                                  state, self.data['action'], actor.action,
                                  self.action_dim,
                                  scope_prefix=scope_prefix,
                                  log_tensorboard=log_tensorboard,
                                  log_params=log_params)
        return actor, critic

    def _compute_loss(self):
        with tf.name_scope('loss'):
            self.actor_loss = self._actor_loss()
            self.priority, self.critic_loss = self._critic_loss()
            self.loss = self.actor_loss + self.critic_loss

    def _actor_loss(self):
        with tf.name_scope('actor_loss'):
            return -tf.reduce_mean(
                self.data['IS_ratio'] * self.critic.Q1_with_actor)

    def _critic_loss(self):
        with tf.name_scope('critic_loss'):
            target_Q = n_step_target(self.data['reward'], self.data['done'],
                                     self.target_critic.Q_with_actor,
                                     self.gamma, self.data['steps'])
            Q1_error = tf.abs(target_Q - self.critic.Q1, name='Q1_error')
            Q2_error = tf.abs(target_Q - self.critic.Q2, name='Q2_error')

            loss_func = huber_loss if self.critic_loss_type == 'huber' else tf.square
            TD_squared = loss_func(Q1_error) + loss_func(Q2_error)

            critic_loss = tf.reduce_mean(self.data['IS_ratio'] * TD_squared)
            priority = self._compute_priority((Q1_error + Q2_error) / 2.)

        return priority, critic_loss

    def _target_net_ops(self):
        with tf.name_scope('target_net_op'):
            target_main_var_pairs = list(
                zip(self.target_variables, self.main_variables))
            init_target_op = list(
                map(lambda v: tf.assign(v[0], v[1], name='init_target_op'),
                    target_main_var_pairs))
            update_target_op = list(
                map(lambda v: tf.assign(
                        v[0], self.polyak * v[0] + (1. - self.polyak) * v[1],
                        name='update_target_op'),
                    target_main_var_pairs))
        return init_target_op, update_target_op

    def _initialize_target_net(self):
        self.sess.run(self.init_target_op)

    def _update_target_net(self):
        self.sess.run(self.update_target_op)

    def _log_loss(self):
        if self.log_tensorboard:
            with tf.name_scope('info'):
                tf.compat.v1.summary.scalar('actor_loss_', self.actor_loss)
                tf.compat.v1.summary.scalar('critic_loss_', self.critic_loss)
                stats_summary('Q_with_actor', self.critic.Q_with_actor,
                              max=True, hist=True)
                stats_summary('reward', self.data['reward'],
                              min=True, hist=True)
                stats_summary('priority', self.priority, hist=True, max=True)

    def _get_feeddict(self, t):
        return {
            self.actor_lr: self.actor_lr_scheduler.value(t),
            self.critic_lr: self.critic_lr_scheduler.value(t)
        }
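# n_step_target(...) is used by the critic losses in this file but not defined here.
# Below is a minimal numpy stand-in consistent with how it is called (reward, done,
# bootstrap value at the n-th step, gamma, steps); this is an assumption, not the
# repository's implementation, and it assumes `reward` already folds in the discounted
# sum of the first n rewards as is usual for n-step replay.
import numpy as np

def n_step_target_sketch(reward, done, next_value, gamma, steps):
    return reward + gamma**steps * (1. - done) * next_value

# a terminal 1-step transition and a non-terminal 3-step transition
reward = np.array([1., 2.5])
done = np.array([1., 0.])
next_value = np.array([10., 4.])
steps = np.array([1, 3])
print(n_step_target_sketch(reward, done, next_value, gamma=.99, steps=steps))
# -> [1.0, 2.5 + 0.99**3 * 4.0]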
class Agent(OffPolicyOperation):
    """ Interface """
    def __init__(self, name, args, env_args, buffer_args, sess_config=None,
                 save=False, log=False, log_tensorboard=False,
                 log_params=False, log_stats=False, device=None):
        self.raw_temperature = args['temperature']
        self.critic_loss_type = args['loss_type']

        # learning rate schedule
        self.schedule_lr = 'schedule_lr' in args and args['schedule_lr']
        if self.schedule_lr:
            self.actor_lr_scheduler = PiecewiseSchedule(
                [(0, 1e-4), (150000, 1e-4), (300000, 5e-5)], outside_value=5e-5)
            self.Q_lr_scheduler = PiecewiseSchedule(
                [(0, 3e-4), (150000, 3e-4), (300000, 1e-4)], outside_value=1e-4)
            self.alpha_lr_scheduler = PiecewiseSchedule(
                [(0, 1e-4), (150000, 1e-4), (300000, 5e-5)], outside_value=5e-5)

        super().__init__(name, args, env_args, buffer_args,
                         sess_config=sess_config, save=save, log=log,
                         log_tensorboard=log_tensorboard,
                         log_params=log_params, log_stats=log_stats,
                         device=device)

    @override(OffPolicyOperation)
    def _build_graph(self):
        if 'gpu' in self.device:
            with tf.device('/cpu:0'):
                self.data = self._prepare_data(self.buffer)
        else:
            self.data = self._prepare_data(self.buffer)

        self.actor = self._actor()
        self._action_surrogate()
        self.critic = self._critic()
        if self.raw_temperature == 'auto':
            self.temperature = self._auto_temperature()
            self.alpha = self.temperature.alpha
            self.next_alpha = self.temperature.next_alpha
        else:
            # reward scaling indirectly affects the policy temperature;
            # we neutralize the effect by scaling the temperature here.
            # see my blog for more info
            # https://xlnwel.github.io/blog/reinforcement%20learning/SAC/
            self.alpha = self.raw_temperature * self.buffer.reward_scale
            self.next_alpha = self.alpha

        self._compute_loss()
        self._optimize()
        self._log_loss()

    def _actor(self):
        policy_args = self.args['Policy']
        policy_args['max_action_repetitions'] = self.max_action_repetitions
        policy_args['polyak'] = self.args['polyak']
        return SoftPolicy('SoftPolicy', policy_args, self.graph,
                          self.data['state'], self.data['next_state'],
                          self.action_dim,
                          scope_prefix=self.name,
                          log_tensorboard=self.log_tensorboard,
                          log_params=self.log_params)

    def _action_surrogate(self):
        self.action = self.actor.action
        self.action_det = self.actor.action_det
        self.next_action = self.actor.next_action
        self.logpi = self.actor.logpi
        self.next_logpi = self.actor.next_logpi

    def _critic(self):
        q_args = self.args['Q']
        q_args['polyak'] = self.args['polyak']
        return SoftQ('SoftQ', q_args, self.graph,
                     self.data['state'], self.data['next_state'],
                     self.data['action'], self.action, self.next_action,
                     scope_prefix=self.name,
                     log_tensorboard=self.log_tensorboard,
                     log_params=self.log_params)

    def _auto_temperature(self):
        return Temperature('Temperature', self.args['Temperature'], self.graph,
                           self.data['state'], self.data['next_state'],
                           self.action, self.next_action,
                           scope_prefix=self.name,
                           log_tensorboard=self.log_tensorboard,
                           log_params=self.log_params)

    def _compute_loss(self):
        with tf.name_scope('loss'):
            if self.raw_temperature == 'auto':
                self.alpha_loss = self._alpha_loss()
                self.loss = self.alpha_loss
            else:
                self.loss = 0
            self.actor_loss = self._actor_loss()
            self.priority, self.Q1_loss, self.Q2_loss, self.critic_loss = self._critic_loss()
            self.loss += self.actor_loss + self.critic_loss

    def _alpha_loss(self):
        target_entropy = -self.action_dim
        with tf.name_scope('alpha_loss'):
            return -tf.reduce_mean(
                self.data['IS_ratio'] * self.temperature.log_alpha *
                tf.stop_gradient(self.logpi + target_entropy))

    def _actor_loss(self):
        with tf.name_scope('actor_loss'):
            return tf.reduce_mean(
                self.data['IS_ratio'] *
                (self.alpha * self.logpi - self.critic.Q1_with_actor))

    def _critic_loss(self):
        with tf.name_scope('critic_loss'):
            n_V = tf.subtract(self.critic.next_Q_with_actor,
                              self.next_alpha * self.next_logpi, name='n_V')
            target_Q = n_step_target(self.data['reward'], self.data['done'],
                                     n_V, self.gamma, self.data['steps'])
            Q1_error = tf.abs(target_Q - self.critic.Q1, name='Q1_error')
            Q2_error = tf.abs(target_Q - self.critic.Q2, name='Q2_error')

            Q1_loss = tf.reduce_mean(self.data['IS_ratio'] * Q1_error**2)
            Q2_loss = tf.reduce_mean(self.data['IS_ratio'] * Q2_error**2)
            critic_loss = Q1_loss + Q2_loss

            priority = self._compute_priority((Q1_error + Q2_error) / 2.)

        return priority, Q1_loss, Q2_loss, critic_loss

    def _optimize(self):
        with tf.name_scope('optimizer'):
            opt_ops = []
            if self.raw_temperature == 'auto':
                _, self.alpha_lr, _, _, temp_op = self.temperature._optimization_op(
                    self.alpha_loss, schedule_lr=self.schedule_lr)
                opt_ops.append(temp_op)
            _, self.actor_lr, self.opt_step, _, actor_opt_op = self.actor._optimization_op(
                self.actor_loss, opt_step=True, schedule_lr=self.schedule_lr)
            _, self.Q_lr, _, _, Q_opt_op = self.critic._optimization_op(
                self.critic_loss, schedule_lr=self.schedule_lr)
            opt_ops += [actor_opt_op, Q_opt_op]
            self.opt_op = tf.group(*opt_ops)

    @override(OffPolicyOperation)
    def _initialize_target_net(self):
        self.sess.run(self.actor.init_target_op + self.critic.init_target_op)

    @override(OffPolicyOperation)
    def _update_target_net(self):
        self.sess.run(self.actor.update_target_op + self.critic.update_target_op)

    @override(OffPolicyOperation)
    def _get_feeddict(self, t):
        return {
            self.actor_lr: self.actor_lr_scheduler.value(t),
            self.Q_lr: self.Q_lr_scheduler.value(t),
            self.alpha_lr: self.alpha_lr_scheduler.value(t)
        }

    def _log_loss(self):
        if self.log_tensorboard:
            with tf.name_scope('info'):
                stats_summary('reward', self.data['reward'],
                              min=True, max=True, hist=True)
                with tf.name_scope('actor'):
                    stats_summary('orig_action', self.actor.orig_action)
                    stats_summary('entropy', self.actor.action_distribution.entropy())
                    stats_summary('action_std', self.actor.action_distribution.std)
                    stats_summary('orig_logpi', self.actor.orig_logpi)
                    tf.compat.v1.summary.scalar('orig_logpi_0', self.actor.orig_logpi[0][0])
                    stats_summary('action', self.actor.action)
                    stats_summary('logpi', self.actor.logpi)
                    tf.compat.v1.summary.scalar('actor_loss_', self.actor_loss)
                with tf.name_scope('critic'):
                    stats_summary('Q1_with_actor', self.critic.Q1_with_actor,
                                  min=True, max=True)
                    stats_summary('Q2_with_actor', self.critic.Q2_with_actor,
                                  min=True, max=True)
                    if self.buffer_type == 'proportional':
                        stats_summary('priority', self.priority,
                                      std=True, max=True, hist=True)
                    tf.compat.v1.summary.scalar('Q1_loss_', self.Q1_loss)
                    tf.compat.v1.summary.scalar('Q2_loss_', self.Q2_loss)
                    tf.compat.v1.summary.scalar('critic_loss_', self.critic_loss)
                if self.raw_temperature == 'auto':
                    with tf.name_scope('alpha'):
                        stats_summary('alpha', self.alpha, std=True)
                        tf.compat.v1.summary.scalar('alpha_loss', self.alpha_loss)
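# Illustration (not source code) of why _alpha_loss above moves the temperature the
# right way. Ignoring the (positive) IS_ratio weights, the loss is
# -log_alpha * stop_gradient(logpi + target_entropy), so
# d(loss)/d(log_alpha) = -mean(logpi + target_entropy). Gradient descent therefore
# raises log_alpha (a larger entropy bonus) exactly when the entropy estimate -logpi
# drops below target_entropy = -action_dim, and lowers it otherwise. Numbers are made up.
import numpy as np

action_dim = 6
target_entropy = -action_dim
logpi = np.array([-4., -5.5, -3.])          # hypothetical log-probs of sampled actions

grad_log_alpha = -np.mean(logpi + target_entropy)
print(grad_log_alpha)
# positive here: the entropy estimate (~4.2) is above the target (-6),
# so gradient descent decreases log_alpha and the entropy bonus shrinks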
def __init__(self, name, args, env_args, sess_config=None,
             save=False, log=False, log_tensorboard=False,
             log_params=False, log_stats=False, device=None,
             reuse=None, graph=None):
    # hyperparameters
    self.gamma = args['gamma']
    self.gae_discount = self.gamma * args['lam']
    self.n_minibatches = args['n_minibatches']
    self.use_lstm = args['ac']['use_lstm']
    self.entropy_coef = args['ac']['entropy_coef']
    self.n_value_updates = args['ac']['n_value_updates']
    self.minibatch_idx = 0

    # environment info
    self.env_vec = create_gym_env(env_args)
    self.seq_len = self.env_vec.max_episode_steps

    self.buffer = PPOBuffer(env_args['n_workers'] * env_args['n_envs'],
                            self.seq_len, self.n_minibatches,
                            self.env_vec.state_shape, np.float32,
                            self.env_vec.action_shape, np.float32)

    super().__init__(name, args, sess_config=sess_config,
                     save=save, log=log,
                     log_tensorboard=log_tensorboard,
                     log_params=log_params, log_stats=log_stats,
                     device=device, reuse=reuse, graph=graph)

    self.schedule_lr = 'schedule_lr' in args and args['schedule_lr']
    if self.schedule_lr:
        self.actor_lr_scheduler = PiecewiseSchedule(
            [(0, 1e-4), (400000, 1e-4), (600000, 5e-5)], outside_value=5e-5)
        self.critic_lr_scheduler = PiecewiseSchedule(
            [(0, 3e-4), (400000, 3e-4), (600000, 5e-5)], outside_value=5e-5)

    if self.use_lstm:
        # don't distinguish lstm at training from that at running
        # since training is done after running
        self.last_lstm_state = None

    with self.graph.as_default():
        self.variables = TensorFlowVariables(
            [self.ac.policy_loss, self.ac.V_loss], self.sess)
class PrioritizedReplay(Replay):
    """ Interface """
    def __init__(self, args, state_shape, action_dim):
        super().__init__(args, state_shape, action_dim)
        self.data_structure = None

        # params for prioritized replay
        self.alpha = float(args['alpha']) if 'alpha' in args else .5
        self.beta = float(args['beta0']) if 'beta0' in args else .4
        self.beta_schedule = PiecewiseSchedule(
            [(0, args['beta0']), (float(args['beta_steps']), 1.)],
            outside_value=1.)
        self.epsilon = float(args['epsilon']) if 'epsilon' in args else 1e-4

        self.top_priority = 2.
        self.to_update_priority = args['to_update_priority'] if 'to_update_priority' in args else True

        self.sample_i = 0  # count how many times self.sample is called

        init_buffer(self.memory, self.capacity, state_shape, action_dim,
                    self.n_steps == 1)

        # Code for single agent
        if self.n_steps > 1:
            self.tb_capacity = args['tb_capacity']
            self.tb_idx = 0
            self.tb_full = False
            self.tb = {}
            init_buffer(self.tb, self.tb_capacity, state_shape, action_dim, True)

    @override(Replay)
    def sample(self):
        assert_colorize(self.good_to_learn,
                        'There are not sufficient transitions to start learning --- '
                        f'transitions in buffer: {len(self)}\t'
                        f'minimum required size: {self.min_size}')
        with self.locker:
            samples = self._sample()
            self.sample_i += 1
            self._update_beta()
        return samples

    @override(Replay)
    def add(self, state, action, reward, done):
        if self.n_steps > 1:
            self.tb['priority'][self.tb_idx] = self.top_priority
        else:
            self.memory['priority'][self.mem_idx] = self.top_priority
            self.data_structure.update(self.top_priority, self.mem_idx)
        super()._add(state, action, reward, done)

    def update_priorities(self, priorities, saved_mem_idxs):
        with self.locker:
            if self.to_update_priority:
                self.top_priority = max(self.top_priority, np.max(priorities))
            for priority, mem_idx in zip(priorities, saved_mem_idxs):
                self.data_structure.update(priority, mem_idx)

    """ Implementation """
    def _update_beta(self):
        self.beta = self.beta_schedule.value(self.sample_i)

    @override(Replay)
    def _merge(self, local_buffer, length):
        end_idx = self.mem_idx + length
        assert np.all(local_buffer['priority'][:length])
        for idx, mem_idx in enumerate(range(self.mem_idx, end_idx)):
            self.data_structure.update(local_buffer['priority'][idx],
                                       mem_idx % self.capacity)
        super()._merge(local_buffer, length)

    def _compute_IS_ratios(self, probabilities):
        IS_ratios = (np.min(probabilities) / probabilities)**self.beta
        return IS_ratios
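# Sketch (not source code) of how the beta annealing above behaves: with points
# [(0, beta0), (beta_steps, 1.)] and outside_value=1., beta rises from beta0 to 1
# over the first beta_steps calls to sample() and then stays at 1, assuming the
# schedule interpolates linearly between its endpoints.
def annealed_beta(sample_i, beta0=.4, beta_steps=1e5):
    frac = min(sample_i / beta_steps, 1.)
    return beta0 + (1. - beta0) * frac

print(annealed_beta(0), annealed_beta(50_000), annealed_beta(200_000))  # ~0.4 0.7 1.0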
class PERBase(Replay):
    """ Base class for PER, left in case one day I implement rank-based PER """
    def _add_attributes(self):
        super()._add_attributes()
        self._top_priority = 1.
        self._data_structure = None

        self._use_is_ratio = getattr(self, '_use_is_ratio', True)
        self._beta = float(getattr(self, 'beta0', .4))
        if getattr(self, '_beta_schedule', None):
            assert isinstance(self._beta_schedule, list)
            self._beta_schedule = PiecewiseSchedule(self._beta_schedule)

        self._sample_i = 0  # count how many times self._sample is called

    @override(Replay)
    def sample(self, batch_size=None):
        assert self.good_to_learn(), (
            'There are not sufficient transitions to start learning --- '
            f'transitions in buffer({len(self)}) vs '
            f'minimum required size({self._min_size})')
        samples = self._sample(batch_size=batch_size)
        self._sample_i += 1
        if hasattr(self, '_beta_schedule'):
            self._update_beta()
        return samples

    @override(Replay)
    def add(self, **kwargs):
        super().add(**kwargs)
        # super().add updates self._mem_idx
        if self._n_envs == 1:
            self._data_structure.update(self._mem_idx - 1, self._top_priority)

    def update_priorities(self, priorities, idxes):
        assert not np.any(np.isnan(priorities)), priorities
        np.testing.assert_array_less(0, priorities)
        if self._to_update_top_priority:
            self._top_priority = max(self._top_priority, np.max(priorities))
        self._data_structure.batch_update(idxes, priorities)

    """ Implementation """
    def _update_beta(self):
        self._beta = self._beta_schedule.value(self._sample_i)

    @override(Replay)
    def _merge(self, local_buffer, length):
        priority = local_buffer.pop('priority')[:length] \
            if 'priority' in local_buffer else self._top_priority * np.ones(length)
        np.testing.assert_array_less(0, priority)
        # update sum tree
        mem_idxes = np.arange(self._mem_idx, self._mem_idx + length) % self._capacity
        self._data_structure.batch_update(mem_idxes, priority)
        # update memory
        super()._merge(local_buffer, length)

    def _compute_IS_ratios(self, probabilities):
        """
        w = (N * p)**(-beta)
        max(w) = max(N * p)**(-beta) = (N * min(p))**(-beta)
        norm_w = w / max(w) = (N*p)**(-beta) / (N * min(p))**(-beta) = (min(p) / p)**beta
        """
        IS_ratios = (np.min(probabilities) / probabilities)**self._beta
        return IS_ratios
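# PERBase delegates all priority bookkeeping to self._data_structure, which is not
# defined in this file. Below is a minimal, self-contained sum-tree sketch -- an
# assumption, not the repository's implementation -- supporting the two calls PERBase
# makes, update(idx, priority) and batch_update(idxes, priorities), plus prefix-sum
# lookup for proportional sampling. (Note the PrioritizedReplay classes elsewhere in
# this file pass (priority, idx) to update() instead.)
import numpy as np

class SumTreeSketch:
    def __init__(self, capacity):
        self._capacity = capacity
        # internal nodes in [1, capacity), leaves in [capacity, 2*capacity)
        self._tree = np.zeros(2 * capacity)

    def update(self, idx, priority):
        node = idx + self._capacity
        self._tree[node] = priority
        node //= 2
        while node >= 1:
            # each internal node stores the sum of its two children
            self._tree[node] = self._tree[2 * node] + self._tree[2 * node + 1]
            node //= 2

    def batch_update(self, idxes, priorities):
        for idx, p in zip(idxes, priorities):
            self.update(idx, p)

    def total(self):
        return self._tree[1]

    def find(self, prefix_sum):
        """Return the leaf index whose cumulative-priority interval contains prefix_sum."""
        node = 1
        while node < self._capacity:
            left = 2 * node
            if prefix_sum <= self._tree[left]:
                node = left
            else:
                prefix_sum -= self._tree[left]
                node = left + 1
        return node - self._capacity

# usage: sample an index with probability proportional to its priority
tree = SumTreeSketch(capacity=8)
tree.batch_update(np.arange(4), np.array([1., 2., 3., 4.]))
idx = tree.find(np.random.uniform(0, tree.total()))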
class PrioritizedReplay(Replay):
    """ Interface """
    def __init__(self, args, state_space, action_dim):
        super().__init__(args, state_space, action_dim)
        self.data_structure = None

        # params for prioritized replay
        self.alpha = float(args['alpha']) if 'alpha' in args else .5
        self.beta = float(args['beta0']) if 'beta0' in args else .4
        self.beta_schedule = PiecewiseSchedule(
            [(0, args['beta0']), (float(args['beta_steps']), 1.)],
            outside_value=1.)
        self.epsilon = float(args['epsilon']) if 'epsilon' in args else 1e-4

        self.top_priority = 2.

        self.sample_i = 0  # count how many times self.sample is called

    @override(Replay)
    def sample(self):
        assert_colorize(self.good_to_learn,
                        'There are not sufficient transitions to start learning --- '
                        f'transitions in buffer: {len(self)}\t'
                        f'minimum required size: {self.min_size}')
        with self.locker:
            samples = self._sample()
            self.sample_i += 1
            self._update_beta()
        return samples

    @override(Replay)
    def add(self, state, action, reward, done):
        if self.n_steps > 1:
            self.tb['priority'][self.tb_idx] = self.top_priority
        else:
            self.memory['priority'][self.mem_idx] = self.top_priority
        super()._add(state, action, reward, done)

    def update_priorities(self, priorities, saved_mem_idxs):
        with self.locker:
            for priority, mem_idx in zip(priorities, saved_mem_idxs):
                self.data_structure.update(priority, mem_idx)

    """ Implementation """
    def _update_beta(self):
        self.beta = self.beta_schedule.value(self.sample_i)

    @override(Replay)
    def _merge(self, local_buffer, length, start=0):
        end_idx = self.mem_idx + length
        for idx, mem_idx in enumerate(range(self.mem_idx, end_idx)):
            self.data_structure.update(local_buffer['priority'][idx],
                                       mem_idx % self.capacity)
        super()._merge(local_buffer, length, start)

    def _compute_IS_ratios(self, N, probabilities):
        IS_ratios = np.power(probabilities * N, -self.beta)
        IS_ratios /= np.max(IS_ratios)  # normalize ratios to avoid scaling the update upward
        return IS_ratios
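# The two _compute_IS_ratios variants in this file compute the same normalized weights:
# (N*p)**(-beta) / max((N*p)**(-beta)) == (min(p)/p)**beta, as derived in the PERBase
# docstring above. A quick numerical check with arbitrary probabilities:
import numpy as np

p = np.array([.1, .25, .05, .6])   # sampling probabilities from the sum tree
N, beta = len(p), .4

v1 = np.power(p * N, -beta)
v1 /= np.max(v1)                   # the normalized form used directly above
v2 = (np.min(p) / p)**beta         # the form used in PERBase._compute_IS_ratios
assert np.allclose(v1, v2)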