class ActorCriticAgent(MemoryAgent):
    """
    Abstract class, unifies deep actor critic functionality
    Handles on_step callbacks, either updating current batch
    or executing one training step if the batch is ready
    Extending classes only need to implement loss_fn method
    """
    def __init__(
        self,
        obs_spec: Spec,
        act_spec: Spec,
        model_fn: ModelBuilder = None,
        policy_cls: PolicyType = None,
        sess_mgr: SessionManager = None,
        optimizer: tf.train.Optimizer = None,
        value_coef=DEFAULTS['value_coef'],
        entropy_coef=DEFAULTS['entropy_coef'],
        traj_len=DEFAULTS['traj_len'],
        batch_sz=DEFAULTS['batch_sz'],
        discount=DEFAULTS['discount'],
        gae_lambda=DEFAULTS['gae_lambda'],
        clip_rewards=DEFAULTS['clip_rewards'],
        clip_grads_norm=DEFAULTS['clip_grads_norm'],
        normalize_returns=DEFAULTS['normalize_returns'],
        normalize_advantages=DEFAULTS['normalize_advantages'],
    ):
        MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)

        if not sess_mgr:
            sess_mgr = SessionManager()

        if not optimizer:
            optimizer = tf.train.AdamOptimizer(learning_rate=DEFAULTS['learning_rate'])

        self.sess_mgr = sess_mgr
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.clip_rewards = clip_rewards
        self.normalize_returns = normalize_returns
        self.normalize_advantages = normalize_advantages

        self.model = model_fn(obs_spec, act_spec)
        self.value = self.model.outputs[-1]
        self.policy = policy_cls(act_spec, self.model.outputs[:-1])
        self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

        grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
        self.grads_norm = tf.global_norm(grads)
        if clip_grads_norm > 0.:
            grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
        self.train_op = optimizer.apply_gradients(zip(grads, vars), global_step=sess_mgr.global_step)
        self.minimize_ops = self.make_minimize_ops()

        sess_mgr.restore_or_init()
        self.n_batches = sess_mgr.start_step
        self.start_step = sess_mgr.start_step * traj_len
        self.logger = Logger()

    def get_action_and_value(self, obs):
        return self.sess_mgr.run([self.policy.sample, self.value], self.model.inputs, obs)

    def get_action(self, obs):
        return self.sess_mgr.run(self.policy.sample, self.model.inputs, obs)

    def on_step(self, step, obs, action, intrinsic_rew, game_reward, done, value=None):
        MemoryAgent.on_step(self, step, obs, action, intrinsic_rew, game_reward, done, value)
        self.logger.on_step(step, intrinsic_rew, game_reward, done)

        if not self.batch_ready():
            return

        next_values = self.sess_mgr.run(self.value, self.model.inputs, self.last_obs)
        adv, returns = self.compute_advantages_and_returns(next_values)

        loss_terms, grads_norm = self.minimize(adv, returns)

        self.sess_mgr.on_update(self.n_batches)
        self.logger.on_update(self.n_batches, loss_terms, grads_norm, returns, adv, next_values)

    def minimize(self, advantages, returns):
        inputs = self.obs + self.acts + [advantages, returns]
        inputs = [a.reshape(-1, *a.shape[2:]) for a in inputs]
        tf_inputs = self.model.inputs + self.policy.inputs + self.loss_inputs

        loss_terms, grads_norm, *_ = self.sess_mgr.run(self.minimize_ops, tf_inputs, inputs)

        return loss_terms, grads_norm

    def compute_advantages_and_returns(self, bootstrap_value):
        """
        GAE can help with reducing variance of policy gradient estimates
        """
        if self.clip_rewards > 0.0:
            np.clip(self.rewards, -self.clip_rewards, self.clip_rewards, out=self.rewards)

        rewards = self.rewards.copy()
        rewards[-1] += (1 - self.dones[-1]) * self.discount * bootstrap_value

        masked_discounts = self.discount * (1 - self.dones)

        returns = self.discounted_cumsum(rewards, masked_discounts)

        if self.gae_lambda > 0.:
            values = np.append(self.values, np.expand_dims(bootstrap_value, 0), axis=0)
            # d_t = r_t + g * V(s_{t+1}) - V(s_t)
            deltas = self.rewards + masked_discounts * values[1:] - values[:-1]
            adv = self.discounted_cumsum(deltas, self.gae_lambda * masked_discounts)
        else:
            adv = returns - self.values

        if self.normalize_advantages:
            adv = (adv - adv.mean()) / (adv.std() + 1e-10)
        if self.normalize_returns:
            returns = (returns - returns.mean()) / (returns.std() + 1e-10)

        return adv, returns

    def on_start(self):
        self.logger.on_start()

    def on_finish(self):
        self.logger.on_finish()

    def make_minimize_ops(self):
        ops = [self.loss_terms, self.grads_norm]
        if self.sess_mgr.training_enabled:
            ops.append(self.train_op)
        # appending extra model update ops (e.g. running stats)
        # note: this will most likely break if model.compile() is used
        ops.extend(self.model.get_updates_for(None))
        return ops

    @staticmethod
    def discounted_cumsum(x, discount):
        y = np.zeros_like(x)
        y[-1] = x[-1]
        for t in range(x.shape[0] - 2, -1, -1):
            y[t] = x[t] + discount[t] * y[t + 1]
        return y

    @abstractmethod
    def loss_fn(self): ...
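
# Illustration only (not used by the agent): a minimal, single-environment
# sketch of how compute_advantages_and_returns combines discounted_cumsum with
# GAE. The reward/value numbers below are made up; in the agent the arrays are
# 2-D (time x num_envs) and bootstrap_value is a per-env vector.
import numpy as np


def _gae_example(gamma=0.99, gae_lambda=0.95):
    rewards = np.array([1.0, 0.0, 0.5, 1.0])   # r_t along one trajectory
    dones = np.array([0.0, 0.0, 0.0, 1.0])     # episode terminates on the last step
    values = np.array([0.9, 0.8, 0.7, 0.6])    # V(s_t) from the critic
    bootstrap_value = 0.0                      # V of the observation after the last step

    # discount is zeroed at terminal steps so nothing flows across episode boundaries
    masked_discounts = gamma * (1 - dones)

    # n-step returns: bootstrap only if the trajectory did not end with done
    rs = rewards.copy()
    rs[-1] += (1 - dones[-1]) * gamma * bootstrap_value
    returns = ActorCriticAgent.discounted_cumsum(rs, masked_discounts)

    # GAE: d_t = r_t + gamma * V(s_{t+1}) - V(s_t), then a
    # (gamma * lambda)-discounted right-to-left sum of the deltas
    vals = np.append(values, bootstrap_value)
    deltas = rewards + masked_discounts * vals[1:] - values
    adv = ActorCriticAgent.discounted_cumsum(deltas, gae_lambda * masked_discounts)

    return returns, adv
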
class ActorCriticAgent(MemoryAgent):
    """
    Abstract class, unifies deep actor critic functionality
    Handles on_step callbacks, either updating current batch
    or executing one training step if the batch is ready
    Extending classes only need to implement loss_fn method
    """
    def __init__(
        self,
        obs_spec: Spec,
        act_spec: Spec,
        model_variable_scope=DEFAULTS['model_variable_scope'],
        model_fn: ModelBuilder = None,
        policy_cls: PolicyType = None,
        sess_mgr: SessionManager = None,
        optimizer: tf.train.Optimizer = None,
        value_coef=DEFAULTS['value_coef'],
        entropy_coef=DEFAULTS['entropy_coef'],
        traj_len=DEFAULTS['traj_len'],
        batch_sz=DEFAULTS['batch_sz'],
        discount=DEFAULTS['discount'],
        gae_lambda=DEFAULTS['gae_lambda'],
        clip_rewards=DEFAULTS['clip_rewards'],
        clip_grads_norm=DEFAULTS['clip_grads_norm'],
        normalize_returns=DEFAULTS['normalize_returns'],
        normalize_advantages=DEFAULTS['normalize_advantages'],
        **kwargs,
    ):
        MemoryAgent.__init__(self, obs_spec, act_spec, traj_len, batch_sz)
        print(LOGGING_MSG_HEADER + ": the traj_len is {} and batch_sz is {}".format(traj_len, batch_sz))

        if not sess_mgr:
            sess_mgr = SessionManager()

        self.subenvs = subenvs = kwargs.get('subenvs', [])

        # one optimizer per subenv; fall back to Adam if none is provided
        if optimizer:
            optimizers = [copy.deepcopy(optimizer) for subenv in subenvs]
        else:
            optimizer = tf.train.AdamOptimizer(learning_rate=DEFAULTS['learning_rate'])
            optimizers = [
                tf.train.AdamOptimizer(learning_rate=DEFAULTS['learning_rate'])
                for subenv in subenvs
            ]

        self.sess_mgr = sess_mgr
        # the effective variable scope comes from the session manager,
        # not from the model_variable_scope argument
        self.model_variable_scope = self.sess_mgr.model_variable_scope
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.discount = discount
        self.gae_lambda = gae_lambda
        self.clip_rewards = clip_rewards
        self.normalize_returns = normalize_returns
        self.normalize_advantages = normalize_advantages
        self.traj_len = traj_len
        self.batch_sz = batch_sz

        print(LOGGING_MSG_HEADER + ": the current model_variable_scope is", self.model_variable_scope)

        # extend A2C to support multiple sub-agents: one model, policy,
        # loss and optimizer per subenv, stored in parallel lists
        with sess_mgr.sess.graph.as_default():
            # note this is name_scope as opposed to variable_scope, important
            with tf.name_scope(self.sess_mgr.main_tf_vs.original_name_scope):
                if subenvs:
                    from collections import defaultdict
                    self.subenv_dict = defaultdict(list)
                    print(LOGGING_MSG_HEADER + ": Creating models for each individual subenv:", subenvs)

                    for i, subenv in enumerate(subenvs):
                        subenv_model = model_fn(obs_spec, act_spec)
                        self.subenv_dict['models'].append(subenv_model)

                        subenv_value = subenv_model.outputs[-1]
                        self.subenv_dict['values'].append(subenv_value)

                        subenv_policy = policy_cls(act_spec, subenv_model.outputs[:-1])
                        self.subenv_dict['policies'].append(subenv_policy)

                        subenv_loss_op, subenv_loss_terms, subenv_loss_inputs = self.loss_fn(
                            policy=subenv_policy, value=subenv_value)
                        self.subenv_dict['loss_ops'].append(subenv_loss_op)
                        self.subenv_dict['loss_terms'].append(subenv_loss_terms)
                        self.subenv_dict['loss_inputs'].append(subenv_loss_inputs)

                        subenv_optimizer = optimizers[i]
                        grads, vars = zip(*subenv_optimizer.compute_gradients(subenv_loss_op))
                        subenv_grads_norm = tf.global_norm(grads)
                        self.subenv_dict['grads_norms'].append(subenv_grads_norm)
                        if clip_grads_norm > 0.:
                            grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, subenv_grads_norm)
                        self.subenv_dict['train_ops'].append(
                            subenv_optimizer.apply_gradients(
                                zip(grads, vars), global_step=sess_mgr.global_step))
                        self.subenv_dict['minimize_ops'].append(self.make_minimize_ops(subenv_id=i))

                    print(LOGGING_MSG_HEADER + ": Successfully created models for each individual subenv")
                else:
                    print(LOGGING_MSG_HEADER + ": Creating a single model for the environment.")
                    self.model = model_fn(obs_spec, act_spec)
                    self.value = self.model.outputs[-1]
                    self.policy = policy_cls(act_spec, self.model.outputs[:-1])
                    self.loss_op, self.loss_terms, self.loss_inputs = self.loss_fn()

                    grads, vars = zip(*optimizer.compute_gradients(self.loss_op))
                    self.grads_norm = tf.global_norm(grads)
                    if clip_grads_norm > 0.:
                        grads, _ = tf.clip_by_global_norm(grads, clip_grads_norm, self.grads_norm)
                    self.train_op = optimizer.apply_gradients(
                        zip(grads, vars), global_step=sess_mgr.global_step)
                    self.minimize_ops = self.make_minimize_ops()

        print(LOGGING_MSG_HEADER + ": main_model setup on sess and graph complete")
        sess_mgr.restore_or_init()
        print(LOGGING_MSG_HEADER + ": main_model weights restore/init complete")

        self.n_batches = sess_mgr.start_step
        self.start_step = sess_mgr.start_step * traj_len
        self.logger = Logger()

    def get_action_and_value(self, obs, subenv_id=None):
        if self.subenvs and subenv_id is not None:
            return self.sess_mgr.run(
                [self.subenv_dict['policies'][subenv_id].sample,
                 self.subenv_dict['values'][subenv_id]],
                self.subenv_dict['models'][subenv_id].inputs, obs)
        return self.sess_mgr.run([self.policy.sample, self.value], self.model.inputs, obs)

    def get_action(self, obs, subenv_id=None):
        if self.subenvs and subenv_id is not None:
            return self.sess_mgr.run(
                self.subenv_dict['policies'][subenv_id].sample,
                self.subenv_dict['models'][subenv_id].inputs, obs)
        return self.sess_mgr.run(self.policy.sample, self.model.inputs, obs)

    def on_step(self, step, obs, action, reward, done, value=None, subenv_id=None):
        MemoryAgent.on_step(self, step, obs, action, reward, done, value)
        self.logger.on_step(step, reward, done)

        if not self.batch_ready():
            return

        if self.subenvs and subenv_id is not None:
            assert self.subenv_dict, "Missing subenv_dict implementation"
            next_values = self.sess_mgr.run(
                self.subenv_dict['values'][subenv_id],
                self.subenv_dict['models'][subenv_id].inputs, self.last_obs)
        else:
            next_values = self.sess_mgr.run(self.value, self.model.inputs, self.last_obs)

        adv, returns = self.compute_advantages_and_returns(next_values)
        loss_terms, grads_norm = self.minimize(adv, returns, subenv_id=subenv_id)

        self.sess_mgr.on_update(self.n_batches)
        logs = self.logger.on_update(self.n_batches, loss_terms, grads_norm, returns, adv, next_values)
        return logs

    def minimize(self, advantages, returns, subenv_id=None):
        inputs = self.obs + self.acts + [advantages, returns]
        inputs = [a.reshape(-1, *a.shape[2:]) for a in inputs]

        if self.subenvs and subenv_id is not None:
            assert self.subenv_dict, "Missing subenv_dict implementation"
            tf_inputs = (self.subenv_dict['models'][subenv_id].inputs
                         + self.subenv_dict['policies'][subenv_id].inputs
                         + self.subenv_dict['loss_inputs'][subenv_id])
            loss_terms, grads_norm, *_ = self.sess_mgr.run(
                self.subenv_dict['minimize_ops'][subenv_id], tf_inputs, inputs)
        else:
            tf_inputs = self.model.inputs + self.policy.inputs + self.loss_inputs
            loss_terms, grads_norm, *_ = self.sess_mgr.run(self.minimize_ops, tf_inputs, inputs)

        return loss_terms, grads_norm

    def compute_advantages_and_returns(self, bootstrap_value):
        """
        GAE can help with reducing variance of policy gradient estimates
        """
        if self.clip_rewards > 0.0:
            np.clip(self.rewards, -self.clip_rewards, self.clip_rewards, out=self.rewards)

        rewards = self.rewards.copy()
        rewards[-1] += (1 - self.dones[-1]) * self.discount * bootstrap_value

        masked_discounts = self.discount * (1 - self.dones)

        returns = self.discounted_cumsum(rewards, masked_discounts)

        if self.gae_lambda > 0.:
            values = np.append(self.values, np.expand_dims(bootstrap_value, 0), axis=0)
            # d_t = r_t + g * V(s_{t+1}) - V(s_t)
            deltas = self.rewards + masked_discounts * values[1:] - values[:-1]
            adv = self.discounted_cumsum(deltas, self.gae_lambda * masked_discounts)
        else:
            adv = returns - self.values

        if self.normalize_advantages:
            adv = (adv - adv.mean()) / (adv.std() + 1e-10)
        if self.normalize_returns:
            returns = (returns - returns.mean()) / (returns.std() + 1e-10)

        return adv, returns

    def on_start(self):
        self.logger.on_start()

    def on_finish(self):
        self.logger.on_finish()

    def reset(self):
        """
        Introduced for HRL with multiple subenvs trained in sequence,
        so the memory buffers and auxiliary logging book-keeping
        need to be reset between subenvs
        """
        MemoryAgent.__init__(self, obs_spec=self.obs_spec, act_spec=self.act_spec,
                             traj_len=self.traj_len, batch_sz=self.batch_sz)
        self.logger.reset()

    def make_minimize_ops(self, subenv_id=None):
        if self.subenvs and subenv_id is not None:
            assert self.subenv_dict, "self.subenv_dict is None or empty"
            loss_terms = self.subenv_dict['loss_terms'][subenv_id]
            grads_norm = self.subenv_dict['grads_norms'][subenv_id]
            ops = [loss_terms, grads_norm]
            if self.sess_mgr.training_enabled:
                ops.append(self.subenv_dict['train_ops'][subenv_id])
            return ops

        ops = [self.loss_terms, self.grads_norm]
        if self.sess_mgr.training_enabled:
            ops.append(self.train_op)
        # appending extra model update ops (e.g. running stats)
        # note: this will most likely break if model.compile() is used
        ops.extend(self.model.get_updates_for(None))
        return ops

    @staticmethod
    def discounted_cumsum(x, discount):
        y = np.zeros_like(x)
        y[-1] = x[-1]
        for t in range(x.shape[0] - 2, -1, -1):
            y[t] = x[t] + discount[t] * y[t + 1]
        return y

    @abstractmethod
    def loss_fn(self): ...
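
# A hedged sketch of what a concrete subclass's loss_fn could return, NOT the
# library's actual A2C loss. The base class only requires the
# (loss_op, loss_terms, loss_inputs) triple, and the subenv path above calls
# loss_fn(policy=..., value=...), so the signature accepts both forms.
# `policy.logli` and `policy.entropy` are assumed attributes of the policy
# class and are not confirmed by this file.
class A2CLossSketch(ActorCriticAgent):
    def loss_fn(self, policy=None, value=None):
        policy = policy if policy is not None else self.policy
        value = value if value is not None else self.value

        # placeholders later fed by minimize() through loss_inputs
        adv = tf.placeholder(tf.float32, [None], name="advantages")
        returns = tf.placeholder(tf.float32, [None], name="returns")

        policy_loss = -tf.reduce_mean(policy.logli * adv)
        value_loss = tf.losses.mean_squared_error(value, returns) * self.value_coef
        entropy_loss = -self.entropy_coef * tf.reduce_mean(policy.entropy)

        loss_terms = [policy_loss, value_loss, entropy_loss]
        full_loss = tf.add_n(loss_terms, name="total_loss")

        return full_loss, loss_terms, [adv, returns]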