# __init__ of the Atari variant (AtariAIRL, per the `model` entry in init_args below).
def __init__(self, *,
             env_spec,  # No good default, but we do need to have it
             expert_trajs=None,
             reward_arch=cnn_net,
             reward_arch_args=None,
             value_fn_arch=cnn_net,
             score_discrim=False,
             discount=1.0,
             state_only=False,
             max_itrs=100,
             fusion=False,
             name='airl',
             drop_framestack=False,
             only_show_scores=False,
             rescore_expert_trajs=True,
             encoder_loc=None):
    super(AIRL, self).__init__()
    if reward_arch_args is None:  # avoid a mutable default argument
        reward_arch_args = {}

    # Write down everything that we're going to need in order to restore
    # this. All of these arguments are serializable, so it's pretty easy.
    self.init_args = dict(
        model=AtariAIRL,
        env_spec=env_spec,
        expert_trajs=expert_trajs,
        reward_arch=reward_arch,
        reward_arch_args=reward_arch_args,
        value_fn_arch=value_fn_arch,
        score_discrim=score_discrim,
        discount=discount,
        state_only=state_only,
        max_itrs=max_itrs,
        fusion=fusion,
        name=name,
        rescore_expert_trajs=rescore_expert_trajs,
        drop_framestack=drop_framestack,
        only_show_scores=only_show_scores,
        encoder_loc=encoder_loc
    )

    # Optionally encode raw observations with a pre-trained variational autoencoder.
    self.encoder = (None if not encoder_loc
                    else encoding.VariationalAutoEncoder.load(encoder_loc))
    self.encode_fn = None
    if self.encoder:
        if state_only:
            self.encode_fn = self.encoder.base_vector
        else:
            self.encode_fn = self.encoder.encode

    if fusion:
        self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
    else:
        self.fusion = None

    if self.encoder:
        self.dO = self.encoder.encoding_shape
        self.dOshape = self.encoder.encoding_shape
    else:
        self.dO = env_spec.observation_space.flat_dim
        self.dOshape = env_spec.observation_space.shape
    if drop_framestack:
        assert len(self.dOshape) == 3
        self.dOshape = (*self.dOshape[:-1], 1)
    self.dU = env_spec.action_space.flat_dim
    assert isinstance(env_spec.action_space, Box)

    self.score_discrim = score_discrim
    self.gamma = discount
    assert value_fn_arch is not None
    #self.set_demos(expert_trajs)
    self.expert_trajs = expert_trajs
    self.state_only = state_only
    self.max_itrs = max_itrs
    self.drop_framestack = drop_framestack
    self.only_show_scores = only_show_scores
    self.expert_cache = None
    self.rescore_expert_trajs = rescore_expert_trajs

    # build energy model
    with tf.variable_scope(name) as _vs:
        # Should be batch_size x T x dO/dU
        obs_dtype = tf.int8 if reward_arch == cnn_net else tf.float32
        self.obs_t = tf.placeholder(obs_dtype, list((None,) + self.dOshape), name='obs')
        self.nobs_t = tf.placeholder(obs_dtype, list((None,) + self.dOshape), name='nobs')
        self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act')
        self.nact_t = tf.placeholder(tf.float32, [None, self.dU], name='nact')
        self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
        self.lprobs = tf.placeholder(tf.float32, [None, 1], name='log_probs')
        self.lr = tf.placeholder(tf.float32, (), name='lr')

        with tf.variable_scope('discrim') as dvs:
            rew_input = self.obs_t
            with tf.variable_scope('reward'):
                if self.state_only:
                    self.reward = reward_arch(rew_input, dout=1, **reward_arch_args)
                else:
                    # Action-conditioned reward: the CNN architecture takes the
                    # actions as a separate input rather than a concatenation.
                    self.reward = reward_arch(rew_input, actions=self.act_t,
                                              dout=1, **reward_arch_args)

            # value function shaping
            with tf.variable_scope('vfn'):
                fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1)
            with tf.variable_scope('vfn', reuse=True):
                self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t, dout=1)

            # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
            self.qfn = self.reward + self.gamma * fitted_value_fn_n
            log_p_tau = self.reward + self.gamma * fitted_value_fn_n - fitted_value_fn

        log_q_tau = self.lprobs

        log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
        self.discrim_output = tf.exp(log_p_tau - log_pq)
        self.accuracy, self.update_accuracy = tf.metrics.accuracy(
            labels=self.labels,
            predictions=self.discrim_output > 0.5
        )
        self.loss = -tf.reduce_mean(
            self.labels * (log_p_tau - log_pq) +
            (1 - self.labels) * (log_q_tau - log_pq))
        self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)
        self._make_param_ops(_vs)

        self.grad_reward = tf.gradients(self.reward, [self.obs_t, self.act_t])

        self.modify_obs = self.get_ablation_modifiers()

        self.score_mean = 0
        self.score_std = 1
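# Illustrative sketch (not part of the original classes): the discriminator built
# above is D(s, a, s') = exp(f) / (exp(f) + pi(a|s)), where
# f = r(s, a) + gamma * V(s') - V(s) is log_p_tau and log pi(a|s) is log_q_tau.
# It is computed in log space via logsumexp for numerical stability. A minimal
# NumPy version, assuming log_p_tau and log_q_tau are already given as arrays:
import numpy as np

def airl_discriminator_sketch(log_p_tau, log_q_tau):
    """Return D = exp(log_p_tau) / (exp(log_p_tau) + exp(log_q_tau)), elementwise."""
    # logsumexp over the two mixture components, mirroring tf.reduce_logsumexp above.
    log_pq = np.logaddexp(log_p_tau, log_q_tau)
    return np.exp(log_p_tau - log_pq)

# Example: airl_discriminator_sketch(np.array([0.0]), np.array([-1.0])) is ~0.731,
# i.e. the discriminator favours the reward model whenever f > log pi(a|s).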
class AIRL(SingleTimestepIRL):
    """
    Args:
        fusion (bool): Use trajectories from old iterations to train.
        state_only (bool): Fix the learned reward to only depend on state.
        score_discrim (bool): Use log D - log(1 - D) as the reward (if true, you
            should not need to use an entropy bonus).
        max_itrs (int): Number of training iterations to run per fit step.
    """
    def __init__(self, env,
                 expert_trajs=None,
                 reward_arch=relu_net,
                 reward_arch_args=None,
                 value_fn_arch=relu_net,
                 score_discrim=False,
                 discount=1.0,
                 state_only=False,
                 max_itrs=100,
                 fusion=False,
                 name='airl'):
        super(AIRL, self).__init__()
        if reward_arch_args is None:
            reward_arch_args = {}

        if fusion:
            self.fusion = RamFusionDistr(100, subsample_ratio=0.5)
        else:
            self.fusion = None
        env_spec = env.spec
        self.dO = env_spec.observation_space.flat_dim
        self.dU = env_spec.action_space.flat_dim
        assert isinstance(env_spec.action_space, Box)
        self.score_discrim = score_discrim
        self.gamma = discount
        assert value_fn_arch is not None
        self.set_demos(expert_trajs)
        self.state_only = state_only
        self.max_itrs = max_itrs

        # build energy model
        with tf.variable_scope(name) as _vs:
            # Should be batch_size x T x dO/dU
            self.obs_t = tf.placeholder(tf.float32, [None, self.dO], name='obs')
            self.nobs_t = tf.placeholder(tf.float32, [None, self.dO], name='nobs')
            self.act_t = tf.placeholder(tf.float32, [None, self.dU], name='act')
            self.nact_t = tf.placeholder(tf.float32, [None, self.dU], name='nact')
            self.labels = tf.placeholder(tf.float32, [None, 1], name='labels')
            self.lprobs = tf.placeholder(tf.float32, [None, 1], name='log_probs')
            self.lr = tf.placeholder(tf.float32, (), name='lr')

            with tf.variable_scope('discrim') as dvs:
                rew_input = self.obs_t
                if not self.state_only:
                    rew_input = tf.concat([self.obs_t, self.act_t], axis=1)
                with tf.variable_scope('reward'):
                    self.reward = reward_arch(rew_input, dout=1, **reward_arch_args)
                #energy_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name)

                # value function shaping
                with tf.variable_scope('vfn'):
                    fitted_value_fn_n = value_fn_arch(self.nobs_t, dout=1)
                with tf.variable_scope('vfn', reuse=True):
                    self.value_fn = fitted_value_fn = value_fn_arch(self.obs_t, dout=1)

                # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
                self.qfn = self.reward + self.gamma * fitted_value_fn_n
                log_p_tau = self.reward + self.gamma * fitted_value_fn_n - fitted_value_fn

            log_q_tau = self.lprobs

            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.discrim_output = tf.exp(log_p_tau - log_pq)
            cent_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                        (1 - self.labels) * (log_q_tau - log_pq))

            self.loss = cent_loss
            tot_loss = self.loss
            self.step = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(tot_loss)
            self._make_param_ops(_vs)

    def fit(self, paths, policy=None, batch_size=32, logger=None, lr=1e-3, **kwargs):
        if self.fusion is not None:
            old_paths = self.fusion.sample_paths(n=len(paths))
            self.fusion.add_paths(paths)
            paths = paths + old_paths

        # eval samples under current policy
        self._compute_path_probs(paths, insert=True)

        # eval expert log probs under current policy
        self.eval_expert_probs(self.expert_trajs, policy, insert=True)

        self._insert_next_state(paths)
        self._insert_next_state(self.expert_trajs)
        obs, obs_next, acts, acts_next, path_probs = \
            self.extract_paths(paths,
                               keys=('observations', 'observations_next',
                                     'actions', 'actions_next', 'a_logprobs'))
        expert_obs, expert_obs_next, expert_acts, expert_acts_next, expert_probs = \
            self.extract_paths(self.expert_trajs,
                               keys=('observations', 'observations_next',
                                     'actions', 'actions_next', 'a_logprobs'))

        # Train discriminator
        for it in TrainingIterator(self.max_itrs, heartbeat=5):
            nobs_batch, obs_batch, nact_batch, act_batch, lprobs_batch = \
                self.sample_batch(obs_next, obs, acts_next, acts, path_probs,
                                  batch_size=batch_size)

            nexpert_obs_batch, expert_obs_batch, nexpert_act_batch, expert_act_batch, expert_lprobs_batch = \
                self.sample_batch(expert_obs_next, expert_obs, expert_acts_next,
                                  expert_acts, expert_probs, batch_size=batch_size)

            # Build feed dict: first half policy samples (label 0), second half expert samples (label 1)
            labels = np.zeros((batch_size * 2, 1))
            labels[batch_size:] = 1.0
            obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0)
            nobs_batch = np.concatenate([nobs_batch, nexpert_obs_batch], axis=0)
            act_batch = np.concatenate([act_batch, expert_act_batch], axis=0)
            nact_batch = np.concatenate([nact_batch, nexpert_act_batch], axis=0)
            lprobs_batch = np.expand_dims(
                np.concatenate([lprobs_batch, expert_lprobs_batch], axis=0),
                axis=1).astype(np.float32)
            feed_dict = {
                self.act_t: act_batch,
                self.obs_t: obs_batch,
                self.nobs_t: nobs_batch,
                self.nact_t: nact_batch,
                self.labels: labels,
                self.lprobs: lprobs_batch,
                self.lr: lr
            }

            loss, _ = tf.get_default_session().run([self.loss, self.step],
                                                   feed_dict=feed_dict)
            it.record('loss', loss)
            if it.heartbeat:
                print(it.itr_message())
                mean_loss = it.pop_mean('loss')
                print('\tLoss:%f' % mean_loss)

        if logger:
            logger.record_tabular('GCLDiscrimLoss', mean_loss)
            #obs_next = np.r_[obs_next, np.expand_dims(obs_next[-1], axis=0)]
            energy, logZ, dtau = tf.get_default_session().run(
                [self.reward, self.value_fn, self.discrim_output],
                feed_dict={
                    self.act_t: acts,
                    self.obs_t: obs,
                    self.nobs_t: obs_next,
                    self.nact_t: acts_next,
                    self.lprobs: np.expand_dims(path_probs, axis=1)
                })
            energy = -energy
            logger.record_tabular('GCLLogZ', np.mean(logZ))
            logger.record_tabular('GCLAverageEnergy', np.mean(energy))
            logger.record_tabular('GCLAverageLogPtau', np.mean(-energy - logZ))
            logger.record_tabular('GCLAverageLogQtau', np.mean(path_probs))
            logger.record_tabular('GCLMedianLogQtau', np.median(path_probs))
            logger.record_tabular('GCLAverageDtau', np.mean(dtau))

            #expert_obs_next = np.r_[expert_obs_next, np.expand_dims(expert_obs_next[-1], axis=0)]
            energy, logZ, dtau = tf.get_default_session().run(
                [self.reward, self.value_fn, self.discrim_output],
                feed_dict={
                    self.act_t: expert_acts,
                    self.obs_t: expert_obs,
                    self.nobs_t: expert_obs_next,
                    self.nact_t: expert_acts_next,
                    self.lprobs: np.expand_dims(expert_probs, axis=1)
                })
            energy = -energy
            logger.record_tabular('GCLAverageExpertEnergy', np.mean(energy))
            logger.record_tabular('GCLAverageExpertLogPtau', np.mean(-energy - logZ))
            logger.record_tabular('GCLAverageExpertLogQtau', np.mean(expert_probs))
            logger.record_tabular('GCLMedianExpertLogQtau', np.median(expert_probs))
            logger.record_tabular('GCLAverageExpertDtau', np.mean(dtau))

        return mean_loss

    def eval(self, paths, **kwargs):
        """
        Return bonus
        """
        if self.score_discrim:
            self._compute_path_probs(paths, insert=True)
            obs, obs_next, acts, path_probs = self.extract_paths(
                paths,
                keys=('observations', 'observations_next', 'actions', 'a_logprobs'))
            path_probs = np.expand_dims(path_probs, axis=1)
            scores = tf.get_default_session().run(self.discrim_output,
                                                  feed_dict={
                                                      self.act_t: acts,
                                                      self.obs_t: obs,
                                                      self.nobs_t: obs_next,
                                                      self.lprobs: path_probs
                                                  })
            # log D - log(1 - D) = f(s, a, s') - log pi(a|s); see the class docstring.
            score = np.log(scores) - np.log(1 - scores)
            score = score[:, 0]
        else:
            obs, acts = self.extract_paths(paths)
            reward = tf.get_default_session().run(self.reward,
                                                  feed_dict={
                                                      self.act_t: acts,
                                                      self.obs_t: obs
                                                  })
            score = reward[:, 0]
        return self.unpack(score, paths)

    def eval_single(self, obs):
        reward = tf.get_default_session().run(self.reward,
                                              feed_dict={self.obs_t: obs})
        score = reward[:, 0]
        return score

    def debug_eval(self, paths, **kwargs):
        obs, acts = self.extract_paths(paths)
        reward, v, qfn = tf.get_default_session().run(
            [self.reward, self.value_fn, self.qfn],
            feed_dict={
                self.act_t: acts,
                self.obs_t: obs
            })
        return {
            'reward': reward,
            'value': v,
            'qfn': qfn,
        }
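# Illustrative sketch (not part of the original class): with score_discrim=True,
# eval() returns log D - log(1 - D). Since D = exp(f) / (exp(f) + pi(a|s)), this
# simplifies to f(s, a, s') - log pi(a|s); the -log pi(a|s) term acts as an
# entropy bonus, which is why the docstring says no extra entropy bonus should be
# needed. A quick NumPy check of that identity:
import numpy as np

def discrim_score_sketch(f, log_pi):
    """Return log D - log(1 - D) for D = exp(f) / (exp(f) + exp(log_pi))."""
    d = np.exp(f) / (np.exp(f) + np.exp(log_pi))
    return np.log(d) - np.log(1 - d)

# Example: discrim_score_sketch(np.array([0.5]), np.array([-2.0])) is ~2.5,
# which equals f - log_pi = 0.5 - (-2.0).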